In [8]:
import pickle
import os
import sys
import datetime
import pandas as pd
import numpy as np
import nltk.data
from gensim.models import Word2Vec
import multiprocessing
from copy import deepcopy
from sklearn.metrics import roc_auc_score    
from sklearn.cross_validation import StratifiedKFold

In [2]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [7]:
df = pickle.load(open('/home/sarahwie/Documents/pubmed-nlp-research/C_article_replication_and_translational_classification/df.p', 'rb'))

In [9]:
print df.shape
print df.columns.values
print df['label'].unique()

(354, 6)
['pmid' 'title' 'abstract' 'qualifier_terms' 'descriptor_terms' 'label']
['T0' 'T1/T2' 'T3/T4' 'TX']


In [5]:
df[0:10]

Unnamed: 0,pmid,title,abstract,qualifier_terms,descriptor_terms,label
0,23195993,Gene expression profiles in peripheral blood m...,Occupational exposure to nickel (Ni) is associ...,genetics metabolism epidemiology drug effects ...,Adult Asian Continental Ancestry Group Biomark...,T0
1,25077433,SOX2 is a cancer-specific regulator of tumour ...,Although the principles that balance stem cell...,genetics metabolism pathology metabolism patho...,"Animals Carcinoma, Squamous Cell Cell Line, Tu...",T0
2,24107601,Imaging and cerebrospinal fluid biomarkers in ...,The pathophysiological process of Alzheimer's ...,cerebrospinal fluid genetics radionuclide imag...,"Aged Aged, 80 and over Alzheimer Disease Amylo...",T0
3,24891029,Preliminary evidence of cognitive and brain ab...,To ascertain whether pediatric obesity without...,pathology physiopathology physiopathology,Adolescent Attention Brain Cognition Executive...,T0
4,21691448,Obese Adolescents with Type 2 Diabetes Mellitu...,The rates of type 2 diabetes (T2DM) continue t...,,,T0
5,22765222,Diversity of 5S rRNA genes within individual p...,We examined intragenomic variation of paralogo...,chemistry classification genetics chemistry ge...,"Bacteria DNA, Ribosomal Databases, Nucleic Aci...",T0
6,24401686,Myoinositol and glutamate complex neurometabol...,To obtain quantitative neurometabolite measure...,analogs & derivatives metabolism metabolism me...,Adult Aspartic Acid Brain Injuries Case-Contro...,T0
7,22914093,Antibiotics in early life alter the murine col...,Antibiotics administered in low doses have bee...,drug effects physiology administration & dosag...,Adiposity Age Factors Animals Anti-Bacterial A...,T0
8,23426830,Elevated serum anti-Müllerian hormone in adole...,Serum anti-Müllerian hormone (AMH) is linked t...,blood pathology ultrasonography blood ultrason...,Adolescent Anti-Mullerian Hormone Child Female...,T0
9,24344399,Association of obesity-mediated insulin resist...,The hypothalamus is important in hunger and me...,blood analysis anatomy & histology blood blood...,Adolescent Adult Brain-Derived Neurotrophic Fa...,T0


## Apply word2vec inversion

In [12]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* models: the candidate word2vec models (each potential class)

it returns the array of class probabilities.  Everything is done in-memory.
"""

def docprob(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

## Original Method

In [10]:
#add boolean labels
df['boolean_label'] = 1
for row in df.iterrows():
    if row[1]['label'] != 'T0':
        df.set_value(row[0], 'boolean_label', 0)

In [13]:
a = datetime.datetime.now().replace(microsecond=0)

y = df['label'].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True)

#include mesh terms in bag of words?
mesh_terms = True
#remove stopwords
stops = False

i = 1
avg = []
for train_index, test_index in skf:
    
    print("ROUND", i)
    i = i + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]

    # ****** Split the training set into clean sentences
    #
    sentences_pos = []  # Initialize an empty list of sentences
    sentences_neg = []  # Initialize an empty list of sentences

    #here change to include all journal name labels of positive and negative
    inxs_pos = np.where(train1['boolean_label'] == 1)[0].tolist()
    inxs_neg = np.where(train1['boolean_label'] == 0)[0].tolist()

    print "Parsing sentences from training set"
    for inx in inxs_pos:
        review = train1["abstract"].iloc[inx]
        title = train1['title'].iloc[inx]
        sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=stops)
        sentences_pos += KaggleWord2VecUtility.review_to_sentences(title, tokenizer, remove_stopwords=stops)
        if mesh_terms:
            descriptors = train1['descriptor_terms'].iloc[inx]
            qualifiers = train1['qualifier_terms'].iloc[inx]
            sentences_pos += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer, remove_stopwords=stops)
            sentences_pos += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer, remove_stopwords=stops)


    for inx in inxs_neg:
        review = train1["abstract"].iloc[inx]
        title = train1['title'].iloc[inx]
        sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=stops)
        sentences_neg += KaggleWord2VecUtility.review_to_sentences(title, tokenizer, remove_stopwords=stops)
        if mesh_terms:
            descriptors = train1['descriptor_terms'].iloc[inx]
            qualifiers = train1['qualifier_terms'].iloc[inx]
            sentences_neg += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer, remove_stopwords=stops)
            sentences_neg += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer, remove_stopwords=stops)

    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    for j in range(len(train1)):
        review = train1["abstract"].iloc[j]
        title = train1['title'].iloc[j]
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=stops)
        sentences += KaggleWord2VecUtility.review_to_sentences(title, tokenizer, remove_stopwords=stops)
        if mesh_terms:
            descriptors = train1['descriptor_terms'].iloc[j]
            qualifiers = train1['qualifier_terms'].iloc[j]
            sentences += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer, remove_stopwords=stops)
            sentences += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer, remove_stopwords=stops)

    print "Building and training w2v models"
    ## create a w2v learner 
    basemodel = Word2Vec(
        workers=multiprocessing.cpu_count(), # use your cores
        iter=3, # iter = sweeps of SGD through the data; more is better
        hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
        )
    basemodel.build_vocab(sentences) 

    #train models
    models = [deepcopy(basemodel) for y in range(2)]
    models[0].train(sentences_neg, total_examples=len(sentences_neg) )
    models[1].train(sentences_pos, total_examples=len(sentences_pos) )

    print "Parsing test sentences"
    # read in the test set as a list of a list of words

    docs = []
    for k in range(len(test1)):
        sents = []
        review = test1["abstract"].iloc[k]
        title = test1['title'].iloc[k]
        sents += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=stops)
        sents += KaggleWord2VecUtility.review_to_sentences(title, tokenizer, remove_stopwords=stops)
        if mesh_terms:
            descriptors = test1['descriptor_terms'].iloc[k]
            qualifiers = test1['qualifier_terms'].iloc[k]
            sents += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer, remove_stopwords=stops)
            sents += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer, remove_stopwords=stops)
        docs.append(sents)

    print "scoring test set"
    # get the probs (note we give docprob our test set plus the models)
    #probs = docprob(docs,models).astype(object)
    probssentences = docprob(docs,models)

    predictions = np.ones((probs.shape[0]))

    predictions[np.where(probs.iloc[:,0] > 0.5)] = 0 # The first column is the negative model
    print predictions
    print np.size(predictions)

    score = roc_auc_score(test1["boolean_label"], predictions)
    #score = np.size(np.where(predictions == test1["sentiment"]))*1./np.size(predictions)
    print score
    #append to average
    avg.append(score)


print("average of 5 rotations:", sum(avg)/float(len(avg)))
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Parsing sentences from training set


NameError: name 'KaggleWord2VecUtility' is not defined

## New Combined Method (only has to be run once instead of 3 times)

In [11]:
def buildModels(train1, test1, docs, mesh_terms, stops, basemodel, label):
    
    #add boolean labels
    train1['boolean_label'] = 1
    for row in train1.iterrows():
        if row[1]['label'] != label:
            train1.set_value(row[0], 'boolean_label', 0)
            
    test1['boolean_label'] = 1
    for row in test1.iterrows():
        if row[1]['label'] != label:
            test1.set_value(row[0], 'boolean_label', 0)
    
    # ****** Split the training set into clean sentences
    #
    sentences_pos = []  # Initialize an empty list of sentences
    sentences_neg = []  # Initialize an empty list of sentences

    #here change to include all journal name labels of positive and negative
    inxs_pos = np.where(train1['boolean_label'] == 1)[0].tolist()
    inxs_neg = np.where(train1['boolean_label'] == 0)[0].tolist()

    for inx in inxs_pos:
        review = train1["abstract"].iloc[inx]
        title = train1['title'].iloc[inx]
        sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=stops)
        sentences_pos += KaggleWord2VecUtility.review_to_sentences(title, tokenizer, remove_stopwords=stops)
        if mesh_terms:
            descriptors = train1['descriptor_terms'].iloc[inx]
            qualifiers = train1['qualifier_terms'].iloc[inx]
            sentences_pos += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer, remove_stopwords=stops)
            sentences_pos += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer, remove_stopwords=stops)


    for inx in inxs_neg:
        review = train1["abstract"].iloc[inx]
        title = train1['title'].iloc[inx]
        sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=stops)
        sentences_neg += KaggleWord2VecUtility.review_to_sentences(title, tokenizer, remove_stopwords=stops)
        if mesh_terms:
            descriptors = train1['descriptor_terms'].iloc[inx]
            qualifiers = train1['qualifier_terms'].iloc[inx]
            sentences_neg += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer, remove_stopwords=stops)
            sentences_neg += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer, remove_stopwords=stops)
         
    #train models
    models = [deepcopy(basemodel) for y in range(2)]
    models[0].train(sentences_neg, total_examples=len(sentences_neg) )
    models[1].train(sentences_pos, total_examples=len(sentences_pos) )
    
    probs = docprob(docs,models).astype(object)
    
    predictions = np.ones((probs.shape[0]))

    predictions[np.where(probs.iloc[:,0] > 0.5)] = 0 # The first column is the negative model

    score = roc_auc_score(test1["boolean_label"], predictions)
    
    return score

In [14]:
a = datetime.datetime.now().replace(microsecond=0)

y = df['label'].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True)

#include mesh terms in bag of words?
mesh_terms = True
#remove stopwords?
stops = False

i = 1
avgT0 = []
avgT1T2 = []
avgT3T4 = []
for train_index, test_index in skf:

    print("ROUND", i)
    i = i + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]
    
    print "Parsing sentences from training set"

    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    for j in range(len(train1)):
        review = train1["abstract"].iloc[j]
        title = train1['title'].iloc[j]
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=stops)
        sentences += KaggleWord2VecUtility.review_to_sentences(title, tokenizer, remove_stopwords=stops)
        if mesh_terms:
            descriptors = train1['descriptor_terms'].iloc[j]
            qualifiers = train1['qualifier_terms'].iloc[j]
            sentences += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer, remove_stopwords=stops)
            sentences += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer, remove_stopwords=stops)

    print "Building and training w2v models"
    ## create a w2v learner 
    basemodel = Word2Vec(
        workers=multiprocessing.cpu_count(), # use your cores
        iter=3, # iter = sweeps of SGD through the data; more is better
        hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
        )
    basemodel.build_vocab(sentences) 

    print "Parsing test sentences"
    # read in the test set as a list of a list of words

    docs = []
    for k in range(len(test1)):
        sents = []
        review = test1["abstract"].iloc[k]
        title = test1['title'].iloc[k]
        sents += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=stops)
        sents += KaggleWord2VecUtility.review_to_sentences(title, tokenizer, remove_stopwords=stops)
        if mesh_terms:
            descriptors = test1['descriptor_terms'].iloc[k]
            qualifiers = test1['qualifier_terms'].iloc[k]
            sents += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer, remove_stopwords=stops)
            sents += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer, remove_stopwords=stops)
        docs.append(sents)

    print "Scoring test set"
    scoreT0 = buildModels(train1, test1, docs, mesh_terms, stops, basemodel, 'T0')
    print scoreT0
    avgT0.append(scoreT0)
    
    scoreT1T2 = buildModels(train1, test1, docs, mesh_terms, stops, basemodel, 'T1/T2')
    print scoreT1T2
    avgT1T2.append(scoreT1T2)
    
    scoreT3T4 = buildModels(train1, test1, docs, mesh_terms, stops, basemodel, 'T3/T4')
    print scoreT3T4
    avgT3T4.append(scoreT3T4)


print sum(avgT0) / float(len(avgT0))
print sum(avgT1T2) / float(len(avgT1T2))
print sum(avgT3T4) / float(len(avgT3T4))
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
Scoring test set
Parsing sentences from training set


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Training models
Scoring test set
0.600233100233
Parsing sentences from training set
Training models
Scoring test set
0.472906403941
Parsing sentences from training set
Training models
Scoring test set
0.518867924528
('ROUND', 2)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
Scoring test set
Parsing sentences from training set
Training models
Scoring test set
0.576923076923
Parsing sentences from training set
Training models
Scoring test set
0.508620689655
Parsing sentences from training set
Training models
Scoring test set
0.509433962264
('ROUND', 3)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
Scoring test set
Parsing sentences from training set
Training models
Scoring test set
0.586939102564
Parsing sentences from training set
Training models
Scoring test set
0.5
Parsing sentences from training set
Training models
Scoring test set
0.519230769231
('ROUND', 4)
Parsing sentences from training se

## Defined Functions Method

In [30]:
def stratified_5_fold_cv(df, mesh_terms, label):
    #5-fold stratified cross validation

    #add boolean labels
    df['boolean_label'] = 1
    for row in df.iterrows():
        if row[1]['label'] != label:
            df.set_value(row[0], 'boolean_label', 0)

    #because no validation set, 4/5 of values go to train and 1/5 to test
    #is this too high?***
    #even though we shuffle, not as randomly distributed as the former method was
    y = df['boolean_label'].values
    skf = StratifiedKFold(y, n_folds=5, shuffle=True)

    i = 1
    avg = []
    for train_index, test_index in skf:
        print("ROUND", i)
        i = i + 1
        
        #use the indexes to subset the df pandas dataframe
        train1, test1 = df.iloc[train_index], df.iloc[test_index]

        # ****** Split the training set into clean sentences
        #
        sentences_pos = []  # Initialize an empty list of sentences
        sentences_neg = []  # Initialize an empty list of sentences

        #here change to include all journal name labels of positive and negative
        inxs_pos = np.where(train1['boolean_label'] == 1)[0].tolist()
        inxs_neg = np.where(train1['boolean_label'] == 0)[0].tolist()

        print "Parsing sentences from training set"
        for inx in inxs_pos:
            review = train1["abstract"].iloc[inx]
            title = train1['title'].iloc[inx]
            sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            sentences_pos += KaggleWord2VecUtility.review_to_sentences(title, tokenizer)
            if mesh_terms:
                descriptors = train1['descriptor_terms'].iloc[inx]
                qualifiers = train1['qualifier_terms'].iloc[inx]
                sentences_pos += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer)
                sentences_pos += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer)

        for inx in inxs_neg:
            review = train1["abstract"].iloc[inx]
            title = train1['title'].iloc[inx]
            sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            sentences_neg += KaggleWord2VecUtility.review_to_sentences(title, tokenizer)
            if mesh_terms:
                descriptors = train1['descriptor_terms'].iloc[inx]
                qualifiers = train1['qualifier_terms'].iloc[inx]
                sentences_neg += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer)
                sentences_neg += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer)

        # ****** Split the labeled and unlabeled training sets into clean sentences
        #
        sentences = []  # Initialize an empty list of sentences

        for j in range(len(train1)):
            review = train1["abstract"].iloc[j]
            title = train1['title'].iloc[j]
            sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            sentences += KaggleWord2VecUtility.review_to_sentences(title, tokenizer)
            if mesh_terms:
                descriptors = train1['descriptor_terms'].iloc[j]
                qualifiers = train1['qualifier_terms'].iloc[j]
                sentences += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer)
                sentences += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer)

        print "Building and training w2v models"
        ## create a w2v learner 
        basemodel = Word2Vec(
            workers=multiprocessing.cpu_count(), # use your cores
            iter=3, # iter = sweeps of SGD through the data; more is better
            hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
            )
        basemodel.build_vocab(sentences) 

        #train models
        models = [deepcopy(basemodel) for y in range(2)]
        models[0].train(sentences_neg, total_examples=len(sentences_neg) )
        models[1].train(sentences_pos, total_examples=len(sentences_pos) )

        print "Parsing test sentences"
        # read in the test set as a list of a list of words

        docs = []
        for k in range(len(test1)):
            sents = []
            review = test1["abstract"].iloc[k]
            sents += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            if title:
                title = test1['title'].iloc[k]
                sents += KaggleWord2VecUtility.review_to_sentences(title, tokenizer)
            if mesh_terms:
                descriptors = test1['descriptor_terms'].iloc[k]
                qualifiers = test1['qualifier_terms'].iloc[k]
                sents += KaggleWord2VecUtility.review_to_sentences(descriptors, tokenizer)
                sents += KaggleWord2VecUtility.review_to_sentences(qualifiers, tokenizer)
            docs.append(sents)

        print "scoring test set"
        # get the probs (note we give docprob our test set plus the models)
        probs = docprob(docs,models).astype(object)

        predictions = np.ones((probs.shape[0]))

        predictions[np.where(probs.iloc[:,0] > 0.5)] = 0 # The first column is the negative model
        print predictions
        print np.size(predictions)

        score = roc_auc_score(test1["boolean_label"], predictions)
        print score
        #append to average
        avg.append(score)


    print("average of 5 rotations:", sum(avg)/float(len(avg)))
    
    return models

In [31]:
a = datetime.datetime.now().replace(microsecond=0)

res = stratified_5_fold_cv(df, True, 'T0')

b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.
  0.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  0.  0.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.]
72
0.613053613054
('ROUND', 2)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  0.  1.  1.  1.  0.  1.  0.  1.  1.  1.  0.  1.  0.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  1.  0.  1.
  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.]
72
0.572261072261
('ROUND', 3)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  0.

In [32]:
a = datetime.datetime.now().replace(microsecond=0)

res1 = stratified_5_fold_cv(df, True, 'T1/T2')
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.
  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
72
0.428571428571
('ROUND', 2)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
71
0.5
('ROUND', 3)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  1.  1.  1.  1.  1

In [33]:
a = datetime.datetime.now().replace(microsecond=0)

res2 = stratified_5_fold_cv(df, True, 'T3/T4')
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
71
0.509615384615
('ROUND', 2)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.]
71
0.492914979757
('ROUND', 3)
Parsing sentences from training set
Building and training w2v models
Parsing test sentences
scoring test set
[ 1.  1.  1.  1.  1.

## Performance with abstract, title, and mesh terms on PAPER'S DATASET (much worse than BOWs/RF because so few records)
T0 class- 0.68 (0.63, 0.73)

T1/T2 class- 0.49 (0.45, 0.52)

T3/T4 class- 0.51 (0.48, 0.55)

### Import KaggleWord2VecUtility since didn't work from file

In [7]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences