In [1]:
import pickle
import datetime
import os
import sys

import sklearn
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

In [2]:
#Import data from the processing of data in the journal_training_word2vec notebook
#df = pickle.load(open('/home/sarahwie/Documents/pubmed-nlp-research/article_replication/df.p', 'rb'))
df = pickle.load(open('/mnt/mypartition/Desktop2/dfJournals_trans_categories.p', 'rb'))

In [3]:
print df.shape
print df.columns.values

(97049, 5)
['sentiment' 'title' 'abstract' 'qualifier_terms' 'descriptor_terms']


In [4]:
#for the journals VM dataframe, change the label of the 'sentiment' column to 'label'
df.columns = ['label', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']

In [5]:
df[0:10]

Unnamed: 0,label,title,abstract,qualifier_terms,descriptor_terms
0,T0,Quantification of the relative contributions o...,Amyotrophic lateral sclerosis (ALS) and fronto...,,
1,T0,Ubiquitin Specific Protease 36 (USP36) control...,Ubiquitination of the TrkA neurotrophin recept...,,
2,T0,Functional Diversity of Cytotoxic tRNase/immun...,Contact-dependent growth inhibition (CDI) is a...,,
3,T0,Identification of a distinct substrate binding...,The type III secretion system (T3SS) effector ...,,
4,T0,Role of chondroitin sulfate (CS) modification ...,Protein tyrosine phosphatase receptor type Z (...,,
5,T0,Chemically programmed bispecific antibodies in...,Chemically programmed bispecific antibodies (b...,,
6,T0,Non-mutagenic Suppression of Enterocyte Ferrop...,Iron transfer across the basolateral membrane ...,,
7,T0,Structural plasticity of cholesteryl ester tra...,Cholesteryl ester transfer protein (CETP) medi...,,
8,T0,Identification of a Membrane-Bound Prepore Spe...,Pore-forming toxins (PFT) are cytolytic protei...,,
9,T0,EspR-dependent ESAT-6 secretion of Mycobacteri...,Attenuation of M. bovis BCG strain is related ...,,


In [4]:
#on VM
os.chdir('/mnt/mypartition/Desktop2/pubmed_nlp_research/DeepLearningMovies_datasets/')
import KaggleWord2VecUtility

In [7]:
#on local:
sys.path.append('/home/sarahwie/Documents/pubmed-nlp-research/DeepLearningMovies_datasets/')
from KaggleWord2VecUtility import KaggleWord2VecUtility
DATADIR='/home/sarahwie/Documents/pubmed-nlp-research/DeepLearningMovies_datasets/'

### Assign 2-class labels (for example, T0 or not T0)

In [13]:
df['boolean_label'] = 1
for row in df.iterrows():
    if row[1]['label'] != 'T0':
        df.set_value(row[0], 'boolean_label', 0)

In [14]:
df

Unnamed: 0,label,title,abstract,qualifier_terms,descriptor_terms,boolean_label
0,T0,Quantification of the relative contributions o...,Amyotrophic lateral sclerosis (ALS) and fronto...,,,1
1,T0,Ubiquitin Specific Protease 36 (USP36) control...,Ubiquitination of the TrkA neurotrophin recept...,,,1
2,T0,Functional Diversity of Cytotoxic tRNase/immun...,Contact-dependent growth inhibition (CDI) is a...,,,1
3,T0,Identification of a distinct substrate binding...,The type III secretion system (T3SS) effector ...,,,1
4,T0,Role of chondroitin sulfate (CS) modification ...,Protein tyrosine phosphatase receptor type Z (...,,,1
5,T0,Chemically programmed bispecific antibodies in...,Chemically programmed bispecific antibodies (b...,,,1
6,T0,Non-mutagenic Suppression of Enterocyte Ferrop...,Iron transfer across the basolateral membrane ...,,,1
7,T0,Structural plasticity of cholesteryl ester tra...,Cholesteryl ester transfer protein (CETP) medi...,,,1
8,T0,Identification of a Membrane-Bound Prepore Spe...,Pore-forming toxins (PFT) are cytolytic protei...,,,1
9,T0,EspR-dependent ESAT-6 secretion of Mycobacteri...,Attenuation of M. bovis BCG strain is related ...,,,1


### Get training and testing sets from df via cross-validation

In [16]:
a = datetime.datetime.now().replace(microsecond=0)

#5-fold stratified cross validation
#because no validation set, 4/5 of values go to train and 1/5 to test
y = df['boolean_label'].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True)

j = 1
avg = []
for train_index, test_index in skf:
    if j == 1:
        
        print("ROUND", j)
        j = j + 1
        #use the indexes to subset the df pandas dataframe
        train1, test1 = df.iloc[train_index], df.iloc[test_index]

        print "Cleaning and parsing the training set abstracts...\n"
        # Initialize an empty list to hold the clean reviews
        clean_train_reviews = []

        # Loop over each review; create an index i that goes from 0 to the length
        # of the pandas df column
        for i in range(train1["abstract"].size):
            # Call our function for each one, and add the result to the list of
            # clean reviews
            words = KaggleWord2VecUtility.review_to_words( train1["title"].iloc[i])
            words = words + KaggleWord2VecUtility.review_to_words( train1["descriptor_terms"].iloc[i])
            words = words + KaggleWord2VecUtility.review_to_words( train1["qualifier_terms"].iloc[i])
            words = words + KaggleWord2VecUtility.review_to_words( train1["abstract"].iloc[i])
            clean_train_reviews.append(words)

        print len(clean_train_reviews)
        #with tfidf word weighting
        #uses L2 norm by default
        print "Creating the bag of words...\n"

        # Initialize the "TfidfVectorizer" object, which is scikit-learn's
        # bag of words tool.  
        vectorizer = TfidfVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,
                                max_features = 5000)

        # fit_transform() does two functions: First, it fits the model
        # and learns the vocabulary; second, it transforms our training data
        # into feature vectors. The input to fit_transform should be a list of 
        # strings.
        train_data_features = vectorizer.fit_transform(clean_train_reviews)

        # Numpy arrays are easy to work with, so convert the result to an 
        # array
        train_data_features = train_data_features.toarray()

        print "Training the random forest..."

        # Initialize a Random Forest classifier with 100 trees
        forest = RandomForestClassifier(n_estimators = 100, max_depth=1000)

        # Fit the forest to the training set, using the bag of words as 
        # features and the sentiment labels as the response variable
        #
        # This may take a few minutes to run
        forest = forest.fit( train_data_features, train1["boolean_label"] )
        feats = forest.feature_importances_ 
        
        #Moving on to test set...
        # Create an empty list and append the clean reviews one by one
        clean_test_reviews = [] 

        print "Cleaning and parsing the test set abstracts...\n"
        for i in range(test1["abstract"].size):    
            words = KaggleWord2VecUtility.review_to_words( test1["title"].iloc[i])
            words = words + KaggleWord2VecUtility.review_to_words( test1["descriptor_terms"].iloc[i])
            words = words + KaggleWord2VecUtility.review_to_words( test1["qualifier_terms"].iloc[i])
            words = words + KaggleWord2VecUtility.review_to_words( test1["abstract"].iloc[i])
            clean_test_reviews.append(words)

        print "Adding tfidf weights..."
        # Get a bag of words for the test set, and convert to a numpy array
        test_data_features = vectorizer.transform(clean_test_reviews)
        test_data_features = test_data_features.toarray()
        #print test_data_features.shape

        print "Using the random forest to make sentiment label predictions..."
        # Use the random forest to make sentiment label predictions
        result = forest.predict(test_data_features)

        print "Scoring the test set"
        predictions = result
        print predictions

        score = roc_auc_score(test1["boolean_label"], predictions)
        #score = np.size(np.where(predictions == test1["boolean_label"]))*1./np.size(predictions)
        print score
    #append to average
    #avg.append(score)


#print("Average score of 5 rotations:", sum(avg)/float(len(avg)))
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Cleaning and parsing the training set abstracts...

77638
Creating the bag of words...

Training the random forest...
Cleaning and parsing the test set abstracts...

Adding tfidf weights...
Using the random forest to make sentiment label predictions...
Scoring the test set
[1 1 1 ..., 0 0 0]
0.972469441256
0:05:03


In [26]:
print len(feats[feats > 0.001])
#print np.where(feats > 0.01)
print np.where(feats > 0.02)
print len(feats)
print feats
print sum(feats)

127
(array([ 871, 1701, 2763, 3297, 3633, 3661, 4937]),)
5000
[  3.44099074e-05   1.86877263e-05   2.30721342e-05 ...,   2.45329689e-05
   1.09204644e-05   1.14362277e-05]
1.0


In [None]:
#T0 and not T0 performance:
# 0.97 (+/- 0.0)

In [None]:
#T1/T2 and not T1/T2 performance:
# 0.90 (+/- )

In [None]:
#T3/T4 and not T3/T4 performance:
# 0.90

In [None]:
#Without MeSH terms: TO performance is 0.96

In [None]:
#T1/T2 performance is 0.88

In [None]:
#T3/T4 performance is 0.89

### External validation on paper's dataset

### Testing with no title or mesh terms in BOWs (just abstract)- slightly worse performance.

In [9]:
df2 = pickle.load(open('/home/sarahwie/Documents/pubmed-nlp-research/article_replication/df.p', 'rb'))

In [12]:
df2['boolean_label'] = 1
for row in df2.iterrows():
    if row[1]['label'] != 'T3/T4':
        df2.set_value(row[0], 'boolean_label', 0)

In [13]:
a = datetime.datetime.now().replace(microsecond=0)

#5-fold stratified cross validation
#because no validation set, 4/5 of values go to train and 1/5 to test
y = df2['boolean_label'].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True)

j = 1
avg = []
for train_index, test_index in skf:
    print("ROUND", j)
    j = j + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df2.iloc[train_index], df2.iloc[test_index]
    
    print "Cleaning and parsing the training set abstracts...\n"
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the pandas df column
    for i in range(train1["abstract"].size):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        words = KaggleWord2VecUtility.review_to_words( train1["abstract"].iloc[i])
        clean_train_reviews.append(words)
    
    #with tfidf word weighting
    #uses L2 norm by default
    print "Creating the bag of words...\n"

    # Initialize the "TfidfVectorizer" object, which is scikit-learn's
    # bag of words tool.  
    vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None)
                            #max_features = 5000

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()
    #print train_data_features.shape
    
    print "Training the random forest..."

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators = 100, max_depth=1000) 

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit( train_data_features, train1["boolean_label"] )
    
    #Moving on to test set...
    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = [] 

    print "Cleaning and parsing the test set abstracts...\n"
    for i in range(test1["abstract"].size):    
        words = KaggleWord2VecUtility.review_to_words( test1["abstract"].iloc[i])
        clean_test_reviews.append(words)

    print "Adding tfidf weights..."
    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    print "Using the random forest to make sentiment label predictions..."
    # Use the random forest to make sentiment label predictions
    result = forest.predict(test_data_features)

    print "Scoring the test set"
    predictions = result
    print predictions
        
    score = roc_auc_score(test1["boolean_label"], predictions)
    #score = np.size(np.where(predictions == test1["boolean_label"]))*1./np.size(predictions)
    print score
    #append to average
    avg.append(score)


print("Average score of 5 rotations:", sum(avg)/float(len(avg)))
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Cleaning and parsing the training set abstracts...

Creating the bag of words...

Training the random forest...
Cleaning and parsing the test set abstracts...

Adding tfidf weights...
Using the random forest to make sentiment label predictions...
Scoring the test set
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.746478873239
('ROUND', 2)
Cleaning and parsing the training set abstracts...

Creating the bag of words...

Training the random forest...
Cleaning and parsing the test set abstracts...

Adding tfidf weights...
Using the random forest to make sentiment label predictions...
Scoring the test set
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
0.732394366197
('ROUND', 3)
Cleaning and parsing the training set abstracts...

Creating the bag of words...

Training the random for

In [75]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print vocab[0:10]

[u'aa', u'aad', u'aao', u'ab', u'aba', u'abatacept', u'abc', u'abdominal', u'aberrant', u'aberrations', u'abilities', u'ability', u'ablation', u'ablationcarcinoma', u'able', u'abnormal', u'abnormalities', u'abnormalitiesendothelial', u'abnormality', u'abolished', u'about', u'above', u'abr', u'abrogated', u'abs', u'absence', u'absent', u'absolute', u'absorbed', u'absorptiometry', u'absorption', u'abstainers', u'abstinence', u'abstinent', u'abstract', u'abstraction', u'abstractness', u'abt', u'abundance', u'abuse', u'abv', u'academic', u'acamprosate', u'accelerate', u'accelerated', u'acceleration', u'accentuated', u'acceptability', u'acceptable', u'acceptance', u'accepted', u'access', u'accessibility', u'accessible', u'accompanied', u'accomplish', u'accomplished', u'according', u'accordingly', u'account', u'accountability', u'accounted', u'accounting', u'accounts', u'accreditation', u'accrual', u'accumbens', u'accumulation', u'accuracy', u'accurate', u'accurately', u'acd', u'acetyl', u'a

In [76]:
#print weights of each word:

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print count, tag

0.153038730708 aa
0.0982133873484 aad
0.334261857 aao
0.132387004021 ab
0.223450859745 aba
0.62767891362 abatacept
0.149324752465 abc
0.216375794313 abdominal
0.154545326897 aberrant
0.201390386614 aberrations
0.176343848339 abilities
0.553097200177 ability
0.418683746599 ablation
0.0837367493198 ablationcarcinoma
0.449641660288 able
0.368944593203 abnormal
0.531855379495 abnormalities
0.107828960621 abnormalitiesendothelial
0.0537493651298 abnormality
0.144117357901 abolished
1.20426304046 about
0.310155988491 above
0.431345428468 abr
0.0510360739864 abrogated
0.0661935020106 abs
0.394416204582 absence
0.0915959557159 absent
0.20254276448 absolute
0.0493006027346 absorbed
0.121234568067 absorptiometry
0.17175521472 absorption
0.0683689688275 abstainers
0.0543713854538 abstinence
0.271856927269 abstinent
0.43723205745 abstract
0.0747605149712 abstraction
0.0747605149712 abstractness
0.210819100895 abt
0.0828337070841 abundance
0.297236452902 abuse
0.105409550447 abv
0.307286184067 acad

### Import KaggleWord2VecUtility since didn't work from file

In [6]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences