In [1]:
import pickle
import sys
import os
import datetime

import sklearn
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

In [2]:
#Import data from the processing of data in the journal_training_word2vec notebook
#df = pickle.load(open('/home/sarahwie/Documents/pubmed-nlp-research/C_article_replication_and_translational_classification/df.p', 'rb'))
df = pickle.load(open('/mnt/mypartition/pubmed_nlp_research/C_article_replication_and_translational_classification/pickled_objects/dfJournals_trans_categories.p', 'rb'))

In [3]:
print df.shape
print df.columns.values
print df['sentiment'].unique()

(97048, 5)
['sentiment' 'title' 'abstract' 'qualifier_terms' 'descriptor_terms']
['T0' 'T1/T2' 'T3/T4']


In [4]:
#for the journals VM dataframe, change the label of the 'sentiment' column to 'label'
df.columns = ['label', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']

In [5]:
df[0:10]

Unnamed: 0,label,title,abstract,qualifier_terms,descriptor_terms
0,T0,Quantification of the relative contributions o...,Amyotrophic lateral sclerosis (ALS) and fronto...,,
1,T0,Ubiquitin Specific Protease 36 (USP36) control...,Ubiquitination of the TrkA neurotrophin recept...,,
2,T0,Functional Diversity of Cytotoxic tRNase/immun...,Contact-dependent growth inhibition (CDI) is a...,,
3,T0,Identification of a distinct substrate binding...,The type III secretion system (T3SS) effector ...,,
4,T0,Role of chondroitin sulfate (CS) modification ...,Protein tyrosine phosphatase receptor type Z (...,,
5,T0,Chemically programmed bispecific antibodies in...,Chemically programmed bispecific antibodies (b...,,
6,T0,Non-mutagenic Suppression of Enterocyte Ferrop...,Iron transfer across the basolateral membrane ...,,
7,T0,Structural plasticity of cholesteryl ester tra...,Cholesteryl ester transfer protein (CETP) medi...,,
8,T0,Identification of a Membrane-Bound Prepore Spe...,Pore-forming toxins (PFT) are cytolytic protei...,,
9,T0,EspR-dependent ESAT-6 secretion of Mycobacteri...,Attenuation of M. bovis BCG strain is related ...,,


In [12]:
a = datetime.datetime.now().replace(microsecond=0)

y = df['label'].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=10)

#if we want to set max features or not for the BOWs
maxF = 5000

j = 1
avgAll = []
avgT0 = []
avgT1T2 = []
avgT3T4 = []

avgAll_val = []
avgT0_val = []
avgT1T2_val = []
avgT3T4_val = []

for train_index, test_index in skf:
    
    print("ROUND", j)
    j = j + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]

    print "Cleaning and parsing the training set abstracts...\n"
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    for i in range(train1["abstract"].size):
        # Call our function for each one, and add the result to the list of clean reviews
        words = KaggleWord2VecUtility.review_to_words( train1["abstract"].iloc[i])
        clean_train_reviews.append(words)

    print len(clean_train_reviews)

    #with tfidf word weighting
    #uses L2 norm by default
    print "Creating the bag of words...\n"

    # Initialize the "TfidfVectorizer" object, which is scikit-learn's
    # bag of words tool.  
    vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                            #remove stopwords?
                             stop_words = None,
                            max_features = maxF)

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    print len(vectorizer.get_feature_names())

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()
    print train_data_features.shape

    #Moving on to test set...
    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = [] 

    print "Cleaning and parsing the test set abstracts...\n"
    for i in range(test1["abstract"].size):    
        words = KaggleWord2VecUtility.review_to_words( test1["abstract"].iloc[i])
        clean_test_reviews.append(words)

    print "Adding tfidf weights and converting to Bag of Words..."
    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    #------------------------------------------------------------------------------------------------------------    
    print "Training the random forest..."

    #binarize labels
    x = label_binarize(train1['label'], classes=['T0', 'T1/T2', 'T3/T4'])
    n_classes = x.shape[1]
    print n_classes

    x_test = label_binarize(test1['label'], classes=['T0', 'T1/T2', 'T3/T4'])

    # Initialize a Random Forest classifier with 100 trees
    forest = OneVsRestClassifier(RandomForestClassifier(n_estimators = 100, max_depth=1000, n_jobs=-1))

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit( train_data_features, x )

    #get the feature importance for each of the three fits
    print len(forest.estimators_)
    feats = []
    for i in range(len(forest.estimators_)):
        feats.append(forest.estimators_[i].feature_importances_)

    print "Using the random forest to make sentiment label predictions..."
    result = forest.predict(test_data_features)

    print "Scoring the test set"

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(x_test[:, i], result[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    print roc_auc

    avgT0.append(roc_auc[0])
    avgT1T2.append(roc_auc[1])
    avgT3T4.append(roc_auc[2])
    avgAll.append(roc_auc[0])
    avgAll.append(roc_auc[1])
    avgAll.append(roc_auc[2])
    
    #------------------------------------------------------------------------------------------------------------    
    print "Testing on external validation set..."

    print "Adding tfidf weights and converting to Bag of Words..."
    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features_val = vectorizer.transform(clean_test_reviews_val)
    test_data_features_val = test_data_features_val.toarray()
    
    print "Using the random forest to make sentiment label predictions..."
    # Use the random forest to make sentiment label predictions
    #using forest fitted on training data
    result_val = forest.predict(test_data_features_val)

    x_test_val = label_binarize(df_val['label'], classes=['T0', 'T1/T2', 'T3/T4'])

    print "Scoring the test set"
    # Compute ROC curve and ROC area for each class
    fpr_val = dict()
    tpr_val = dict()
    roc_auc_val = dict()
    for i in range(n_classes):
        fpr_val[i], tpr_val[i], _ = roc_curve(x_test_val[:, i], result_val[:, i])
        roc_auc_val[i] = auc(fpr_val[i], tpr_val[i])

    print roc_auc_val
    
    avgT0_val.append(roc_auc_val[0])
    avgT1T2_val.append(roc_auc_val[1])
    avgT3T4_val.append(roc_auc_val[2])
    avgAll_val.append(roc_auc_val[0])
    avgAll_val.append(roc_auc_val[1])
    avgAll_val.append(roc_auc_val[2])

print ""
print "---------------------------------------------------------------------------------"
print "TEST SET"
print "Average for T0 model:"
avgT0 = sum(avgT0) / float(len(avgT0))
print avgT0
print "Average for T1/T2 model:"
avgT1T2 = sum(avgT1T2) / float(len(avgT1T2))
print avgT1T2
print "Average for T3/T4 model:"
avgT3T4 = sum(avgT3T4) / float(len(avgT3T4))
print avgT3T4
print ""
print "Average across all models:"
print sum(avgAll) / float(len(avgAll))

print ""
print "---------------------------------------------------------------------------------"
print "VALIDATION SET"
print "Average for T0 model:"
avgT0_val = sum(avgT0_val) / float(len(avgT0_val))
print avgT0_val
print "Average for T1/T2 model:"
avgT1T2_val = sum(avgT1T2_val) / float(len(avgT1T2_val))
print avgT1T2_val
print "Average for T3/T4 model:"
avgT3T4_val = sum(avgT3T4_val) / float(len(avgT3T4_val))
print avgT3T4_val
print ""
print "Average across all models:"
print sum(avgAll_val) / float(len(avgAll_val))

b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Cleaning and parsing the training set abstracts...

77637
Creating the bag of words...

5000
(77637, 5000)
Cleaning and parsing the test set abstracts...

Adding tfidf weights and converting to Bag of Words...
Training the random forest...
3
3
Using the random forest to make sentiment label predictions...
Scoring the test set
{0: 0.96667583127426104, 1: 0.87173826329066351, 2: 0.88623631420305227}
Testing on external validation set...
Adding tfidf weights and converting to Bag of Words...
Using the random forest to make sentiment label predictions...
Scoring the test set
{0: 0.63580246913580241, 1: 0.69888934594816943, 2: 0.70507364975450082}
('ROUND', 2)
Cleaning and parsing the training set abstracts...

77638
Creating the bag of words...

5000
(77638, 5000)
Cleaning and parsing the test set abstracts...

Adding tfidf weights and converting to Bag of Words...
Training the random forest...
3


KeyboardInterrupt: 

In [26]:
#our vocab is in the vectorizer- same for all
pickle.dump(vectorizer, open('/mnt/mypartition/reproducing_methods/vectorizerJournals.p', 'wb'))
pickle.dump(forest, open('/mnt/mypartition/reproducing_methods/forestJournals.p', 'wb'))

### External validation on paper's dataset

In [9]:
df_val = pickle.load(open('/mnt/mypartition/pubmed_nlp_research/C_article_replication_and_translational_classification/df.p', 'rb'))

In [10]:
#check that column names are the same
print df_val.shape
print df_val.columns.values
print df_val['label'].unique()
df_val[0:10]

(354, 6)
['pmid' 'title' 'abstract' 'qualifier_terms' 'descriptor_terms' 'label']
['T0' 'T1/T2' 'T3/T4' 'TX']


Unnamed: 0,pmid,title,abstract,qualifier_terms,descriptor_terms,label
0,23195993,Gene expression profiles in peripheral blood m...,Occupational exposure to nickel (Ni) is associ...,genetics metabolism epidemiology drug effects ...,Adult Asian Continental Ancestry Group Biomark...,T0
1,25077433,SOX2 is a cancer-specific regulator of tumour ...,Although the principles that balance stem cell...,genetics metabolism pathology metabolism patho...,"Animals Carcinoma, Squamous Cell Cell Line, Tu...",T0
2,24107601,Imaging and cerebrospinal fluid biomarkers in ...,The pathophysiological process of Alzheimer's ...,cerebrospinal fluid genetics radionuclide imag...,"Aged Aged, 80 and over Alzheimer Disease Amylo...",T0
3,24891029,Preliminary evidence of cognitive and brain ab...,To ascertain whether pediatric obesity without...,pathology physiopathology physiopathology,Adolescent Attention Brain Cognition Executive...,T0
4,21691448,Obese Adolescents with Type 2 Diabetes Mellitu...,The rates of type 2 diabetes (T2DM) continue t...,,,T0
5,22765222,Diversity of 5S rRNA genes within individual p...,We examined intragenomic variation of paralogo...,chemistry classification genetics chemistry ge...,"Bacteria DNA, Ribosomal Databases, Nucleic Aci...",T0
6,24401686,Myoinositol and glutamate complex neurometabol...,To obtain quantitative neurometabolite measure...,analogs & derivatives metabolism metabolism me...,Adult Aspartic Acid Brain Injuries Case-Contro...,T0
7,22914093,Antibiotics in early life alter the murine col...,Antibiotics administered in low doses have bee...,drug effects physiology administration & dosag...,Adiposity Age Factors Animals Anti-Bacterial A...,T0
8,23426830,Elevated serum anti-Müllerian hormone in adole...,Serum anti-Müllerian hormone (AMH) is linked t...,blood pathology ultrasonography blood ultrason...,Adolescent Anti-Mullerian Hormone Child Female...,T0
9,24344399,Association of obesity-mediated insulin resist...,The hypothalamus is important in hunger and me...,blood analysis anatomy & histology blood blood...,Adolescent Adult Brain-Derived Neurotrophic Fa...,T0


In [None]:
#Optional: Remove 'TX' scores to test performance
df_val = df_val[df_val['label'] != 'TX']
print len(df_val)

In [11]:
#Moving on to test set...
# Create an empty list and append the clean reviews one by one
clean_test_reviews_val = [] 

print "Cleaning and parsing the test set abstracts...\n"
for i in range(df_val["abstract"].size):    
    words = KaggleWord2VecUtility.review_to_words( df_val["abstract"].iloc[i])
    clean_test_reviews_val.append(words)

Cleaning and parsing the test set abstracts...



In [7]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

from nltk import PorterStemmer

class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False, stemmer=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
            
        if stemmer:
            words = [PorterStemmer().stem_word(w) for w in words]
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences