In [1]:
import pickle
import sys
import os
import datetime

import sklearn
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.datasets import dump_svmlight_file
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

from subprocess import call

In [2]:
#Import data from the processing of data in the journal_training_word2vec notebook
#df = pickle.load(open('/home/sarahwie/Documents/pubmed-nlp-research/C_article_replication_and_translational_classification/df.p', 'rb'))
df = pickle.load(open('/mnt/mypartition/pubmed_nlp_research/C_article_replication_and_translational_classification/xml_files_pickled_objects/pickled_objects/dfJournals_trans_categories.p', 'rb'))

In [4]:
print df.shape
print df.columns.values
print df['sentiment'].unique()

(97049, 5)
['sentiment' 'title' 'abstract' 'qualifier_terms' 'descriptor_terms']
['T0' 'T1/T2' 'T3/T4']


In [5]:
#for the journals VM dataframe, change the label of the 'sentiment' column to 'label'
df.columns = ['label', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']

In [6]:
df[0:10]

Unnamed: 0,label,title,abstract,qualifier_terms,descriptor_terms
0,T0,Quantification of the relative contributions o...,Amyotrophic lateral sclerosis (ALS) and fronto...,,
1,T0,Ubiquitin Specific Protease 36 (USP36) control...,Ubiquitination of the TrkA neurotrophin recept...,,
2,T0,Functional Diversity of Cytotoxic tRNase/immun...,Contact-dependent growth inhibition (CDI) is a...,,
3,T0,Identification of a distinct substrate binding...,The type III secretion system (T3SS) effector ...,,
4,T0,Role of chondroitin sulfate (CS) modification ...,Protein tyrosine phosphatase receptor type Z (...,,
5,T0,Chemically programmed bispecific antibodies in...,Chemically programmed bispecific antibodies (b...,,
6,T0,Non-mutagenic Suppression of Enterocyte Ferrop...,Iron transfer across the basolateral membrane ...,,
7,T0,Structural plasticity of cholesteryl ester tra...,Cholesteryl ester transfer protein (CETP) medi...,,
8,T0,Identification of a Membrane-Bound Prepore Spe...,Pore-forming toxins (PFT) are cytolytic protei...,,
9,T0,EspR-dependent ESAT-6 secretion of Mycobacteri...,Attenuation of M. bovis BCG strain is related ...,,


In [11]:
a = datetime.datetime.now().replace(microsecond=0)

y = df['label'].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True)

#include mesh terms in bag of words?
mesh_terms = True
#remove stopwords?
stops = False
#if we want to set max features or not for the BOWs
maxF = 5000

j = 1
avgT0 = []
avgT1T2 = []
avgT3T4 = []
for train_index, test_index in skf:
    #get first test/train split and use
    #if j == 1:

    print("ROUND", j)
    j = j + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]

    print "Cleaning and parsing the training set abstracts...\n"
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the pandas df column
    for i in range(train1["abstract"].size):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        words = KaggleWord2VecUtility.review_to_words( train1["abstract"].iloc[i], remove_stopwords=stops)
        words = words + ' ' + KaggleWord2VecUtility.review_to_words( train1["title"].iloc[i], remove_stopwords=stops)
        if mesh_terms:
            words = words + ' ' + KaggleWord2VecUtility.review_to_words( train1["descriptor_terms"].iloc[i], remove_stopwords=stops)
            words = words + ' ' + KaggleWord2VecUtility.review_to_words( train1["qualifier_terms"].iloc[i], remove_stopwords=stops)
        clean_train_reviews.append(words)

    print len(clean_train_reviews)

    #with tfidf word weighting
    #uses L2 norm by default
    print "Creating the bag of words...\n"

    # Initialize the "TfidfVectorizer" object, which is scikit-learn's
    # bag of words tool.  
    vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                            #already removed stopwords when preprocessing reviews into words
                             stop_words = None,
                            max_features = maxF)

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    print len(vectorizer.get_feature_names())

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()
    
    #Moving on to test set...
    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = [] 

    print "Cleaning and parsing the test set abstracts...\n"
    for i in range(test1["abstract"].size):    
        words = KaggleWord2VecUtility.review_to_words( test1["abstract"].iloc[i], remove_stopwords=stops)
        words = words + ' ' + KaggleWord2VecUtility.review_to_words( test1["title"].iloc[i], remove_stopwords=stops)
        if mesh_terms:
            words = words + ' ' + KaggleWord2VecUtility.review_to_words( test1["descriptor_terms"].iloc[i], remove_stopwords=stops)
            words = words + ' ' + KaggleWord2VecUtility.review_to_words( test1["qualifier_terms"].iloc[i], remove_stopwords=stops)
        clean_test_reviews.append(words)

    print "Adding tfidf weights and converting to Bag of Words..."
    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    
    #------------------------------------------------------------------------------------------------------------    
    #call FEST function
    scoreT0 = FEST('T0', train_data_features, train1, test_data_features, test1, "modelT0")
    print scoreT0
    avgT0.append(scoreT0)
    
    scoreT1T2 = FEST('T1/T2', train_data_features, train1, test_data_features, test1, "modelT1T2")
    print scoreT1T2
    avgT1T2.append(scoreT1T2)
    
    scoreT3T4 = FEST('T3/T4', train_data_features, train1, test_data_features, test1, "modelT3T4")
    print scoreT3T4
    avgT3T4.append(scoreT3T4)

print sum(avgT0) / float(len(avgT0))
print sum(avgT1T2) / float(len(avgT1T2))
print sum(avgT3T4) / float(len(avgT3T4))

os.remove("./probs")
os.remove("./svmlight.dat")
os.remove("./svmlight_test.dat")

b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Cleaning and parsing the training set abstracts...

77637
Creating the bag of words...

5000
Cleaning and parsing the test set abstracts...

Adding tfidf weights and converting to Bag of Words...
Training the random forest...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Using the random forest to make sentiment label predictions...
Scoring the test set
0.974979048986
Training the random forest...
Using the random forest to make sentiment label predictions...
Scoring the test set
0.899376941085
Training the random forest...
Using the random forest to make sentiment label predictions...
Scoring the test set
0.904664952906
('ROUND', 2)
Cleaning and parsing the training set abstracts...

77639
Creating the bag of words...

5000
Cleaning and parsing the test set abstracts...

Adding tfidf weights and converting to Bag of Words...
Training the random forest...
Using the random forest to make sentiment label predictions...
Scoring the test set
0.97640461163
Training the random forest...
Using the random forest to make sentiment label predictions...
Scoring the test set
0.895525854029
Training the random forest...
Using the random forest to make sentiment label predictions...
Scoring the test set
0.903755464546
('ROUND', 3)
Cleaning and parsing the training s

In [9]:
def FEST(label, train_data_features, train1, test_data_features, test1, modelname):
    print "Training the random forest..."
    
    train1['boolean_label'] = 1
    for row in train1.iterrows():
        if row[1]['label'] != label:
            train1.set_value(row[0], 'boolean_label', 0)
            
    test1['boolean_label'] = 1
    for row in test1.iterrows():
        if row[1]['label'] != label:
            test1.set_value(row[0], 'boolean_label', 0)
    
    dump_svmlight_file(train_data_features, train1.boolean_label,'svmlight.dat',zero_based=True,multilabel=False)

    dump_svmlight_file(test_data_features, test1.boolean_label,'svmlight_test.dat',zero_based=True,multilabel=False)

    call(["/home/nlp-vm//Downloads/fest/festlearn", "-c 3", 
      "./svmlight.dat", modelname])

    print "Using the random forest to make sentiment label predictions..."\

    #get predictions for test set
    call(["/home/nlp-vm/Downloads/fest/festclassify", 
      "./svmlight_test.dat", modelname, "./probs"])

    print "Scoring the test set"
    probs = pd.read_table('./probs', header=None)
    probs.columns = ['probs']
    
    predictions = np.ones((probs.shape[0]))
    predictions[np.where(probs.iloc[:,0] < 0.5)] = 0 # The first column is the negative model

    score = roc_auc_score(test1["boolean_label"], predictions)
    
    return score

In [28]:
def FEST_scoring(label, test_data_features, test1, modelname):
    
    test1['boolean_label'] = 1
    for row in test1.iterrows():
        if row[1]['label'] != label:
            test1.set_value(row[0], 'boolean_label', 0)
    
    dump_svmlight_file(test_data_features, test1.boolean_label,'svmlight_test.dat',zero_based=True,multilabel=False)

    #get predictions for test set
    call(["/home/nlp-vm/Downloads/fest/festclassify", 
      "./svmlight_test.dat", modelname, "./probs"])

    print "Scoring the test set"
    probs = pd.read_table('./probs', header=None)
    probs.columns = ['probs']
    
    predictions = np.ones((probs.shape[0]))
    predictions[np.where(probs.iloc[:,0] < 0.5)] = 0 # The first column is the negative model

    score = roc_auc_score(test1["boolean_label"], predictions)
    
    return score

### External validation on paper's dataset

In [12]:
df_val = pickle.load(open('/mnt/mypartition/pubmed_nlp_research/C_article_replication_and_translational_classification/df.p', 'rb'))

In [25]:
#check that column names are the same
print df_val.shape
print df_val.columns.values
print df_val['label'].unique()
df_val[0:10]

(354, 6)
['pmid' 'title' 'abstract' 'qualifier_terms' 'descriptor_terms' 'label']
['T0' 'T1/T2' 'T3/T4' 'TX']


In [26]:
#Moving on to test set...
# Create an empty list and append the clean reviews one by one
clean_test_reviews_val = [] 

print "Cleaning and parsing the test set abstracts...\n"
for i in range(df_val["abstract"].size):    
    words = KaggleWord2VecUtility.review_to_words( df_val["abstract"].iloc[i], remove_stopwords=stops)
    words = words + ' ' + KaggleWord2VecUtility.review_to_words( df_val["title"].iloc[i], remove_stopwords=stops)
    if mesh_terms:
        words = words + ' ' + KaggleWord2VecUtility.review_to_words( df_val["descriptor_terms"].iloc[i], remove_stopwords=stops)
        words = words + ' ' + KaggleWord2VecUtility.review_to_words( df_val["qualifier_terms"].iloc[i], remove_stopwords=stops)
    clean_test_reviews_val.append(words)

print "Adding tfidf weights and converting to Bag of Words..."
# Get a bag of words for the test set, and convert to a numpy array
test_data_features_val = vectorizer.transform(clean_test_reviews_val)
test_data_features_val = test_data_features_val.toarray()

Cleaning and parsing the test set abstracts...

Adding tfidf weights and converting to Bag of Words...


In [29]:
#------------------------------------------------------------------------------------------------------------    
#call FEST function
scoreT0 = FEST_scoring('T0', test_data_features_val, df_val, "modelT0")
print scoreT0

scoreT1T2 = FEST_scoring('T1/T2', test_data_features_val, df_val, "modelT1T2")
print scoreT1T2

scoreT3T4 = FEST_scoring('T3/T4', test_data_features_val, df_val, "modelT3T4")
print scoreT3T4

os.remove("./probs")
os.remove("./svmlight_test.dat")

Scoring the test set
0.619695216049
Scoring the test set
0.724444672974
Scoring the test set
0.717062193126


In [7]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences