In [65]:
import pickle
import datetime
import os
import sys

import sklearn
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

In [46]:
#Import data from the processing of data in the journal_training_word2vec notebook
df = pickle.load(open('/home/sarahwie/Documents/pubmed-nlp-research/pickled_objects/dfJournals.p', 'rb'))

In [48]:
print df.shape
print df.columns.values

(422, 3)
['sentiment' 'journal' 'abstract']


In [None]:
#on VM
os.chdir('/mnt/mypartition/Desktop2/pubmed_nlp_research/DeepLearningMovies_datasets/')
import KaggleWord2VecUtility

In [47]:
#on local:
sys.path.append('/home/sarahwie/Documents/pubmed-nlp-research/DeepLearningMovies_datasets/')
from KaggleWord2VecUtility import KaggleWord2VecUtility
DATADIR='/home/sarahwie/Documents/pubmed-nlp-research/DeepLearningMovies_datasets/'

### Get training and testing sets from df via cross-validation

In [70]:
a = datetime.datetime.now().replace(microsecond=0)

#5-fold stratified cross validation

#because no validation set, 4/5 of values go to train and 1/5 to test
#is this too high?***
#even though we shuffle, not as randomly distributed as the former method was
y = df["sentiment"].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True)

j = 1
avg = []
for train_index, test_index in skf:
    print("ROUND", j)
    j = j + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]
    
    print "Cleaning and parsing the training set abstracts...\n"
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the pandas df column
    for i in range(train1["abstract"].size):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        clean_train_reviews.append(KaggleWord2VecUtility.review_to_words( train1["abstract"].iloc[i] ) )
    
    #with tfidf word weighting
    #uses L2 norm by default
    print "Creating the bag of words...\n"

    # Initialize the "TfidfVectorizer" object, which is scikit-learn's
    # bag of words tool.  
    vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None)
                            #max_features = 5000

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()
    
    print "Training the random forest..."

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators = 100, max_depth=1000) 

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit( train_data_features, train1["sentiment"] )
    
    #Moving on to test set...
    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = [] 

    print "Cleaning and parsing the test set abstracts...\n"
    for i in range(test1["abstract"].size):
        clean_test_reviews.append(KaggleWord2VecUtility.review_to_words( test1["abstract"].iloc[i] ) )

    print "Adding tfidf weights..."
    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    print "Using the random forest to make sentiment label predictions..."
    # Use the random forest to make sentiment label predictions
    result = forest.predict(test_data_features)

    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    #output = pd.DataFrame( data={"id":test1["id"], "sentiment":result} )

    print "Scoring the test set"
    predictions = result
    print predictions
        
    score = roc_auc_score(test1["sentiment"], predictions)
    score = np.size(np.where(predictions == test1["sentiment"]))*1./np.size(predictions)
    print score
    #append to average
    avg.append(score)


print("Average score of 5 rotations:", sum(avg)/float(len(avg)))
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Cleaning and parsing the training set abstracts...

Creating the bag of words...

Training the random forest...
Cleaning and parsing the test set abstracts...

Adding tfidf weights...
Using the random forest to make sentiment label predictions...
Scoring the test set
[0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1
 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 1 1 1 1]
0.835294117647
('ROUND', 2)
Cleaning and parsing the training set abstracts...

Creating the bag of words...

Training the random forest...
Cleaning and parsing the test set abstracts...

Adding tfidf weights...
Using the random forest to make sentiment label predictions...
Scoring the test set
[0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0
 1 1 1 1 1 1 1 1 1 0 1]
0.870588235294
('ROUND', 3)
Cleaning and parsing the training set abstracts

In [71]:
print train_data_features.shape

(338, 5766)


In [72]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print vocab

[u'abbreviations', u'ability', u'able', u'abnormal', u'abnormalities', u'abolish', u'abolished', u'abolishes', u'about', u'above', u'absence', u'absent', u'absolute', u'absolutely', u'absorbance', u'absorption', u'absorptions', u'abundance', u'abundant', u'acad', u'acanthamoeba', u'acccounting', u'accelerated', u'accelerating', u'accentuated', u'accepted', u'acceptor', u'acceptors', u'accepts', u'access', u'accessible', u'accompanied', u'accompanying', u'accomplished', u'accord', u'according', u'account', u'accounted', u'accounts', u'accumulated', u'accumulates', u'accumulation', u'accurate', u'acetamido', u'acetamidoestrone', u'acetamidoglucose', u'acetamidoglucosyl', u'acetate', u'acetic', u'acetoacetyl', u'acetohydroxy', u'acetolysis', u'acetone', u'acetyl', u'acetylated', u'acetylation', u'acetylene', u'acetylgalactosaminitol', u'acetylglucosamine', u'acetylglutamate', u'acetylneuraminic', u'acetylneuraminyl', u'acetylserine', u'achieved', u'acid', u'acidic', u'acidification', u'ac

In [73]:
#print weights of each word
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print count, tag

0.133017263488 abbreviations
1.12471512993 ability
0.703249660574 able
0.0595914717491 abnormal
0.11308216626 abnormalities
0.0881912175229 abolish
0.438510891136 abolished
0.0865120584481 abolishes
2.17649884532 about
0.419397829156 above
1.63495203442 absence
0.506923443174 absent
0.447102307557 absolute
0.153880944596 absolutely
0.334530454436 absorbance
0.437600332896 absorption
0.0826272076373 absorptions
0.105825029058 abundance
0.224308285649 abundant
0.125584432808 acad
0.096286551654 acanthamoeba
0.0657974313549 acccounting
0.0516368053195 accelerated
0.0562026918061 accelerating
0.0908261586525 accentuated
0.0882942239601 accepted
1.18738163121 acceptor
0.383692577839 acceptors
0.0826272076373 accepts
0.0852985236737 access
0.290508775248 accessible
0.361537973834 accompanied
0.144036819257 accompanying
0.0905376393802 accomplished
0.0536744057791 accord
0.225557509246 according
0.616993625928 account
0.496919919817 accounted
0.207540816776 accounts
0.347431180022 accumulated

### Import KaggleWord2VecUtility since didn't work from file

In [15]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences