In [29]:
import pickle
import numpy as np
import datetime
import nltk

In [30]:
df = pickle.load(open('/mnt/mypartition/pubmed_nlp_research/C_article_replication_and_translational_classification/pickled_objects/dfJournals_trans_categories.p', 'rb'))

In [31]:
#check labels- great
print df.columns.values
print df['sentiment'].unique()

df.columns = ['label', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']

['sentiment' 'title' 'abstract' 'qualifier_terms' 'descriptor_terms']
['T0' 'T1/T2' 'T3/T4']


In [32]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [33]:
df[0:10]

Unnamed: 0,label,title,abstract,qualifier_terms,descriptor_terms
0,T0,Quantification of the relative contributions o...,Amyotrophic lateral sclerosis (ALS) and fronto...,,
1,T0,Ubiquitin Specific Protease 36 (USP36) control...,Ubiquitination of the TrkA neurotrophin recept...,,
2,T0,Functional Diversity of Cytotoxic tRNase/immun...,Contact-dependent growth inhibition (CDI) is a...,,
3,T0,Identification of a distinct substrate binding...,The type III secretion system (T3SS) effector ...,,
4,T0,Role of chondroitin sulfate (CS) modification ...,Protein tyrosine phosphatase receptor type Z (...,,
5,T0,Chemically programmed bispecific antibodies in...,Chemically programmed bispecific antibodies (b...,,
6,T0,Non-mutagenic Suppression of Enterocyte Ferrop...,Iron transfer across the basolateral membrane ...,,
7,T0,Structural plasticity of cholesteryl ester tra...,Cholesteryl ester transfer protein (CETP) medi...,,
8,T0,Identification of a Membrane-Bound Prepore Spe...,Pore-forming toxins (PFT) are cytolytic protei...,,
9,T0,EspR-dependent ESAT-6 secretion of Mycobacteri...,Attenuation of M. bovis BCG strain is related ...,,


In [35]:
#dataframe includes one record without an abstract hmm- FIXED
df.loc[df['abstract'] =='']

Unnamed: 0,label,title,abstract,qualifier_terms,descriptor_terms


### Set random seed (*run once*)

In [37]:
np.random.seed(10)

### Generate probssentences for T0, then T1/T2 and T3/T4

In [36]:
#add boolean labels
df['boolean_label'] = 1
for row in df.iterrows():
    if row[1]['label'] != 'T0':
        df.set_value(row[0], 'boolean_label', 0)

In [38]:
#not doing stratified but just shuffling to match method used for probssentences (movie reviews)
#do a larger split
fracTrain = 0.80
nSamples = df.shape[0]
order = np.random.permutation(nSamples) # come up with a random ordering
splitIndex = int(np.round(nSamples*fracTrain))
train1 = df.ix[order[:splitIndex],:]
test1 = df.ix[order[splitIndex:],:]

In [39]:
pickle.dump(train1, open('./pickled_objects/train1T0.p', 'wb'))
pickle.dump(test1, open('./pickled_objects/test1T0.p', 'wb'))

In [40]:
print train1.shape
print test1.shape

(77638, 6)
(19410, 6)


In [41]:
train1.loc[train1['abstract'] == '']

Unnamed: 0,label,title,abstract,qualifier_terms,descriptor_terms,boolean_label


In [42]:
test1.loc[test1['abstract'] == '']

Unnamed: 0,label,title,abstract,qualifier_terms,descriptor_terms,boolean_label


In [43]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* models: the candidate word2vec models (each potential class)

it returns the array of class probabilities.  Everything is done in-memory.
"""

def docprobsentences(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    return prob

In [46]:
a = datetime.datetime.now().replace(microsecond=0)

# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences_pos = []  # Initialize an empty list of sentences
sentences_neg = []  # Initialize an empty list of sentences

inxs_pos = np.where(train1["boolean_label"] == 1)[0].tolist()
inxs_neg = np.where(train1["boolean_label"] == 0)[0].tolist()

print "Parsing sentences from training set"
for inx in inxs_pos:
    review = train1["abstract"].iloc[inx]
    sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

for inx in inxs_neg:
    review = train1["abstract"].iloc[inx]
    sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)    


# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences = []  # Initialize an empty list of sentences
docsTrain = []
for review in train1["abstract"]:
    result = KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
    sentences += result
    docsTrain.append(result)
        
        
print "Building Word2Vec model"
from gensim.models import Word2Vec
import multiprocessing

## create a w2v learner 
basemodel = Word2Vec(
    workers=multiprocessing.cpu_count(), # use your cores
    iter=3, # iter = sweeps of SGD through the data; more is better
    hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
    )       
basemodel.build_vocab(sentences) 
        
from copy import deepcopy
models = [deepcopy(basemodel) for i in range(2)]
models[0].train(sentences_neg, total_examples=len(sentences_neg) )
models[1].train(sentences_pos, total_examples=len(sentences_pos) )

print "Parsing sentences from test set"
# read in the test set as a list of a list of words
docs = []
for review in test1["abstract"]:
    docs.append(KaggleWord2VecUtility.review_to_sentences(review, tokenizer))

print "scoring test set"
# get the probs (note we give docprob our test set plus the models)
#probs = docprob(docs,models).astype(object)
probssentencesArticle = docprobsentences(docs,models).astype(object)

probssentences_train = docprobsentences(docsTrain,models).astype(object)

pickle.dump(probssentencesArticle, open('./pickled_objects/probssentencesJournalsDatasetT0.p', 'wb'))
pickle.dump(probssentences_train, open('./pickled_objects/probssentencesJournalsDataset_trainT0.p', 'wb'))
        
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

Parsing sentences from training set




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.

Building Word2Vec model
Parsing sentences from test set


AttributeError: 'float' object has no attribute 'decode'

In [16]:
print probssentencesArticle.shape
print probssentences_train.shape

(160654, 3)


In [17]:
#print probssentencesArticle
print len(probssentencesArticle['doc'].unique())
print len(probssentences_train['doc'].unique())

19409


### Kaggle Word2VecUtility

In [45]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences