#### Shannon Hamilton, ANLP 2016

## Part 1: K-Means vs LDA

### Imports + reading file

In [1]:
# below code adapted from Brandon Rose's blog post on clustering: 
# http://brandonrose.org/clustering

import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction

In [2]:
# code to break up my txt file of pubmed articles into a single list
# each article saved as an item in the list
# sourced here, http://stackoverflow.com/questions/22364468/how-to-split-txt-file-to-multiple-file-base-on-content

import re

def open_chunk(readfunc, delimiter, chunksize=1024):
    """
    http://stackoverflow.com/a/17508761/190597
    readfunc(chunksize) should return a string.
    """
    remainder = ''
    for chunk in iter(lambda: readfunc(chunksize), ''):
        pieces = re.split(delimiter, remainder + chunk)
        for piece in pieces[:-1]:
            yield piece
        remainder = pieces[-1]
    if remainder:
        yield remainder

with open('pubmed_depression.txt', 'r') as infile:
    pubmed_list = []
    chunks = open_chunk(infile.read, delimiter=r'(PMID.*)')
    for i, (chunk, delim) in enumerate(zip(*[chunks]*2)):
        chunk = chunk+delim
        chunk = chunk.strip()
        if chunk:
            pubmed_list.append(chunk)

### Stemming, Lemmatizing and Tokenizing

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [4]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [5]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in pubmed_list:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [6]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 80582 items in vocab_frame


### Tf-idf and document similarity

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, max_features=200000,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(pubmed_list) #fit the vectorizer to text

print(tfidf_matrix.shape)

CPU times: user 4.14 s, sys: 44.8 ms, total: 4.19 s
Wall time: 4.21 s
(250, 308)


In [8]:
terms = tfidf_vectorizer.get_feature_names()


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

### K-means Clustering

For K-Means, I changed num_clusters to 3, 5, 10, 20 and found 10 to provide the best results. Honestly, the clusters are still a bit muddled (my corpous is a set of 250 pubmed articles, all on mental health). It was interesting to see studies clustered that are US (#8, 9) based versus Canada-based (#6, 7), or themes of studies (ie: suicide/9, anxiety/4, bipolar/3, women/1). 

In [10]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 283 ms, sys: 3.33 ms, total: 286 ms
Wall time: 289 ms


In [11]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [12]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :10]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
#     print("Cluster %d titles:" % i, end='')
#     for title in frame.ix[i]['title'].values.tolist():
#         print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace

Top terms per cluster:

Cluster 0 words: b'study', b'intervention', b'trials', b'effect', b'therapy', b'outcomes', b'treatments', b'systematic', b'participants', b'psychological',



Cluster 1 words: b'health', b'mental', b'women', b'mental', b'study', b'department', b'public', b'public', b'prevalent', b"'s",



Cluster 2 words: b'effect', b'therapy', b'electronic', b'electronic', b'addressing', b'conditions', b'ltd.', b'study', b'disorder', b'ltd.',



Cluster 3 words: b'disorder', b'response', b'institute', b'usa', b'behavioral', b'research', b'department', b'brain', b'psychiatry', b'process',



Cluster 4 words: b'anxiety', b'depression', b'symptoms', b'department', b'study', b'united', b'associated', b'prevalent', b'health', b'patients',



Cluster 5 words: b'patients', b'disease', b'center', b'carefully', b'factors', b'medical', b'medical', b'medicinal', b'risk', b'risk',



Cluster 6 words: b'drug', b'disease', b'disorder', b'treatments', b'clinical', b'symptoms', b'therapy', b'c

### LDA

I appreciated being able to compare results from LDA to K-Means. LDA clusters were also a bit muddled, but again, I think clinical trial themes seem to bubble up pretty well: diabetes/2, maternal + women/3, chronic pain/4, sleep-related/6, suicide/7, anxiety/8. I chose to oput 10 clusters so that I could compare results more easily with K-Means. Exciting! 

In [13]:
import string
def strip_proppers(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word.islower()]
    return "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]).strip()

In [14]:
from gensim import corpora, models, similarities 

# with open('pubmed_depression_bodytext.txt', 'r') as handle:
#     raw = handle.read().replace('\n', ' ')

#remove proper names
%time preprocess = [strip_proppers(doc) for doc in pubmed_list]

#tokenize
%time tokenized_text = [tokenize_and_stem(text) for text in preprocess]

#remove stop words
%time texts = [[word for word in text if word not in stopwords] for text in tokenized_text]



CPU times: user 1.95 s, sys: 9.04 ms, total: 1.96 s
Wall time: 1.98 s
CPU times: user 1.76 s, sys: 13.2 ms, total: 1.77 s
Wall time: 1.79 s
CPU times: user 148 ms, sys: 843 µs, total: 149 ms
Wall time: 149 ms


In [15]:
#create a Gensim dictionary from the texts
dictionary = corpora.Dictionary(texts)

#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.8)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

In [50]:
%time lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

CPU times: user 3min 34s, sys: 2.53 s, total: 3min 37s
Wall time: 3min 50s


In [51]:
lda.show_topics()

[(0,
  '0.017*"studi" + 0.015*"intervent" + 0.015*"review" + 0.012*"includ" + 0.011*"health" + 0.010*"provid" + 0.009*"use" + 0.009*"effect" + 0.008*"popul" + 0.008*"address"'),
 (1,
  '0.008*"massag" + 0.007*"function" + 0.006*"mental" + 0.006*"health" + 0.006*"evid" + 0.006*"risk" + 0.005*"stress" + 0.005*"opioid" + 0.005*"process" + 0.005*"diet"'),
 (2,
  '0.013*"disord" + 0.013*"treatment" + 0.011*"patient" + 0.011*"diseas" + 0.010*"studi" + 0.008*"review" + 0.008*"includ" + 0.008*"diabet" + 0.007*"process" + 0.007*"clinic"'),
 (3,
  '0.010*"studi" + 0.009*"stroke" + 0.008*"matern" + 0.008*"develop" + 0.007*"disord" + 0.007*"mechan" + 0.007*"women" + 0.006*"function" + 0.005*"dure" + 0.005*"process"'),
 (4,
  '0.026*"pain" + 0.020*"chronic" + 0.015*"de" + 0.011*"dystonia" + 0.010*"du" + 0.009*"antibodi" + 0.008*"intervent" + 0.008*"patient" + 0.008*"factor" + 0.008*"prevent"'),
 (5,
  '0.024*"studi" + 0.019*"effect" + 0.018*"intervent" + 0.016*"trial" + 0.012*"therapi" + 0.011*"inc

## Part 2: Word2Vec vs WordNet

For nouns, I believe Word2Vec provides nouns more inline with the context of my corpus. For adjectives, I believe Word2Vec also is better than WordNet. There appears to be more word options and again, are more in line with the context of my corpus. For verbs, I think that WordNet did better than Word2Vec. There appears to be more verb options available, and Word2Vec appears to just put forth other forms of the same verb (ie: cures > cures, curing, cured).  

In [18]:
import nltk
import numpy as np
import gensim
from gensim.models import Word2Vec
from nltk.data import find
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

In [19]:
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.Word2Vec.load_word2vec_format(word2vec_sample, binary=False)

#### Nouns

##### Word2Vec

In [20]:
model.most_similar(positive=['study'], topn = 5)

[('studies', 0.7211395502090454),
 ('Study', 0.6836080551147461),
 ('survey', 0.6731963753700256),
 ('researchers', 0.6250616312026978),
 ('research', 0.6238202452659607)]

In [21]:
model.most_similar(positive=['intervention'], topn = 5)

[('intervene', 0.5795313119888306),
 ('intervening', 0.5319258570671082),
 ('intervened', 0.4465080797672272),
 ('assistance', 0.42715123295783997),
 ('mobilization', 0.3966519832611084)]

In [22]:
model.most_similar(positive=['review'], topn = 5)

[('reviewed', 0.6630408763885498),
 ('reviewing', 0.6609559059143066),
 ('reviews', 0.6379557251930237),
 ('evaluation', 0.6035541296005249),
 ('assessment', 0.5333109498023987)]

In [23]:
model.most_similar(positive=['patient'], topn = 5)

[('patients', 0.7280630469322205),
 ('physicians', 0.6102133393287659),
 ('physician', 0.6056575775146484),
 ('clinical', 0.550094485282898),
 ('surgical', 0.5453905463218689)]

In [24]:
model.most_similar(positive=['suicide'], topn = 5)

[('suicides', 0.6633865833282471),
 ('homicide', 0.513701856136322),
 ('murder', 0.49831241369247437),
 ('death', 0.47814804315567017),
 ('murders', 0.4759059548377991)]

##### WordNet

In [25]:
wn.synsets('study', wn.NOUN)[0:3]

[Synset('survey.n.01'), Synset('study.n.02'), Synset('report.n.01')]

In [26]:
wn.synsets('intervention', wn.NOUN)[0:3]

[Synset('intervention.n.01'),
 Synset('intervention.n.02'),
 Synset('interposition.n.02')]

In [27]:
wn.synsets('review', wn.NOUN)[0:3]

[Synset('reappraisal.n.01'), Synset('review.n.02'), Synset('follow-up.n.03')]

In [28]:
wn.synsets('patient', wn.NOUN)[0:3]

[Synset('patient.n.01'), Synset('affected_role.n.01')]

In [29]:
wn.synsets('suicide', wn.NOUN)[0:3]

[Synset('suicide.n.01'), Synset('suicide.n.02')]

#### Adjectives

##### Word2Vec

In [30]:
model.most_similar(positive=['systematic'], topn = 5)

[('systematically', 0.5535138845443726),
 ('methodical', 0.5095993876457214),
 ('deliberate', 0.5030157566070557),
 ('thorough', 0.4970399737358093),
 ('systematized', 0.4735804498195648)]

In [31]:
model.most_similar(positive=['psychological'], topn = 5)

[('mental', 0.6324292421340942),
 ('psychologically', 0.6209654211997986),
 ('emotional', 0.5921927690505981),
 ('physiological', 0.5291099548339844),
 ('physical', 0.5243164300918579)]

In [32]:
model.most_similar(positive=['medical'], topn = 5)

[('doctors', 0.6195696592330933),
 ('physician', 0.5994016528129578),
 ('Medical', 0.5921540856361389),
 ('physicians', 0.5749360918998718),
 ('dental', 0.5676089525222778)]

In [33]:
model.most_similar(positive=['anxious'], topn = 5)

[('eager', 0.722114622592926),
 ('fearful', 0.643116295337677),
 ('nervous', 0.6418448686599731),
 ('worried', 0.6220002174377441),
 ('impatient', 0.6138770580291748)]

In [34]:
model.most_similar(positive=['effective'], topn = 5)

[('Effective', 0.6084572076797485),
 ('efficient', 0.5855873823165894),
 ('ineffective', 0.5252822637557983),
 ('efficacious', 0.505646824836731),
 ('economical', 0.4952201247215271)]

##### WordNet

In [35]:
wn.synsets('systematic', wn.ADJ)[0:3]

[Synset('systematic.a.01'), Synset('taxonomic.a.01')]

In [36]:
wn.synsets('psychological', wn.ADJ)[0:3]

[Synset('psychological.s.01'), Synset('psychological.a.02')]

In [37]:
wn.synsets('medical', wn.ADJ)[0:3]

[Synset('medical.a.01'), Synset('medical.a.02'), Synset('aesculapian.a.01')]

In [38]:
wn.synsets('anxious', wn.ADJ)[0:3]

[Synset('anxious.s.01'), Synset('anxious.s.02')]

In [39]:
wn.synsets('effective', wn.ADJ)[0:3]

[Synset('effective.a.01'), Synset('effective.s.02'), Synset('effective.s.03')]

#### Verbs

##### Word2Vec

In [40]:
model.most_similar(positive=['treat'], topn = 5)

[('treating', 0.7725892663002014),
 ('treats', 0.687384843826294),
 ('treated', 0.673922598361969),
 ('Treat', 0.6268018484115601),
 ('treatment', 0.511717677116394)]

In [41]:
model.most_similar(positive=['intervene'], topn = 5)

[('intervened', 0.6794320940971375),
 ('intervention', 0.5795313715934753),
 ('intervening', 0.5529994368553162),
 ('meddle', 0.5135860443115234),
 ('respond', 0.48485830426216125)]

In [42]:
model.most_similar(positive=['study'], topn = 5)

[('studies', 0.7211395502090454),
 ('Study', 0.6836080551147461),
 ('survey', 0.6731963753700256),
 ('researchers', 0.6250616312026978),
 ('research', 0.6238202452659607)]

In [43]:
model.most_similar(positive=['review'], topn = 5)

[('reviewed', 0.6630408763885498),
 ('reviewing', 0.6609559059143066),
 ('reviews', 0.6379557251930237),
 ('evaluation', 0.6035541296005249),
 ('assessment', 0.5333109498023987)]

In [44]:
model.most_similar(positive=['cure'], topn = 5)

[('cures', 0.7962284088134766),
 ('curing', 0.6621212959289551),
 ('cured', 0.6239017844200134),
 ('remedy', 0.5670593976974487),
 ('antidote', 0.5099565982818604)]

##### WordNet

In [45]:
wn.synsets('treat', wn.VERB)[0:3]

[Synset('treat.v.01'), Synset('process.v.01'), Synset('treat.v.03')]

In [46]:
wn.synsets('intervene', wn.VERB)[0:3]

[Synset('intervene.v.01'), Synset('intervene.v.02'), Synset('intervene.v.03')]

In [47]:
wn.synsets('study', wn.VERB)[0:3]

[Synset('analyze.v.01'), Synset('study.v.02'), Synset('study.v.03')]

In [48]:
wn.synsets('review', wn.VERB)[0:3]

[Synset('review.v.01'), Synset('review.v.02'), Synset('review.v.03')]

In [49]:
wn.synsets('cure', wn.VERB)[0:3]

[Synset('bring_around.v.02'), Synset('cure.v.02'), Synset('cure.v.03')]