In [1]:
from sklearn.decomposition import RandomizedPCA
import operator
from sklearn.feature_extraction.text import CountVectorizer
from pyspark.sql import SparkSession
import os
import urllib3 as urllib2
import google_compute_engine
import boto
import gensim 
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
from nltk.stem.snowball import SnowballStemmer
import mpld3
#nltk.download('stopwords')
#nltk.download('punkt')

In [2]:
# Create (or reuse) the SparkSession that backs every DataFrame operation in this notebook.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [3]:
%time pubs = spark.read.json("/home/silas/final_project/pubs_data/s2-corpus-00")

CPU times: user 13.6 ms, sys: 265 µs, total: 13.9 ms
Wall time: 55.5 s


In [4]:
# Show the schema Spark inferred from the JSON, expose the DataFrame to Spark SQL under
# the name "pubs", and preview the abstract column.
pubs.printSchema()
# registerTempTable is deprecated since Spark 2.0; createOrReplaceTempView is its replacement.
pubs.createOrReplaceTempView("pubs")
#spark.sql("select paperAbstract from pubs").show()
pubs.select("paperAbstract").show()

root
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ids: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- doiUrl: string (nullable = true)
 |-- entities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- inCitations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- journalName: string (nullable = true)
 |-- journalPages: string (nullable = true)
 |-- journalVolume: string (nullable = true)
 |-- outCitations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- paperAbstract: string (nullable = true)
 |-- pdfUrls: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pmid: string (nullable = true)
 |-- s2PdfUrl: string (nullable = true)
 |-- s2Url: string (nullable = true)
 |-- sourc

In [5]:
# Peek at the first record to see what a complete row looks like.
pubs.head(1)

[Row(authors=[Row(ids=['2506899'], name='M. V. Ushakov')], doi='10.1023/A:1020916001559', doiUrl='https://doi.org/10.1023/A:1020916001559', entities=['Amphibians', 'Anura', 'Apache Gora', 'Aquatic ecosystem', 'Diazooxonorleucine', 'Habitat', 'Human body', 'Natural Selection', 'Natural Springs', 'Population', 'Rana esculenta', 'Rana temporaria'], id='7e58b926bbbc122edeccb7cb4f7f68ca11480698', inCitations=[], journalName='Russian Journal of Ecology', journalPages='446-451', journalVolume='33', outCitations=[], paperAbstract='The marsh frog is a widespread and flexible species that mainly occupies various aquatic biotopes. In the Lipetsk oblast, these frogs avoid only closed forest water bodies and springs, and their habitats in the Central Russian Upland and the Oka–Don Lowland obviously differ from each other. According to Klimov et al. (1999), the number of these amphibians in the Oka– Don Lowland is greater. The comparison of morphological variation in frogs from these regions shows t

In [6]:
# Draw a seeded 5% sample of the corpus, keep only the three columns used below,
# and cap the result at 1000 rows.
abs_lst = pubs.sample(fraction=0.05, seed=3).select("title","paperAbstract","entities").limit(1000)

In [7]:
# Collect the sampled rows into a pandas DataFrame.
abs_pd_df = abs_lst.toPandas()

In [11]:
# Inspect a single abstract (row label 1).
# Note: some records have an empty paperAbstract; those are filtered out in a later cell.
abs_pd_df.paperAbstract[1]

'This study was undertaken to determine whether there was an association between fine particle (PM₂.₅) levels and daily clinic visits for migraine in Taipei, Taiwan. Daily clinic visits for migraine and ambient air pollution data for Taipei were obtained for the period from 2006-2011. The odds ratio of clinic visits was estimated using a case-crossover approach, controlling for weather variables, day of the week, seasonality, and long-term time trends. Generally, no significant associations between PM₂.₅ levels and migraine visits were observed on cool days. On warm days, however, for the single pollutant model (without adjustment for other pollutants), increased clinic visits for migraine were significantly associated with PM₂.₅ levels, with an interquartile range (IQR) rise associated with a 13% (95% CI = 8%-19%) elevation in number of migraine visits. In bi-pollutant model, PM₂.₅ remained significant after the inclusion of sulfur dioxide (SO₂) or ozone (O₃) on warm days. This study 

In [9]:
# Keep only the records that actually have an abstract.
# The mask is computed once, so the three Series are selected with exactly the same rows;
# null abstracts are excluded as well, because the tokenizers used below require strings.
has_abstract = abs_pd_df.paperAbstract.notnull() & (abs_pd_df.paperAbstract != '')
abstracts = abs_pd_df.loc[has_abstract, 'paperAbstract']
titles = abs_pd_df.loc[has_abstract, 'title']
entities = abs_pd_df.loc[has_abstract, 'entities']

In [None]:
# Positional index (0 .. n-1) for every retained abstract; it becomes the 'rank' column later on.
ranks = list(range(len(abstracts)))

In [None]:
# load nltk's English stopwords as variable called 'stopwords'
# (needs the nltk 'stopwords' corpus; see the commented-out nltk.download calls in the import cell)

stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# load nltk's SnowballStemmer as variable 'stemmer'
# ('stopwords' is already loaded by the preceding cell, so it is not reloaded here)
stemmer = SnowballStemmer("english")

In [None]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    """Split `text` into word tokens and return their stems as a list.

    The text is split into sentences first and every sentence into words, so
    punctuation ends up as separate tokens. Tokens that contain no ASCII letter
    (numeric tokens, raw punctuation) are discarded; the remaining ones are
    stemmed with the module-level `stemmer`.
    """
    has_letter = re.compile('[a-zA-Z]').search
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if has_letter(word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Split `text` into lower-cased word tokens, without stemming.

    Uses the same sentence-then-word tokenization as tokenize_and_stem and the
    same filter: tokens that contain no ASCII letter are dropped.
    """
    has_letter = re.compile('[a-zA-Z]').search
    kept_tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered = word.lower()
            if has_letter(lowered):
                kept_tokens.append(lowered)
    return kept_tokens

In [None]:
def pre_processing_step(text):
    """
    Build the corpus-wide token and stem vocabularies.

    `text` is an iterable of documents (strings). Every document is passed to
    both tokenize_and_stem and tokenize_only, and the per-document results are
    concatenated in document order.

    Returns the tuple (totalvocab_tokenized, totalvocab_stemmed), two flat lists.
    """
    totalvocab_tokenized, totalvocab_stemmed = [], []
    for document in text:
        totalvocab_stemmed += tokenize_and_stem(document)
        totalvocab_tokenized += tokenize_only(document)
    return totalvocab_tokenized, totalvocab_stemmed

In [None]:


# Build the flat token / stem vocabularies over all abstracts with pre_processing_step
# (this cell used to repeat that helper's loop verbatim).
totalvocab_tokenized, totalvocab_stemmed = pre_processing_step(abstracts)

# Lookup table from stem (index) to surface word (column 'words'). Both lists have the same
# length because the two tokenizers apply the same letter filter to the same tokens.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [None]:
# Stem -> surface-word lookup table (same statement as the last line of the preceding cell).
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [None]:
# tf-idf tells how important a word is to a document in a collection or corpus
from sklearn.feature_extraction.text import TfidfVectorizer

# Features are 1- to 3-grams of the stems returned by tokenize_and_stem; max_df / min_df
# bound the share of abstracts a term may appear in (at most 80%, at least 20%).
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(abstracts)

# (number of abstracts, number of retained terms)
print(tfidf_matrix.shape)

In [None]:
# Retained tf-idf terms, in the column order of tfidf_matrix.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version before re-running.
terms = tfidf_vectorizer.get_feature_names()
terms

In [None]:
# Each term is notionally assigned a different dimension and a document is characterised
# by a vector where the value in each dimension corresponds to
# the tf-idf weight of the term in the document.
# Cosine similarity then gives a useful measure of how similar two documents are likely to be
# in terms of their subject matter; 1 - similarity turns it into a pairwise distance matrix.

from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
# Square matrix: one row and one column per abstract.
dist.shape

In [None]:
# kmeans takes the tf-idf matrix and computes clusters/labels and centers

from sklearn.cluster import KMeans

num_clusters = 8

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()



In [None]:
# sklearn.externals.joblib no longer exists in recent scikit-learn releases: prefer the
# standalone joblib package and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Optional persistence of the fitted model (only load pickles you created yourself):
# joblib.dump(km,  'doc_cluster.pkl')
#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist() # one cluster label per document

In [None]:
features = { 'title': titles, 'rank': ranks, 'abstracts': abstracts, 'clusters': clusters, 'entities': entities }

# Only the listed columns end up in the frame ('abstracts' is not among them).
frame = pd.DataFrame(features, columns = ['rank', 'title', 'clusters', 'entities'])

# Number of documents per cluster. Printed explicitly: as a bare expression in the middle
# of the cell the result was computed and then silently discarded.
print(frame['clusters'].value_counts())

# Index the frame by cluster label (index name 'cluster') while keeping the 'clusters'
# column, so that frame.loc[k] selects the documents of cluster k.
frame['cluster'] = frame['clusters']
frame = frame.set_index('cluster')
frame[['rank', 'title', 'clusters', 'entities']].head()

In [None]:
# Derive a few naming terms for every cluster from the word frequencies of its paper titles.
# final_title_lst[k] holds the terms chosen for cluster label k.
final_title_lst = []
for cluster_id in range(len(set(frame.clusters))):
    cluster_titles = list(frame[frame.clusters == cluster_id].title)
    vectorizer = CountVectorizer()
    term_counts = np.ravel(vectorizer.fit_transform(cluster_titles).sum(axis=0))
    # vocabulary terms ordered by column index, so they line up with term_counts
    ordered_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    # (term, count) pairs, most frequent first
    by_frequency = sorted(zip(ordered_terms, term_counts), key=operator.itemgetter(1), reverse=True)
    # drop stopwords and keep terms that are neither rare nor too frequent (3 to 5 occurrences)
    candidates = [term for term, count in by_frequency
                  if term not in stopwords and count > 2 and count < 6]
    final_title_lst.append(candidates[2:6])  # select n terms for cluster titles

final_title_lst

In [None]:
# Mean rank of the documents in each cluster.
grouped = frame.groupby('clusters')['rank']

grouped.mean()

In [None]:
# Sanity check: map the third tf-idf term back from its stem(s) to a surface word via vocab_frame.
# The former .encode('utf-8', 'ignore') is dropped: on Python 3 it turns the word into bytes.
vocab_frame.loc[terms[2].split(' ')].values.tolist()[0][0]

In [None]:
from __future__ import print_function

# Number of highest-weighted centroid terms listed per cluster
# (same value the original slice used: num_clusters + 1).
n_top_terms = num_clusters + 1

print("Top terms per cluster:")
print()
# For every centroid: feature indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster {} words:".format(i), end='')
    for ind in order_centroids[i, :n_top_terms]:
        # Map the stemmed term back to a surface word through vocab_frame.
        # No .encode() here: on Python 3 it would print the bytes repr (b'...').
        print(' {}'.format(vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]), end=',')
    print("")
    print()
    print("Cluster {} titles:".format(i), end='')
    # frame.loc[[i], 'title'] always yields a Series, even for a cluster with a single
    # document (frame.loc[i] would collapse to one row there and .head() would fail).
    for title in frame.loc[[i], 'title'].head().values.tolist():
        print(' %s,' % title, end='')
    print("\n")

In [None]:
# Presentation columns: a 1-based rank and a capitalised copy of the title column.
frame['Rank'] = frame['rank'] + 1
frame['Title'] = frame['title']

In [None]:
# print(frame[['Rank', 'Title']].loc[frame['clusters'] == 1].to_html(index=False))

In [None]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# first / second embedding coordinate of every document
x_mds, y_mds = pos[:, 0], pos[:, 1]

In [None]:
from sklearn.decomposition import PCA
# NOTE(review): RandomizedPCA is not used in this cell and is absent from newer
# scikit-learn releases — confirm whether this import is still needed.
from sklearn.decomposition import RandomizedPCA

# Alternative 2-D embedding: PCA fitted on the rows of the distance matrix.
pca = PCA(n_components=2)
pos_pca = pca.fit_transform(dist)
x_pca, y_pca = pos_pca[:,0],pos_pca[:,1]
# Optional diagnostics (explained variance of the two components):
# print(pca.explained_variance_ratio_)  
# print(pca.singular_values_)  
#plt.plot(np.cumsum(pca.explained_variance_ratio_))
#plt.xlabel('n components')
#plt.ylabel('cumulative variance');

In [None]:
#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    """Return, as a list, the whitespace-separated words of `text` that NLTK's
    part-of-speech tagger does not tag as NNP or NNPS."""
    kept_words = []
    for word, tag in pos_tag(text.split()):
        if tag not in ('NNP', 'NNPS'):
            kept_words.append(word)
    return kept_words

In [None]:
# Plot colour (hex string) for each cluster label; ten entries, labels 0-9.
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e',
                  5: '#d85f02', 6: '#6570b3', 7: '#e6298a', 8: '#56a61e', 9: '#e9a61e'}

In [None]:
# Legend text per cluster: the cluster's title terms joined into one comma-separated string,
# keyed by the position of the term list in final_title_lst.
dicts = {position: ", ".join(term_list) for position, term_list in enumerate(final_title_lst)}

In [None]:
# Name used by the plotting cells for the cluster-label -> legend-text mapping.
cluster_names = dicts

In [None]:
# Hand-written cluster names, disabled by wrapping them in a string literal:
# nothing below is executed, the cell merely evaluates to this string.
'''
#set up cluster names using a dict
cluster_names = {0: 'Family, home, war', 
                 1: 'Police, killed, murders', 
                 2: 'Father, New York, brothers', 
                 3: 'Dance, singing, love', 
                 4: 'Killed, soldiers, captain'}
'''

In [None]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
# choose which 2-D embedding the plots below use: MDS (active) or PCA
x,y=x_mds,y_mds
# x,y=x_pca,y_pca     # uncomment for PCA transformed

In [None]:
def plot_clusters(x, y, clusters, titles, cluster_names, cluster_colors,
                  figsize=(17, 15), max_title_chars=20):
    """Scatter-plot a 2-D document embedding, coloured and labelled by cluster.

    This replaces a dangling `def` statement that made the cell a SyntaxError.

    Parameters
    ----------
    x, y : sequence of float
        Coordinates of every document (e.g. the MDS or PCA projection).
    clusters : sequence
        Cluster label of every document.
    titles : sequence of str
        Document titles; their first `max_title_chars` characters annotate the points.
    cluster_names, cluster_colors : dict
        Map a cluster label to its legend text / plot colour.
    figsize : tuple
        Figure size handed to plt.subplots.
    max_title_chars : int
        Number of title characters printed next to each point.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes that were drawn; call
        fig.savefig('clusters_small_noaxes.png', dpi=200) on the result to save the plot.
    """
    # one row per document: coordinates, cluster label and title
    df = pd.DataFrame(dict(x=x, y=y, label=clusters, title=titles))

    fig, ax = plt.subplots(figsize=figsize)
    ax.margins(0.05)  # 5% padding around the autoscaled data
    ax.set_aspect('auto')
    # hide ticks and tick labels on both axes
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, labelleft=False)

    # one layer per cluster so that each gets its own colour and legend entry
    for name, group in df.groupby('label'):
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                label=cluster_names[name], color=cluster_colors[name], mec='none')

    ax.legend(numpoints=1)  # show legend with only 1 point

    # annotate every point with the beginning of its title (missing titles become '')
    for px, py, title in zip(df['x'], df['y'], df['title']):
        ax.text(px, py, (title or '')[:max_title_chars], size=8)

    plt.show()
    return fig, ax

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=x, y=y, label=clusters, title=titles))

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 15)) # set size
ax.margins(0.05) # adds 5% padding to the autoscaling

# Axis styling does not depend on the cluster, so it is applied once
# instead of being repeated on every pass of the loop below.
ax.set_aspect('auto')
ax.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False) # tick labels along the bottom edge are off
ax.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left=False,        # ticks along the left edge are off
    labelleft=False)   # tick labels along the left edge are off

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=cluster_names[name], color=cluster_colors[name], mec='none')

ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the first 20 characters of the paper title
#(columns are walked in parallel instead of building a row Series per point with df.iloc)
for px, py, title in zip(df['x'], df['y'], df['title']):
    ax.text(px, py, title[:20], size=8)



plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)

In [None]:
# Close the current matplotlib figure.
plt.close()

In [None]:
#define custom toolbar location
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure

    The JavaScript below registers an mpld3 plugin named "toptoolbar" whose draw()
    draws the figure toolbar once, sets its "x" attribute to 150 and its "y"
    attribute to 400, and then replaces the toolbar's draw function with a no-op
    so it is not drawn again.
    """

    # Client-side implementation of the plugin (runs in the browser, not in Python).
    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        # "type" must match the name passed to mpld3.register_plugin in JAVASCRIPT.
        self.dict_ = {"type": "toptoolbar"}

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=x, y=y, label=clusters, title=titles))

#group by cluster, with options to sample
# n = 300  # number of samples for plotting
#groups = df.sample(n).groupby('label')
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }
"""

# Plot
fig, ax = plt.subplots(figsize=(14,10)) #set plot size
ax.margins(0.03) # adds 3% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, label=cluster_names[name], mec='none', color=cluster_colors[name])
    labels = list(group.title)

    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect this cluster's tooltip to fig
    mpld3.plugins.connect(fig, tooltip)

# The toolbar plugin and the axis styling are figure-wide, so they are applied once
# here instead of once per cluster inside the loop.
mpld3.plugins.connect(fig, TopToolbar())

ax.set_aspect('auto')

#set tick marks as blank
ax.axes.get_xaxis().set_ticks([])
ax.axes.get_yaxis().set_ticks([])

#set axis as blank
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)


ax.legend(numpoints=1,loc='lower left') #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
# html = mpld3.fig_to_html(fig)
#print(html)

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram

# number of abstracts to structure, capped at the number of documents available
# (sampling without replacement cannot draw more than that)
n = min(50, dist.shape[0])
# seeded generator so the same documents are sampled on every run
idx = np.random.RandomState(1).choice(dist.shape[0], size=n, replace=False)

# NOTE: ward() is given a 2-D slice (n rows, one column per document). scipy treats a 2-D
# input as n observation vectors, i.e. every sampled document is described by its distances
# to all documents; it is not interpreted as a condensed pre-computed distance matrix.
linkage_matrix = ward(dist[idx])

# Leaf k of the dendrogram is document idx[k], so the labels are picked with idx.
# (The previous labels=list(titles)[:100] had the wrong length for n leaves and named the
# first documents instead of the sampled ones.)
all_titles = list(titles)
leaf_labels = [all_titles[k] for k in idx]

fig, ax = plt.subplots(figsize=(15, 20)) # set size
# draw into the axes created above instead of overwriting `ax` with dendrogram's result dict
dendro = dendrogram(linkage_matrix, orientation="right", labels=leaf_labels, ax=ax);

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout() #show plot with tight layout

# write the figure to ward_clusters.png (comment this line out to skip saving)
plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters

In [None]:
#strip any proper names from a text...unfortunately right now this is yanking the first word from a sentence too.
import string
def strip_proppers(text):
    """Rebuild `text` from only those word tokens for which str.islower() is true.

    Tokens are produced sentence by sentence with NLTK. Kept tokens are joined
    with a single leading space each, except tokens that start with an
    apostrophe or are found in string.punctuation, which are attached directly
    to the preceding token. The result is stripped of surrounding whitespace.
    """
    lowercase_tokens = [
        token
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if token.islower()
    ]
    pieces = []
    for token in lowercase_tokens:
        if token.startswith("'") or token in string.punctuation:
            pieces.append(token)
        else:
            pieces.append(" " + token)
    return "".join(pieces).strip()

from nltk.tag import pos_tag

def strip_proppers_POS(text):
    """Return the whitespace-separated words of `text` whose NLTK part-of-speech
    tag is neither NNP nor NNPS, as a list.

    NOTE: this re-defines strip_proppers_POS, which an earlier cell of the
    notebook already defines with the same logic.
    """
    tagged = pos_tag(text.split()) #use NLTK's part of speech tagger
    non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
    return non_propernouns

In [None]:
# LDA
# https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

from gensim import corpora, models, similarities 

#remove proper names (strip_proppers keeps only tokens for which str.islower() is true)
preprocess = [strip_proppers(doc) for doc in abstracts]

# one list of stems per abstract
%time tokenized_text = [tokenize_and_stem(text) for text in preprocess]

# drop stopwords from every stem list
# NOTE(review): the tokens are already stemmed here while `stopwords` holds unstemmed
# words, so stopwords whose stem differs from the word itself are not removed — confirm
# this is intended.
%time texts = [[word for word in text if word not in stopwords] for text in tokenized_text]



In [None]:
dictionary = corpora.Dictionary(texts)

dictionary.filter_extremes(no_below=1, no_above=0.8)
corpus = [dictionary.doc2bow(text) for text in texts]


%time lda = models.LdaModel(corpus, num_topics=20, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

lda.print_topics(20, num_words=20)

lda.show_topics(formatted=False, num_words=20)

lda.get_topics().shape

topics_matrix = lda.show_topics(formatted=False, num_words=20)

### Prediction

In [None]:
# kmeans requires turning the test doc to tfidf_matrix
# TODO: this line only references the bound method and predicts nothing. A new document
# presumably has to go through tfidf_vectorizer.transform(...) first, with the result
# passed to km.predict(...) — confirm and complete.
km.predict