In [None]:
import numpy
import spacy
from spacy.parts_of_speech import DET, ADP, CONJ, PUNCT, SPACE
from gensim import corpora, models, similarities
import time
import datetime
# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib as mpl
sns.set_style('whitegrid')
sns.set_context('poster')
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# NLP 

In [None]:
# Project-local iterator over the reference library; the loading cell unpacks ten fields per item.
from mendeley_import import iterate_db
# English spaCy pipeline used for tokenising, POS-tagging and lemmatising abstracts and titles.
# NOTE(review): `spacy.en.English()` looks like the spaCy 1.x entry point — confirm the pinned spaCy version.
nlp = spacy.en.English()

In [None]:
# Parts of speech removed before modelling: determiners, adpositions, conjunctions,
# punctuation and whitespace tokens.
somePhrases = [DET, ADP, CONJ, PUNCT, SPACE]
# Day the backup was restored: entries "added" on this date fall back to their
# 'modified' timestamp when that one is earlier.
feb6 = datetime.date(2015, 2, 6)

id_list, abstract_list, title_list, added_list = [], [], [], []
# `doc_id` instead of `id`, so the builtin `id` is not shadowed for the rest of the notebook.
for doc_id, abstract, title, added, modified, doi, arxivId, citationKey, pmid, year in iterate_db():
    if not abstract:
        continue  # skip entries without an abstract

    # Canonical (lemma) form, with stop words and the unwanted parts of speech filtered out.
    parsed_abstract = [t.lemma_ for t in nlp(abstract)
                       if t.pos not in somePhrases and not t.is_stop]

    # Titles keep their stop words. An entry without a title gets an empty token list;
    # previously `parsed_title` kept the value of the preceding entry (or was undefined
    # on the very first one).
    parsed_title = [t.lemma_ for t in nlp(title) if t.pos not in somePhrases] if title else []

    # Timestamps are in milliseconds; // 1000 converts them to seconds for `fromtimestamp`.
    current_added = datetime.date.fromtimestamp(added // 1000)
    current_mod = datetime.date.fromtimestamp(modified // 1000)
    timestamp = modified if current_added == feb6 and current_mod < current_added else added

    id_list.append(doc_id)
    abstract_list.append(parsed_abstract)
    title_list.append(parsed_title)
    added_list.append(timestamp)

# The documents used for topic modelling are the lemmatised abstracts.
docs = abstract_list

In [None]:
# Sanity check: how many entries survived the abstract filter.
n_papers = len(docs)
print("%d papers loaded" % n_papers)

## Plotting the time evolution of paper counts

In [None]:
%matplotlib

time_array = np.array(added_list)

q,w = np.histogram(time_array, bins=72)
plt.plot(w[1:],q, 'k')
plt.xlim(min(time_array), max(time_array))
plt.ylim(0,200)
locs,xticklabels = zip(*[(datetime.datetime(_,1,1).timestamp()*1000, _) for _ in [2011,2012,2013,2014,2015,2016,2017]])
plt.xticks(locs, xticklabels)
plt.xlabel('Time')
plt.ylabel('#papers')

In [None]:
"apply bigrams"
# Learn frequent bigrams from the lemmatised abstracts and add them to each document
# alongside the original unigrams.
#
# `docs` is rebuilt from the untouched `abstract_list` instead of being extended in
# place, so re-running this cell yields the same result rather than appending the
# same bigrams a second time.
bigram = models.Phrases(abstract_list, min_count=5)
docs = [
    doc + [token for token in bigram[doc] if '_' in token]  # '_' marks a joined bigram
    for doc in abstract_list
]

# Building the dictionary/corpus

In [None]:
# Build the dictionary (the feature space) from the tokenised documents.
dictionary = corpora.Dictionary(docs)

# Inspect the most widespread tokens before filtering.
# `dictionary.dfs` maps token id -> number of documents containing that token
# (document frequency, not the total word count).
freq = {token_id: n_docs for token_id, n_docs in dictionary.dfs.items() if n_docs > 200}
# Tokens present in more than 25% of the documents, i.e. the ones removed by
# `filter_extremes(no_above=0.25)`.
print([(dictionary[k], v) for k, v in freq.items() if v / dictionary.num_docs > 0.25])

In [None]:
# Remove rare and common tokens: keep those found in at least 2 documents and in at most
# 25% of all documents.
dictionary.filter_extremes(no_below=2, no_above=0.25)

## BoW transform
corpus = [dictionary.doc2bow(doc) for doc in docs]
corpora.MmCorpus.serialize('corpus_abstracts.mm', corpus)

# Print a summary only: `corpus` is a plain list here, so printing it would dump every
# bag-of-words vector into the cell output.
print("%d documents, %d unique tokens" % (len(corpus), len(dictionary)))

# LDA

In [None]:
# Training parameters for the LDA model.
num_topics = 15
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Re-open the serialised bag-of-words corpus from disk and show its summary.
corpus = corpora.MmCorpus('corpus_abstracts.mm')
print(corpus)

In [None]:
"""
LDA
"""
# Build the id -> word mapping handed to the model.
temp = dictionary[0]  # Accessing one entry is only there to "load" `id2token`.
id2word = dictionary.id2token

# Fixed seed so that a fresh Restart & Run All reproduces the same topics.
lda_random_state = 0

model = models.LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                       alpha='auto', eta='auto',
                       iterations=iterations, num_topics=num_topics,
                       passes=passes, eval_every=eval_every,
                       random_state=lda_random_state)

# Evaluate the topics: each entry of `top_topics` is a (topic, coherence) pair.
top_topics = model.top_topics(corpus, num_words=20)

# Average topic coherence is the sum of the topic coherences divided by the number of
# topics actually returned.
avg_topic_coherence = sum(t[1] for t in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

In [None]:
# Persist the trained model, the corpus object and the dictionary, in that order.
# NOTE(review): the corpus content itself is already in 'corpus_abstracts.mm'; confirm
# whether 'myCorpus.pkl' is needed at all, since the load cell does not read it.
for artifact, path in ((model, 'myLDA.pkl'),
                       (corpus, 'myCorpus.pkl'),
                       (dictionary, 'myDict.pkl')):
    artifact.save(path)

In [None]:
# Restore the saved state so the cells below can run without retraining.
# The corpus is re-opened from the Matrix Market file, not from 'myCorpus.pkl'.
dictionary = corpora.Dictionary.load('myDict.pkl')
corpus = corpora.MmCorpus('corpus_abstracts.mm')
model = models.LdaModel.load('myLDA.pkl', mmap='r')  # memory-map the model arrays read-only

## Visualization

In [None]:
# Build the interactive pyLDAvis view of the trained model; `Q` is rendered by the next cell.
# NOTE(review): newer pyLDAvis releases expose this module as `pyLDAvis.gensim_models` —
# confirm against the installed version.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
Q = pyLDAvis.gensim.prepare(model, corpus, dictionary)
# ldatopics = model.show_topics(formatted=False)

In [None]:
# Render the prepared pyLDAvis data as this cell's output.
pyLDAvis.display(Q )

## Topics over time

In [None]:
"""
using the timestamp, look at the prevalance of each topic over time
"""
# Split the observed time range into `n_bins` equal-width bins and accumulate, per bin,
# the topic weights of all documents whose timestamp falls into it.
n_bins = 10
added_array = np.asarray(added_list)

# n_bins + 1 edges delimit n_bins intervals. `time_bins` holds the left edge of each bin
# and is the x-axis used by the plotting cells.
bin_edges = np.linspace(added_array.min(), added_array.max(), n_bins + 1)
time_bins = bin_edges[:-1]

# Assign every document to exactly one bin: [edge_k, edge_k+1), with the last bin closed
# on the right so the newest document is included. Doing this with searchsorted avoids
# both a degenerate final bin and double counting caused by float rounding at the edges.
bin_of_doc = np.clip(np.searchsorted(bin_edges, added_array, side='right') - 1, 0, n_bins - 1)

topics_over_time = np.zeros((num_topics, n_bins))
docs_per_time = np.zeros(n_bins)

for i in range(n_bins):
    relevant_doc_ix = np.flatnonzero(bin_of_doc == i)
    n_docs = len(relevant_doc_ix)

    print("time %d: #docs: %d" % (i, n_docs))

    # Index the token lists directly: converting the ragged `docs` list to a NumPy array
    # fails on recent NumPy versions.
    for doc_ix in relevant_doc_ix:
        bow = dictionary.doc2bow(docs[doc_ix])
        # get_document_topics returns (topic_id, probability) pairs for this document
        for topic_id, prob in model.get_document_topics(bow):
            topics_over_time[topic_id, i] += prob

    docs_per_time[i] = n_docs

# Overall share of each topic, summed over all time bins.
marginals_topic = topics_over_time.sum(1) / topics_over_time.sum()

In [None]:
# Mean topic weight per document in every time bin, shape (n_bins, num_topics).
# np.maximum(..., 1) only guards empty bins against division by zero; adding 1 to every
# bin would bias all values low.
normed_topic_per_time = (topics_over_time / np.maximum(docs_per_time, 1)).T

# One x tick at the start of each calendar year (timestamps are in milliseconds).
xticklabels = (2011, 2012, 2013, 2014, 2015, 2016, 2017)
locs = tuple(datetime.datetime(year, 1, 1).timestamp() * 1000 for year in xticklabels)

# Five topics per panel, two panels per row; the grid grows with the number of topics.
topics_per_panel = 5
n_panels = -(-num_topics // topics_per_panel)  # ceiling division
n_rows = max(2, -(-n_panels // 2))

plt.figure()
for panel in range(n_panels):
    plt.subplot(n_rows, 2, panel + 1)
    first_topic = panel * topics_per_panel
    for topic_id in range(first_topic, min(first_topic + topics_per_panel, num_topics)):
        plt.plot(time_bins, normed_topic_per_time[:, topic_id], label='topic %d' % topic_id)

    plt.ylim([0, 1])
    plt.xticks(locs, xticklabels)
    plt.xlabel('Time')
    plt.ylabel('#papers/topic')
    # Each panel gets its own legend, labelled with the real topic ids.
    plt.legend()

plt.show()

In [None]:
# Topic ids ordered by marginal weight, most probable first; the plotting cell below
# draws the leading entries.
ix_sort = np.argsort(marginals_topic)[::-1].copy()

In [None]:
# Eight colours sampled evenly from the 'Accent' colormap, one per plotted topic.
cmap = list(map(plt.cm.Accent, np.linspace(0, 1, 8)))

In [None]:
# List the installed matplotlib styles, then switch to the dark theme.
print(plt.style.available)
plt.style.use('dark_background')
# Set after `style.use`, which would otherwise overwrite this value: thinner grid lines.
mpl.rcParams['grid.linewidth'] = 0.3

In [None]:
# Plot the most prevalent topics over time, one colour per topic, and label each line
# with the topic's four highest-weighted words.
locs, xticklabels = zip(*[(datetime.datetime(year, 1, 1).timestamp() * 1000, year)
                          for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]])

leg = []
# zip stops at the shorter sequence, so this also works when the model has fewer topics
# than there are colours in `cmap`.
for color, current_topicID in zip(cmap, ix_sort):
    plt.plot(time_bins, normed_topic_per_time[:, current_topicID], color=color)
    leg.append("-".join(word for word, _ in model.show_topic(current_topicID, topn=4)))

# Axis decoration is identical for every line, so it is applied once after plotting.
plt.xticks(locs, xticklabels)
plt.xlabel('Time')
plt.ylabel('P(topic)')
plt.legend(leg)
plt.ylim([0, 0.6])
plt.show()

In [None]:
# Topic weights in the first time bin (one value per topic).
normed_topic_per_time[0]

In [None]:
# Inspect the 100 highest-weighted words of topic 4.
model.show_topic(4,topn=100)

# Dynamic topic model

In [None]:
from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus

In [None]:
# Number of documents in each time bin as plain Python ints, the format LdaSeqModel
# takes for `time_slice`.
time_slice = [int(n_docs) for n_docs in docs_per_time]

In [None]:
# LdaSeqModel reads the corpus sequentially and treats the first time_slice[0] documents
# as slice 0, the next time_slice[1] as slice 1, and so on. The stored corpus is in
# database order, so it is reordered chronologically first (stable sort keeps ties in
# their original order).
bows = list(corpus)
if len(bows) != len(added_list) or sum(time_slice) != len(bows):
    raise ValueError(
        "time_slice must account for every document exactly once "
        "(%d documents, %d timestamps, %d in time_slice)"
        % (len(bows), len(added_list), sum(time_slice))
    )
chrono_order = np.argsort(added_list, kind='stable')
corpus_by_time = [bows[ix] for ix in chrono_order]

# `lda_model=model` is no longer passed: with the default initialize='gensim' a fresh LDA
# model is trained internally and the supplied one is ignored. To start from the trained
# model instead, pass initialize='ldamodel', lda_model=model and
# num_topics=model.num_topics (the topic counts have to match).
# NOTE(review): time slices containing zero documents are passed through unchanged —
# confirm LdaSeqModel handles them as intended.
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus_by_time, id2word=dictionary,
                                 time_slice=time_slice, num_topics=5)