## Topic Modelling with the GENSIM Module
https://radimrehurek.com/gensim/

### Read in Document
Using a sample file of tweets, extracting only the tweet content

In [99]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#references
#https://radimrehurek.com/gensim
#http://chdoig.github.io/pygotham-topic-modeling

import csv
import numpy as np
import matplotlib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
import pprint as pp
import re

from gensim import corpora, models, similarities
from collections import defaultdict
from collections import OrderedDict

#enable logging    
import logging
# send log records at INFO level and above to a file, stamped with time and level
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(filename='python_log.txt', level=logging.INFO, format=LOG_FORMAT)
    
    
    
#sample CSV file containing tweet content - every row is read as a dict keyed by column name
with open('vaccination_sentiment_final.csv', 'rb') as csvfile:
    vaccination_sentiment_final = csv.DictReader(csvfile)
    vaccination_list = list(vaccination_sentiment_final)

#keep only the tweet content, held in the 'blog_text' column
vaccination_text = [row['blog_text'] for row in vaccination_list]

#inspect sample (items 1 to 9)
for sample_tweet in vaccination_text[1:10]:
    pp.pprint(sample_tweet)
    
    

'#         gatesfoundation  a medical miracle  dr  salks polio vaccine turns      #vaccineswork pic twitter com ekdxlhjgpx'
'#         onecampaign  photos   kids in rwanda receive vaccines  thanks to  gavi  #vaccineswork pic twitter com wgk  nbths'
'#  # #antivaccination movement gains traction in #turkey'
'#  minutes truth #measlestruth information from nys doh   healthnygov  ny aap  childhealthusa  healthychildren'
'#  minutes truth #measlestruth pic twitter com  yoqgjw jj'
'# ###vaccineswork  pic twitter com ecyzvg yx'
'# doses of #hepatitisbvax~  #over months~gives #lifelongprotection       #worldhepatitisday #vaccineswork pic twitter com uw wfo lzm'
'# in  kids is on #autism spectrum  thats a lot of #activistparents   elizabethmccra   kath cats  doritmi  justthevax  sciencecomic #cdcfraud'
'# in  kids isnt enough for you  keep calling every #vaxinjury #anectodal  kolyin  emccra   smcgregorking  doritmi  creativecarissa #sb'


### Basic word frequency

In [100]:
# remove common words and tokenize (by word) - add to stopwords as required
stoplist = set('for a of the and to in i are com is that it this www pic rt'.split())
texts = [[word for word in tweet.lower().split() if word not in stoplist]
          for tweet in vaccination_text]

# count how often each token occurs across all tweets
# NOTE: this only counts - no tokens are removed from `texts`, rare ones included
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# re-order the counts from most to least frequent
frequency = OrderedDict(sorted(frequency.items(), key=lambda t: t[1], reverse=True))

# inspect top 10 - items() is wrapped in list() because on Python 3 it returns a
# view that cannot be sliced; on Python 2 the extra list() is a harmless copy
pp.pprint(list(frequency.items())[0:10])

[('#vaccineswork', 4819),
 ('#sb', 2806),
 ('vaccine', 2763),
 ('you', 2620),
 ('on', 2232),
 ('vaccines', 2040),
 ('more', 1867),
 ('not', 1781),
 ('twitter', 1715),
 ('be', 1684)]


### Tokenise and Serialise

In [101]:
# build the gensim dictionary from the tokenised tweets
dict_tweets = corpora.Dictionary(texts)

# persist the dictionary to disk
dict_tweets.save('vaccination_tweet_dictionary.dict')

# print the dictionary summary
print(dict_tweets)

# uncomment to list the id assigned to each token
#print(dict_tweets.token2id)

# turn a new tweet into a 'Bag of Words' vector - only words already present in the dictionary are counted
new_tweet = 'in  kids is on #autism spectrum  thats thats a lot lot lot of #activistparents   elizabethmccra   kath cats  doritmi  justthevax  sciencecomic'
new_tweet_tokens = new_tweet.lower().split()
new_tweet_vec = dict_tweets.doc2bow(new_tweet_tokens)
pp.pprint(new_tweet_vec)


# convert every tokenised tweet to a bag-of-words vector, then serialise the corpus in Matrix Market format
corpus_tweets = list(map(dict_tweets.doc2bow, texts))

corpora.MmCorpus.serialize('corpus_tweets.mm', corpus_tweets)

# inspect a slice of the vectorised corpus
print(corpus_tweets[1:10])

Dictionary(22551 unique tokens: [u'binnsteryorkie', u'gag', u'woods', u'spiders', u'francesca']...)
[(16, 1),
 (56, 1),
 (57, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 3),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 2)]
[[(0, 1), (2, 1), (3, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(0, 1), (2, 1), (3, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)], [(0, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)], [(0, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(0, 1), (3, 1), (35, 1), (38, 1), (40, 1), (42, 1), (43, 1)], [(0, 1), (3, 1), (44, 1), (45, 1), (46, 1)], [(0, 1), (2, 1), (3, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)], [(0, 1), (16, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1)], [(0, 1), (16, 1), (59, 1),

### Corpus Object - Memory-friendly

In [103]:
# memory friendly corpus

class MyCorpus(object):
    """Stream the tweet corpus from disk, one bag-of-words vector at a time.

    Each iteration re-opens the CSV file and yields the bag-of-words vector of
    the tweet text of every row, so the full corpus never has to sit in memory
    and the object can be iterated more than once.

    Parameters
    ----------
    path : str
        CSV file to read. Defaults to the sample file used by this notebook.
    dictionary : object with a ``doc2bow(tokens)`` method, optional
        Dictionary used to vectorise each tweet. When omitted, the notebook's
        ``dict_tweets`` is looked up at iteration time.
    text_field : str
        Name of the CSV column that holds the tweet text.

    Raises
    ------
    KeyError
        During iteration, if a row has no ``text_field`` column.
    """

    def __init__(self, path='vaccination_sentiment_final.csv',
                 dictionary=None, text_field='blog_text'):
        self.path = path
        self.dictionary = dictionary
        self.text_field = text_field

    def __iter__(self):
        # resolve the dictionary lazily so MyCorpus() can be created before dict_tweets exists
        dictionary = dict_tweets if self.dictionary is None else self.dictionary
        # 'with' guarantees the file handle is released once iteration finishes
        with open(self.path) as csvfile:
            # parse the CSV properly: one document per row, using only the tweet
            # text (not the header line or the other columns), so the streamed
            # vectors match the ones built in memory from the same column
            for row in csv.DictReader(csvfile):
                tweet = row[self.text_field] or ''  # short rows give None for missing cells
                yield dictionary.doc2bow(tweet.lower().split())


corpus_tweets_stream = MyCorpus()

# stream and iterate corpus
#for vector in corpus_tweets_stream: # load one vector into memory at a time
#    print(vector)


### Transformations
- Term Frequency-Inverse Document Frequency (TF-IDF)
- Latent Semantic Indexing (LSI)
- Latent Dirichlet Allocation (LDA)

In [104]:
# train the TF-IDF model on the streamed corpus
tfidf = models.TfidfModel(corpus_tweets_stream)

# apply the model to the in-memory bag-of-words corpus
# a single new 'bag of words' vector can be weighted the same way, eg,
# new_tweet = 'in  kids is on #autism spectrum  thats thats a lot lot lot of #activistparents   elizabethmccra   kath cats  doritmi  justthevax  sciencecomic'
# new_tweet_vec = dict_tweets.doc2bow(new_tweet.lower().split())
corpus_tfidf = tfidf[corpus_tweets]

# print tf-idf weights for a selection
#for t in corpus_tfidf[1:10]:
#    pp.pprint(t)


# LATENT SEMANTIC INDEXING LSI
# train a 15-topic LSI model on the TF-IDF weighted corpus
lsi = models.LsiModel(corpus_tfidf,
                      id2word=dict_tweets,
                      num_topics=15) # initialize an LSI transformation

corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi


# LATENT DIRICHLET ALLOCATION LDA
# train a 100-topic LDA model
# NOTE(review): this trains LDA on TF-IDF weights rather than raw counts - confirm that is intended
lda = models.LdaModel(corpus_tfidf,
                          id2word=dict_tweets,
                          num_topics=100)


corpus_lda = lda[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->lda

# MODEL PERSISTENCE - save it!
# (this line used to be bare text, which made the whole cell a SyntaxError)
lsi.save('model.lsi') # same for tfidf, lda, ...
lsi = models.LsiModel.load('model.lsi')



### Inspect Topics Generated by Models

In [107]:
#inspect topics

#LSI - print words in each topic + weights
#lsi.print_topics(num_topics=15, 
#                 num_words=20)

#LDA - show 20 words + weights for each of the 100 topics (the call's value is the cell output)
lda.show_topics(num_topics=100,
                num_words=20)

#view weights for each document
#for doc in corpus_lsi[1:10]: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#    print(doc)


[(0,
  u'0.035*healthy + 0.024*pathtweets + 0.022*role + 0.019*vaxcalc + 0.014*update + 0.014*tom + 0.014*police + 0.012*hide + 0.012*kirstiealley + 0.011*#informedconsent + 0.010*#anonymous + 0.009*melissa + 0.009*va + 0.009*jimcarrey + 0.007*updates + 0.007*keep + 0.006*#autismspeaks + 0.006*gateway + 0.006*scenes + 0.006*acting'),
 (1,
  u'0.014*todays + 0.009*parents + 0.009*should + 0.008*politicians + 0.008*locations + 0.008*we + 0.008*vaccine + 0.008*risks + 0.008*saved + 0.007*#vaccineswork + 0.007*so + 0.007*nbc + 0.007*times + 0.007*ago + 0.007*these + 0.006*because + 0.006*#sb + 0.006*you + 0.006*demand + 0.006*vaccination'),
 (2,
  u'0.038*article + 0.038*x + 0.033*here + 0.033*p + 0.031*wp + 0.030*ar + 0.028*me + 0.028*http + 0.027*see + 0.026*~ + 0.014*#cnndebate + 0.013*sign + 0.013*petition + 0.013*true + 0.011*gets + 0.009*~read + 0.009*last + 0.008*links + 0.007*please + 0.007*guest'),
 (3,
  u'0.039*#niam + 0.026*proof + 0.026*needed + 0.021*post + 0.020*strong + 0.0

### Similarity
- assess new documents

In [None]:

#create index for the corpus you want to compare similarity against - using LSI model here
#the LSI model was trained on TF-IDF weighted vectors, so the corpus is passed through
#the same bow->tfidf->lsi chain before it is indexed
index = similarities.MatrixSimilarity(lsi[corpus_tfidf]) # transform corpus to LSI space and index it

#persistency of index created
index.save('index_tweets.index')
index = similarities.MatrixSimilarity.load('index_tweets.index')

#project the new BOW tokenised document into the same LSI space as the index
new_tweet_vec_lsi = lsi[tfidf[new_tweet_vec]]

#query with the LSI vector - the raw BOW vector lives in a different space from the index
sims = index[new_tweet_vec_lsi] # perform a similarity query against the corpus

#rank (document_number, document_similarity) 2-tuples, most similar first, and show
#only the top 10 rather than dumping a score for every document in the corpus
sims_ranked = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)
print(sims_ranked[:10])