In [5]:
import sys, os
import re, nltk
from nltk import pos_tag
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
import sklearn, gensim
import Pyro4
from sklearn.decomposition import PCA
from gensim.corpora import Dictionary
sys.path.append(os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/src/')
os.chdir(os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/data/')
from utils import preprocess_for_bow

In [6]:
data = preprocess_for_bow('data.csv')
text_data = data['text']
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokenized_data = [[token.strip() for token in tokenizer.tokenize(text)] for text in text_data]


## Tf-idf and document similarity

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)
X.shape


(30191, 53784)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(X)

### k-means

In [5]:
from sklearn.cluster import KMeans
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(X)
clusters = km.labels_.tolist()
np.unique(np.array(clusters), return_counts=True)

  super()._check_params_vs_input(X, default_n_init=10)


(array([0, 1, 2, 3, 4]), array([21076,   685,  1682,  4753,  1995]))

### PCA

In [6]:
pca = PCA(n_components=10)  
#pca_result = pca.fit_transform(X.toarray())
#print("Explained Variance Ratio:", pca.explained_variance_ratio_)

### UMAP

In [7]:
#https://pypi.org/project/umap-learn/
umap_model = umap.UMAP(n_neighbors=10, min_dist=0.3)
#umap_result = umap_model.fit_transform(X.toarray())

# Plot the UMAP visualization
#plt.scatter(umap_result[:, 0], umap_result[:, 1])
#plt.title("UMAP Visualization")
#plt.show()

# GENSIM

In [7]:
dictionary = Dictionary(documents=tokenized_data, prune_at=None)
corpus = [dictionary.doc2bow(seq) for seq in tokenized_data]

### lda

In [9]:
from gensim.models import LdaModel # https://radimrehurek.com/gensim/models/ldamodel.html
"""Online Learning for LDA: stochastic optim -> no need to retrain whole corpus when corpus augments -> data hold in memory
    'auto': how asymmetric priors learnt -> 
"""
lda = LdaModel(corpus, num_topics=10,
               id2word= dictionary,
               distributed=False,
               chunksize=2000, #training chunks
               decay=0.5, # rate at which previous lambda value is forgotten (0.5,1)
               passes=1, #training epochs
               update_every=1, #number of documents to be iterated through for each update (during model deployement: set 0 if only need batch training over given corpus)
               alpha='symmetric', #document/topic priors - array | symmetric=1/num_topics | 'asymmetric'=(topic_index + sqrt(num_topics)) | 'auto': learns asymmetric from corpus (need distributed set to True) 
               eta='symmetric', #topic-word  priors - shape (num_topics, num_words) or vector for equal priors accross words
                                #asymmetric and auto possible but equal distrib across words
               offset=1, #slow down first iter -> math:`\\tau_0` from `'Online Learning for LDA'
               eval_every=10, #log perplexity -> needed for auto ? 
               iterations=50, #maximum iter over corpus for inference 
               gamma_threshold=0.001, #minimum change in the value of the gamma parameters to continue iterating
               minimum_probability=0.01, #filter out topic prob lower than that
               random_state=None,
               ns_conf=None, #optional: for distributed learning
               minimum_phi_value=0.01, #lowerbound for topic/word
               per_word_topics=False, #if true: also return topic/words distrib when calling .get_document_topics(
               callbacks=None,
               dtype=np.float32)
            
topic_info = lda.print_topics(num_topics=10, num_words=10)
topic_info

[(0,
  '0.026*"muslim" + 0.017*"offer" + 0.012*"migrant" + 0.010*"sister" + 0.010*"accuse" + 0.009*"com" + 0.009*"government" + 0.008*"endorse" + 0.008*"establishment" + 0.008*"error"'),
 (1,
  '0.027*"tweet" + 0.021*"add" + 0.016*"whitehouse" + 0.012*"area" + 0.011*"google" + 0.010*"hit" + 0.009*"want" + 0.009*"resign" + 0.009*"try" + 0.008*"mission"'),
 (2,
  '0.013*"faith" + 0.011*"capitol" + 0.010*"god" + 0.009*"employee" + 0.009*"charge" + 0.009*"novelcoronavirus" + 0.009*"christ" + 0.009*"hindu" + 0.008*"lay" + 0.008*"journalist"'),
 (3,
  '0.022*"covid" + 0.022*"show" + 0.015*"video" + 0.014*"case" + 0.013*"coronavirus" + 0.012*"death" + 0.011*"people" + 0.010*"woman" + 0.010*"year" + 0.010*"report"'),
 (4,
  '0.050*"show" + 0.029*"photo" + 0.029*"video" + 0.014*"protest" + 0.011*"india" + 0.010*"farmer" + 0.010*"world" + 0.008*"party" + 0.006*"modi" + 0.006*"secretary"'),
 (5,
  '0.019*"people" + 0.014*"covid" + 0.012*"document" + 0.010*"say" + 0.010*"mask" + 0.010*"time" + 0.0

In [28]:
lda.print_topics()

[(0,
  '0.026*"muslim" + 0.017*"offer" + 0.012*"migrant" + 0.010*"sister" + 0.010*"accuse" + 0.009*"com" + 0.009*"government" + 0.008*"endorse" + 0.008*"establishment" + 0.008*"error"'),
 (1,
  '0.027*"tweet" + 0.021*"add" + 0.016*"whitehouse" + 0.012*"area" + 0.011*"google" + 0.010*"hit" + 0.009*"want" + 0.009*"resign" + 0.009*"try" + 0.008*"mission"'),
 (2,
  '0.013*"faith" + 0.011*"capitol" + 0.010*"god" + 0.009*"employee" + 0.009*"charge" + 0.009*"novelcoronavirus" + 0.009*"christ" + 0.009*"hindu" + 0.008*"lay" + 0.008*"journalist"'),
 (3,
  '0.022*"covid" + 0.022*"show" + 0.015*"video" + 0.014*"case" + 0.013*"coronavirus" + 0.012*"death" + 0.011*"people" + 0.010*"woman" + 0.010*"year" + 0.010*"report"'),
 (4,
  '0.050*"show" + 0.029*"photo" + 0.029*"video" + 0.014*"protest" + 0.011*"india" + 0.010*"farmer" + 0.010*"world" + 0.008*"party" + 0.006*"modi" + 0.006*"secretary"'),
 (5,
  '0.019*"people" + 0.014*"covid" + 0.012*"document" + 0.010*"say" + 0.010*"mask" + 0.010*"time" + 0.0

In [24]:
dictionary[1]

'biologicalweapons'

### lda multicore

In [15]:
from gensim.models import LdaMulticore
lda = LdaMulticore(corpus=corpus, num_topics=10, 
                 id2word=dictionary, workers=None,
                 chunksize=2000, 
                 passes=1, 
                 batch=False, 
                 alpha='symmetric',
                 eta=None, 
                 decay=0.5, 
                 offset=1.0, 
                 eval_every=10, 
                 iterations=50,
                 gamma_threshold=0.001, 
                 random_state=None, 
                 minimum_probability=0.01,
                 minimum_phi_value=0.01, 
                 per_word_topics=False, 
                 dtype=np.float32)
topic_info = lda.print_topics(num_topics=10, num_words=10)
topic_info

[(0,
  '0.019*"tweet" + 0.017*"show" + 0.012*"want" + 0.010*"add" + 0.010*"try" + 0.010*"birdwatch" + 0.009*"context" + 0.009*"hit" + 0.009*"mislead" + 0.009*"contribute"'),
 (1,
  '0.009*"people" + 0.006*"covid" + 0.005*"say" + 0.005*"coronavirus" + 0.004*"president" + 0.004*"show" + 0.004*"make" + 0.004*"vaccine" + 0.004*"use" + 0.003*"find"'),
 (2,
  '0.010*"covid" + 0.009*"show" + 0.006*"video" + 0.005*"trump" + 0.005*"vaccine" + 0.005*"president" + 0.005*"people" + 0.005*"death" + 0.004*"win" + 0.004*"get"'),
 (3,
  '0.011*"say" + 0.009*"trump" + 0.006*"covid" + 0.005*"people" + 0.005*"take" + 0.004*"state" + 0.004*"make" + 0.004*"show" + 0.004*"claim" + 0.003*"video"'),
 (4,
  '0.010*"covid" + 0.008*"vaccine" + 0.008*"know" + 0.008*"trump" + 0.007*"say" + 0.006*"people" + 0.006*"president" + 0.006*"get" + 0.004*"government" + 0.004*"take"'),
 (5,
  '0.012*"tweet" + 0.008*"covid" + 0.008*"trump" + 0.008*"get" + 0.006*"want" + 0.006*"try" + 0.006*"people" + 0.006*"birdwatch" + 0.00

In [52]:
lda.get_document_topics(corpus)[4]

[(2, 0.09766076),
 (3, 0.09164561),
 (5, 0.25104368),
 (7, 0.44094688),
 (9, 0.09481776)]

In [51]:
len(corpus)

30191

### HDP

In [9]:
from gensim.models import HdpModel # https://radimrehurek.com/gensim/models/hdpmodel.html
hdp = HdpModel(corpus, dictionary)
topic_info = hdp.print_topics(num_topics=10, num_words=10)
topic_info

[(0,
  '0.022*tweet + 0.012*try + 0.011*add + 0.011*want + 0.011*context + 0.010*mislead + 0.009*hit + 0.009*contribute + 0.008*menu + 0.007*covid'),
 (1,
  '0.009*show + 0.007*trump + 0.006*covid + 0.006*people + 0.006*president + 0.005*say + 0.005*video + 0.005*vaccine + 0.004*election + 0.003*get'),
 (2,
  '0.007*show + 0.005*covid + 0.005*trump + 0.005*people + 0.004*say + 0.003*president + 0.003*video + 0.003*vaccine + 0.002*get + 0.002*state'),
 (3,
  '0.005*show + 0.004*trump + 0.004*covid + 0.003*people + 0.003*president + 0.003*say + 0.003*earthquake + 0.002*get + 0.002*video + 0.002*vaccine'),
 (4,
  '0.004*show + 0.004*covid + 0.003*trump + 0.003*president + 0.003*people + 0.002*say + 0.002*video + 0.002*coronavirus + 0.002*state + 0.002*vaccine'),
 (5,
  '0.004*covid + 0.003*trump + 0.003*show + 0.003*people + 0.003*say + 0.002*president + 0.002*get + 0.002*video + 0.002*vaccine + 0.001*coronavirus'),
 (6,
  '0.004*tweet + 0.003*show + 0.003*covid + 0.003*trump + 0.003*bird