In [1]:
import sys, os
import re, nltk
from nltk import pos_tag
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
import sklearn, gensim
from sklearn.decomposition import PCA
from gensim.corpora import Dictionary
sys.path.append(os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/src/')
os.chdir(os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/data/')
from utils import preprocess_for_bow

In [2]:
data = preprocess_for_bow('data.csv')
text_data = data['text']
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokenized_data = [[token.strip() for token in tokenizer.tokenize(text)] for text in text_data]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/romainbourgeois/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/romainbourgeois/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/romainbourgeois/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Tf-idf and document similarity

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)
X.shape


(30191, 53784)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(X)

### k-means

In [5]:
from sklearn.cluster import KMeans
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(X)
clusters = km.labels_.tolist()
np.unique(np.array(clusters), return_counts=True)

  super()._check_params_vs_input(X, default_n_init=10)


(array([0, 1, 2, 3, 4]), array([21076,   685,  1682,  4753,  1995]))

### PCA

In [6]:
pca = PCA(n_components=10)  
#pca_result = pca.fit_transform(X.toarray())
#print("Explained Variance Ratio:", pca.explained_variance_ratio_)

### UMAP

In [7]:
#https://pypi.org/project/umap-learn/
umap_model = umap.UMAP(n_neighbors=10, min_dist=0.3)
#umap_result = umap_model.fit_transform(X.toarray())

# Plot the UMAP visualization
#plt.scatter(umap_result[:, 0], umap_result[:, 1])
#plt.title("UMAP Visualization")
#plt.show()

# GENSIM

In [38]:
dictionary = Dictionary(documents=tokenized_data, prune_at=None)
corpus = [dictionary.doc2bow(seq) for seq in tokenized_data]

### lda

In [26]:
from gensim.models import LdaModel # https://radimrehurek.com/gensim/models/ldamodel.html
"""Online Learning for LDA: stochastic optim -> no need to retrain whole corpus when corpus augments -> data hold in memory
    'auto': how asymmetric priors learnt -> 
"""
lda = LdaModel(corpus, num_topics=10,
               id2word= dictionary,
               distributed=True,
               chunksize=2000, #training chunks
               decay=0.5, # rate at which previous lambda value is forgotten (0.5,1)
               passes=1, #training epochs
               update_every=1, #number of documents to be iterated through for each update (during model deployement: set 0 if only need batch training over given corpus)
               alpha='symmetric', #document/topic priors - array | symmetric=1/num_topics | 'asymmetric'=(topic_index + sqrt(num_topics)) | 'auto': learns asymmetric from corpus (need distributed set to True) 
               eta='symmetric', #topic-word  priors - shape (num_topics, num_words) or vector for equal priors accross words
                                #asymmetric and auto possible but equal distrib across words
               offset=1, #slow down first iter -> math:`\\tau_0` from `'Online Learning for LDA'
               eval_every=10, #log perplexity -> needed for auto ? 
               iterations=50, #maximum iter over corpus for inference 
               gamma_threshold=0.001, #minimum change in the value of the gamma parameters to continue iterating
               minimum_probability=0.01, #filter out topic prob lower than that
               random_state=None,
               ns_conf=None, #optional: for distributed learning
               minimum_phi_value=0.01, #lowerbound for topic/word
               per_word_topics=False, #if true: also return topic/words distrib when calling .get_document_topics(
               callbacks=None,
               dtype=np.float32)
            
topic_info = lda.print_topics(num_topics=10, num_words=10)
topic_info

[(0,
  '0.029*"118" + 0.027*"165" + 0.027*"5164" + 0.024*"283" + 0.019*"30" + 0.016*"3158" + 0.014*"219" + 0.011*"498" + 0.010*"1651" + 0.010*"46"'),
 (1,
  '0.029*"69" + 0.025*"85" + 0.015*"30" + 0.014*"28" + 0.012*"151" + 0.007*"5111" + 0.007*"2786" + 0.007*"552" + 0.007*"11" + 0.007*"138"'),
 (2,
  '0.055*"167" + 0.046*"2465" + 0.029*"1456" + 0.014*"126" + 0.014*"148" + 0.013*"248" + 0.011*"6138" + 0.009*"7554" + 0.008*"5410" + 0.008*"2161"'),
 (3,
  '0.015*"8951" + 0.015*"1904" + 0.014*"2112" + 0.012*"1617" + 0.011*"8560" + 0.011*"935" + 0.011*"8900" + 0.010*"3425" + 0.010*"5786" + 0.010*"12565"'),
 (4,
  '0.029*"283" + 0.019*"2854" + 0.014*"271" + 0.012*"6623" + 0.010*"20863" + 0.009*"14286" + 0.009*"118" + 0.008*"165" + 0.008*"641" + 0.008*"935"'),
 (5,
  '0.025*"118" + 0.020*"6028" + 0.014*"151" + 0.012*"111" + 0.012*"69" + 0.011*"609" + 0.010*"25" + 0.010*"6451" + 0.009*"100" + 0.008*"935"'),
 (6,
  '0.019*"935" + 0.019*"118" + 0.017*"1059" + 0.014*"1735" + 0.013*"3257" + 0.010

need to call the index of corpus dictionary to retrieve the words

In [27]:
lda.get_document_topics(corpus[0], minimum_probability=0.001)

[(0, 0.29467753),
 (1, 0.6550206),
 (2, 0.0062861973),
 (3, 0.006286224),
 (4, 0.006286416),
 (5, 0.0062913466),
 (6, 0.0062875925),
 (7, 0.0062872106),
 (8, 0.006290098),
 (9, 0.0062868167)]

### lda multicore

In [15]:
from gensim.models import LdaMulticore
lda = LdaMulticore(corpus=corpus, num_topics=10, 
                 id2word=dictionary, workers=None,
                 chunksize=2000, 
                 passes=1, 
                 batch=False, 
                 alpha='symmetric',
                 eta=None, 
                 decay=0.5, 
                 offset=1.0, 
                 eval_every=10, 
                 iterations=50,
                 gamma_threshold=0.001, 
                 random_state=None, 
                 minimum_probability=0.01,
                 minimum_phi_value=0.01, 
                 per_word_topics=False, 
                 dtype=np.float32)
topic_info = lda.print_topics(num_topics=10, num_words=10)
topic_info

[(0,
  '0.019*"tweet" + 0.017*"show" + 0.012*"want" + 0.010*"add" + 0.010*"try" + 0.010*"birdwatch" + 0.009*"context" + 0.009*"hit" + 0.009*"mislead" + 0.009*"contribute"'),
 (1,
  '0.009*"people" + 0.006*"covid" + 0.005*"say" + 0.005*"coronavirus" + 0.004*"president" + 0.004*"show" + 0.004*"make" + 0.004*"vaccine" + 0.004*"use" + 0.003*"find"'),
 (2,
  '0.010*"covid" + 0.009*"show" + 0.006*"video" + 0.005*"trump" + 0.005*"vaccine" + 0.005*"president" + 0.005*"people" + 0.005*"death" + 0.004*"win" + 0.004*"get"'),
 (3,
  '0.011*"say" + 0.009*"trump" + 0.006*"covid" + 0.005*"people" + 0.005*"take" + 0.004*"state" + 0.004*"make" + 0.004*"show" + 0.004*"claim" + 0.003*"video"'),
 (4,
  '0.010*"covid" + 0.008*"vaccine" + 0.008*"know" + 0.008*"trump" + 0.007*"say" + 0.006*"people" + 0.006*"president" + 0.006*"get" + 0.004*"government" + 0.004*"take"'),
 (5,
  '0.012*"tweet" + 0.008*"covid" + 0.008*"trump" + 0.008*"get" + 0.006*"want" + 0.006*"try" + 0.006*"people" + 0.006*"birdwatch" + 0.00

In [25]:
lda.get_document_topics(corpus[0], minimum_probability=0.001)

[(0, 0.0047817216),
 (1, 0.0047822436),
 (2, 0.0047825268),
 (3, 0.0047821915),
 (4, 0.004782601),
 (5, 0.0047822236),
 (6, 0.0047820755),
 (7, 0.9569583),
 (8, 0.0047830148),
 (9, 0.0047830935)]

### HDP

In [9]:
from gensim.models import HdpModel # https://radimrehurek.com/gensim/models/hdpmodel.html
hdp = HdpModel(corpus, dictionary)
topic_info = hdp.print_topics(num_topics=10, num_words=10)
topic_info

[(0,
  '0.022*tweet + 0.012*try + 0.011*add + 0.011*want + 0.011*context + 0.010*mislead + 0.009*hit + 0.009*contribute + 0.008*menu + 0.007*covid'),
 (1,
  '0.009*show + 0.007*trump + 0.006*covid + 0.006*people + 0.006*president + 0.005*say + 0.005*video + 0.005*vaccine + 0.004*election + 0.003*get'),
 (2,
  '0.007*show + 0.005*covid + 0.005*trump + 0.005*people + 0.004*say + 0.003*president + 0.003*video + 0.003*vaccine + 0.002*get + 0.002*state'),
 (3,
  '0.005*show + 0.004*trump + 0.004*covid + 0.003*people + 0.003*president + 0.003*say + 0.003*earthquake + 0.002*get + 0.002*video + 0.002*vaccine'),
 (4,
  '0.004*show + 0.004*covid + 0.003*trump + 0.003*president + 0.003*people + 0.002*say + 0.002*video + 0.002*coronavirus + 0.002*state + 0.002*vaccine'),
 (5,
  '0.004*covid + 0.003*trump + 0.003*show + 0.003*people + 0.003*say + 0.002*president + 0.002*get + 0.002*video + 0.002*vaccine + 0.001*coronavirus'),
 (6,
  '0.004*tweet + 0.003*show + 0.003*covid + 0.003*trump + 0.003*bird