In [24]:
import pandas as pd
import re
from ast import literal_eval
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk

# Seed used for NumPy's global random number generator in this notebook.
RANDOM_SEED = 2018

# Make sure the WordNet corpus is available locally (used by the
# WordNetLemmatizer-based helper defined further down), then seed NumPy.
nltk.download('wordnet')
np.random.seed(RANDOM_SEED)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shivarjun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Read the tweets CSV and collapse rows that share the same `text` value,
# keeping one row per distinct text (drop_duplicates' default keeps the first).
mentions_tweets = (
    pd.read_csv(r"selenium_tweets.csv")
    .drop_duplicates(subset=['text'])
)

In [26]:
def lemmatize_stemming(text):
    """Return the WordNet lemma of ``text``, treating it as a verb.

    Despite the function name, no stemming step is applied: the token is
    only passed through ``WordNetLemmatizer.lemmatize`` with ``pos='v'``.

    Args:
        text: A single token (word) to lemmatize.

    Returns:
        The value produced by ``WordNetLemmatizer().lemmatize(text, pos='v')``.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(text, pos='v')

def preprocess(text):
    """Turn a raw text string into a list of cleaned, lemmatized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``; tokens that
    are gensim stopwords or have three characters or fewer are discarded, and
    each remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The document (e.g. one tweet) to process.

    Returns:
        A list of processed tokens in their original order; empty if no
        token survives the filters.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in stopwords
    ]

In [28]:
# Coerce every tweet text to a string, then tokenize/lemmatize it with
# preprocess(); the result is a Series holding one token list per tweet.
processed_docs = (
    mentions_tweets['text']
    .astype('str')
    .apply(preprocess)
)
# Preview the first three processed documents.
processed_docs.head(3)

0    [antidote, fauci, covid, fauci, cdcgov, demoni...
1        [jack, abort, fetuses, aren, people, medical]
2                                               [mike]
Name: text, dtype: object

In [29]:
# Build the token -> integer id vocabulary from all processed documents.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

# Prune the vocabulary: drop tokens seen in fewer than 15 documents or in more
# than 50% of documents, then keep at most the 100,000 most frequent tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Encode each document as a sparse bag-of-words: a list of (token_id, count).
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

[(2, 1),
 (7, 1),
 (163, 1),
 (210, 1),
 (356, 1),
 (577, 1),
 (628, 1),
 (762, 1),
 (1000, 1),
 (1072, 1),
 (1138, 1),
 (1712, 1)]

In [30]:
from gensim import corpora, models

# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)

# Apply the fitted model to the same corpus to obtain its TF-IDF-weighted form.
corpus_tfidf = tfidf[bow_corpus]

# Running LDA using Bag of Words

In [31]:
# Fit a two-topic LDA model on the raw bag-of-words counts.
# alpha/eta are the document-topic and topic-word priors; random_state pins
# the model's random initialisation.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=2,
    passes=10,
    workers=3,
    alpha=0.1,
    eta=0.1,
    random_state=10,
)

# num_topics=-1 requests every topic rather than a subset.
for idx, topic in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.016*"vaccine" + 0.010*"say" + 0.009*"like" + 0.009*"people" + 0.008*"twitter" + 0.008*"want" + 0.007*"covid" + 0.007*"biden" + 0.007*"mandate" + 0.006*"know"
Topic: 1 
Words: 0.038*"covid" + 0.029*"twitter" + 0.020*"people" + 0.019*"like" + 0.015*"vaccinate" + 0.010*"vaccine" + 0.009*"deaths" + 0.009*"die" + 0.008*"case" + 0.007*"mask"


# Running LDA using TF-IDF

In [33]:
# Fit a two-topic LDA model on the TF-IDF-weighted corpus.
# random_state is pinned (same seed as the bag-of-words model) so the model's
# random initialisation no longer depends on whatever global NumPy RNG state
# earlier cells left behind, i.e. on cell execution order.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=2,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=10,
)

# Print the top words of every topic (-1 = all topics).
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"covid" + 0.007*"people" + 0.007*"twitter" + 0.007*"vaccine" + 0.006*"vaccinate" + 0.005*"know" + 0.005*"say" + 0.005*"like" + 0.005*"think" + 0.004*"die"
Topic: 1 Word: 0.008*"like" + 0.008*"covid" + 0.006*"twitter" + 0.006*"people" + 0.006*"vaccine" + 0.005*"want" + 0.004*"mask" + 0.004*"vaccinate" + 0.004*"need" + 0.004*"get"


In [23]:
# Learn collocation (phrase) models from the tokenized documents.
# The bigram model is trained on the raw token lists; the trigram model is
# trained on the bigram-transformed documents so it can extend detected pairs.
bigram = gensim.models.Phrases(processed_docs, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[processed_docs], threshold=100)

# Wrap each trained model in a Phraser for applying it to documents.
# trigram_mod must wrap `trigram` (not `bigram`); otherwise it is just a second
# copy of bigram_mod and the trained trigram model is never used.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)