In [None]:
!python -m spacy download en_core_web_sm

In [14]:
import pandas as pd
# Load the misinfo and info transcript tables; both CSV paths are relative, so the
# files are looked up in the notebook's working directory.
misinfo_transcripts, info_transcripts = map(
    pd.read_csv,
    (
        'misinfo_trans_with_punctuation_cleaned.csv',
        'info_trans_with_punctuation_cleaned.csv',
    ),
)

In [15]:
import spacy
# preprocess_text only reads token.lemma_, token.is_stop and token.is_punct, none of
# which come from the dependency parser or the entity recognizer, so both are
# disabled (the same way `lemmatization` loads its pipeline) to avoid running them
# on every transcript.
nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])

def preprocess_text(text):
    """Reduce one transcript to a space-separated string of lemmas.

    Runs the module-level spaCy pipeline ``nlp`` on ``text`` and keeps the lemma
    of every token that is not a stop word, not punctuation and not pure
    whitespace.

    Parameters
    ----------
    text : str
        Raw transcript text. Any non-string value (for example the NaN that
        pandas stores for an empty CSV cell) is treated as an empty transcript.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; ``''`` when nothing is kept.
    """
    # A single missing transcript would otherwise make spaCy raise and abort the
    # whole Series.apply run, so missing values map to an empty document instead.
    if not isinstance(text, str):
        return ''

    doc = nlp(text)
    # Whitespace tokens (newlines, runs of spaces) are neither stop words nor
    # punctuation, so they need their own check to stay out of the result.
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(lemmas)

In [16]:
# Run preprocess_text over the 'transcript_with_punctuation' column of each table;
# each result is a Series of cleaned strings aligned with its source frame.
processed_misinfo_trans, processed_info_trans = (
    frame['transcript_with_punctuation'].apply(preprocess_text)
    for frame in (misinfo_transcripts, info_transcripts)
)

In [None]:
#from sklearn.feature_extraction.text import CountVectorizer

#vectorizer = CountVectorizer(preprocessor=preprocess_text)
#dtm = vectorizer.fit_transform(processed_trans)
#dtm

In [None]:
#dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())
#dtm_df

In [17]:
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

In [18]:
# Pipelines already loaded by _get_lemmatization_pipeline, keyed by model name.
_LEMMATIZATION_PIPELINES = {}


def _get_lemmatization_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it only on first use."""
    if model_name not in _LEMMATIZATION_PIPELINES:
        _LEMMATIZATION_PIPELINES[model_name] = spacy.load(
            model_name, disable=["parser", "ner"]
        )
    return _LEMMATIZATION_PIPELINES[model_name]


def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"], nlp=None):
    """Lemmatize each text, keeping only tokens with an allowed coarse POS tag.

    Parameters
    ----------
    texts : iterable of str
        Documents to process; the output follows the same order.
    allowed_postags : iterable of str
        Values of ``token.pos_`` to keep. The default list is never mutated.
    nlp : optional
        Pipeline exposing ``pipe(texts)``. When omitted, ``en_core_web_sm`` with
        the parser and NER disabled is used; it is loaded once and reused by
        later calls instead of being reloaded every time.

    Returns
    -------
    list of str
        One string per input text: the kept lemmas joined by single spaces.
    """
    if nlp is None:
        nlp = _get_lemmatization_pipeline()
    allowed = frozenset(allowed_postags)
    # nlp.pipe processes the documents as a stream (in batches) and yields them
    # in input order, instead of one separate pipeline call per document.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed)
        for doc in nlp.pipe(texts)
    ]


# POS-filtered lemmatization of both preprocessed transcript collections
# (default allowed_postags), misinfo first and then info.
lemmatized_misinfo_trans, lemmatized_info_trans = (
    lemmatization(transcripts)
    for transcripts in (processed_misinfo_trans, processed_info_trans)
)
#print (lemmatized_texts[0][0:90])

In [19]:
def gen_words(texts):
    """Tokenise every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list with one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(document, deacc=True) for document in texts]

# Token lists (one list of words per transcript) for both lemmatized collections.
misinfo_data_words, info_data_words = map(
    gen_words,
    (lemmatized_misinfo_trans, lemmatized_info_trans),
)

In [20]:
#Bigrams and trigrams
import gensim.models

# Phrase models for the misinfo documents. The trigram model is trained on the
# documents as transformed by the bigram model.
misinfo_bigram_phrases = gensim.models.Phrases(
    misinfo_data_words,
    threshold=100,
    min_count=5,
)
misinfo_trigram_phrases = gensim.models.Phrases(
    misinfo_bigram_phrases[misinfo_data_words],
    threshold=100,
)

# Phraser wrappers around the trained models; these are the objects the
# make_misinfo_* helpers index into.
misinfo_bigram, misinfo_trigram = (
    gensim.models.phrases.Phraser(phrase_model)
    for phrase_model in (misinfo_bigram_phrases, misinfo_trigram_phrases)
)

def make_misinfo_bigrams(texts):
    """Apply the module-level ``misinfo_bigram`` phraser to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(misinfo_bigram[tokens])
    return merged_docs

def make_misinfo_trigrams(texts):
    """Apply ``misinfo_bigram`` and then ``misinfo_trigram`` to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = misinfo_bigram[tokens]
        merged_docs.append(misinfo_trigram[with_bigrams])
    return merged_docs

# Merge detected phrases in the misinfo token lists: bigrams first, then trigrams.
# NOTE(review): make_misinfo_trigrams applies misinfo_bigram itself, so the bigram
# phraser runs a second time on the already-merged documents - confirm this is intended.
misinfo_data_bigrams = make_misinfo_bigrams(texts=misinfo_data_words)
misinfo_data_bigrams_trigrams = make_misinfo_trigrams(texts=misinfo_data_bigrams)



# Phrase models for the info documents. The trigram model is trained on the
# documents as transformed by the bigram model.
info_bigram_phrases = gensim.models.Phrases(
    info_data_words,
    threshold=100,
    min_count=5,
)
info_trigram_phrases = gensim.models.Phrases(
    info_bigram_phrases[info_data_words],
    threshold=100,
)

# Phraser wrappers around the trained models; these are the objects the
# make_info_* helpers index into.
info_bigram, info_trigram = (
    gensim.models.phrases.Phraser(phrase_model)
    for phrase_model in (info_bigram_phrases, info_trigram_phrases)
)

def make_info_bigrams(texts):
    """Apply the module-level ``info_bigram`` phraser to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(info_bigram[tokens])
    return merged_docs

def make_info_trigrams(texts):
    """Apply ``info_bigram`` and then ``info_trigram`` to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = info_bigram[tokens]
        merged_docs.append(info_trigram[with_bigrams])
    return merged_docs

# Merge detected phrases in the info token lists: bigrams first, then trigrams.
# NOTE(review): make_info_trigrams applies info_bigram itself, so the bigram
# phraser runs a second time on the already-merged documents - confirm this is intended.
info_data_bigrams = make_info_bigrams(texts=info_data_words)
info_data_bigrams_trigrams = make_info_trigrams(texts=info_data_bigrams)


In [21]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel

# Build the id <-> token mapping and the bag-of-words corpus for the misinfo documents.
misinfo_id2word = corpora.Dictionary(misinfo_data_bigrams_trigrams)

misinfo_texts = misinfo_data_bigrams_trigrams

misinfo_corpus = [misinfo_id2word.doc2bow(text) for text in misinfo_texts]

# The TF-IDF model is fitted on the unfiltered corpus; the loop below then prunes
# every document of misinfo_corpus in place.
misinfo_tfidf = TfidfModel(misinfo_corpus, id2word=misinfo_id2word)

misinfo_low_value = 0.03  # terms scoring below this within a document are removed from it
misinfo_words = []  # log of every removed token, in document order
misinfo_words_missing_in_tfidf = []
for i, misinfo_bow in enumerate(misinfo_corpus):
    # Score the document once and reuse the result for both checks below.
    misinfo_scores = misinfo_tfidf[misinfo_bow]
    misinfo_tfidf_ids = {term_id for term_id, _ in misinfo_scores}
    misinfo_low_value_words = [
        term_id for term_id, score in misinfo_scores if score < misinfo_low_value
    ]
    # Terms with a TF-IDF score of 0 are missing from the model's output.
    # This has to be computed BEFORE the removed-word log is extended: building the
    # drop list first would record the previous document's missing terms instead of
    # this document's, and never record those of the last document.
    misinfo_words_missing_in_tfidf = [
        term_id for term_id, _ in misinfo_bow if term_id not in misinfo_tfidf_ids
    ]
    misinfo_drops = misinfo_low_value_words + misinfo_words_missing_in_tfidf
    misinfo_words.extend(misinfo_id2word[term_id] for term_id in misinfo_drops)

    # Set membership keeps the per-document filter linear in the document length.
    misinfo_drop_ids = set(misinfo_drops)
    misinfo_new_bow = [entry for entry in misinfo_bow if entry[0] not in misinfo_drop_ids]
    misinfo_corpus[i] = misinfo_new_bow



# Build the id <-> token mapping and the bag-of-words corpus for the info documents.
info_id2word = corpora.Dictionary(info_data_bigrams_trigrams)

info_texts = info_data_bigrams_trigrams

info_corpus = [info_id2word.doc2bow(text) for text in info_texts]

# The TF-IDF model is fitted on the unfiltered corpus; the loop below then prunes
# every document of info_corpus in place.
info_tfidf = TfidfModel(info_corpus, id2word=info_id2word)

info_low_value = 0.03  # terms scoring below this within a document are removed from it
info_words = []  # log of every removed token, in document order
info_words_missing_in_tfidf = []
for i, info_bow in enumerate(info_corpus):
    # Score the document once and reuse the result for both checks below.
    info_scores = info_tfidf[info_bow]
    info_tfidf_ids = {term_id for term_id, _ in info_scores}
    info_low_value_words = [
        term_id for term_id, score in info_scores if score < info_low_value
    ]
    # Terms with a TF-IDF score of 0 are missing from the model's output.
    # This has to be computed BEFORE the removed-word log is extended: building the
    # drop list first would record the previous document's missing terms instead of
    # this document's, and never record those of the last document.
    info_words_missing_in_tfidf = [
        term_id for term_id, _ in info_bow if term_id not in info_tfidf_ids
    ]
    info_drops = info_low_value_words + info_words_missing_in_tfidf
    info_words.extend(info_id2word[term_id] for term_id in info_drops)

    # Set membership keeps the per-document filter linear in the document length.
    info_drop_ids = set(info_drops)
    info_new_bow = [entry for entry in info_bow if entry[0] not in info_drop_ids]
    info_corpus[i] = info_new_bow


In [None]:
#id2word = corpora.Dictionary(data_words)

#corpus = []
#for text in data_words:
#    new = id2word.doc2bow(text)
#    corpus.append(new)

#print (corpus[0][0:20])

#word = id2word[[0][:1][0]]
#print (word)

In [50]:
# Fit a 50-topic LDA model on misinfo_corpus using the misinfo dictionary.
misinfo_lda_model = gensim.models.ldamodel.LdaModel(
    corpus=misinfo_corpus,
    id2word=misinfo_id2word,
    num_topics=50,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha="auto",
    random_state=100,  # fixed seed, shared with the info model
)

In [51]:
# Prepare the pyLDAvis view of the misinfo topics; the bare name on the last line
# makes the notebook display it as the cell output.
pyLDAvis.enable_notebook()
misinfo_vis = pyLDAvis.gensim.prepare(
    misinfo_lda_model,
    misinfo_corpus,
    misinfo_id2word,
    R=30,
    mds="mmds",
)
misinfo_vis

In [52]:
# Fit a 50-topic LDA model on info_corpus using the info dictionary.
info_lda_model = gensim.models.ldamodel.LdaModel(
    corpus=info_corpus,
    id2word=info_id2word,
    num_topics=50,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha="auto",
    random_state=100,  # fixed seed, shared with the misinfo model
)

In [53]:
# Prepare the pyLDAvis view of the info topics; the bare name on the last line
# makes the notebook display it as the cell output.
pyLDAvis.enable_notebook()
info_vis = pyLDAvis.gensim.prepare(
    info_lda_model,
    info_corpus,
    info_id2word,
    R=30,
    mds="mmds",
)
info_vis