In [None]:
import pandas as pd
import numpy as np

import nltk
import spacy

from gensim.models import Phrases
from gensim.models.phrases import Phraser 
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

import pyLDAvis.gensim_models
import pyLDAvis

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the input CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="frame2.csv")

In [None]:
# Drop every apostrophe from the text column. The pattern is a plain
# character, so it is matched literally (regex=False states that explicitly).
df = df.assign(words=df['words'].str.replace("'", "", regex=False))

In [None]:
# Keep only the rows whose `language` value is 'en', then show the remaining
# language counts as a sanity check (only 'en' should be left).
is_english = df['language'] == 'en'
df = df.loc[is_english]
df['language'].value_counts()

### Clean text (build bigrams and trigrams, lemmatize, and keep only selected parts of speech)

In [None]:
# Tokenize every transcript once, up front. gensim's Phrases expects an
# iterable of token lists; given raw strings it iterates character by
# character and learns character "bigrams" instead of word collocations.
tokenized_docs = [text.split() for text in df.words]

# Learn bigram collocations from the tokens, then trigram collocations from
# the bigram-merged stream.
bigram_phrases = Phrases(tokenized_docs, min_count=10)
trigram_phrases = Phrases(bigram_phrases[tokenized_docs], min_count=5)

# Create Phraser model objects for faster processing
bigram_model = Phraser(bigram_phrases)
trigram_model = Phraser(trigram_phrases)
trigrams = [trigram_model[bigram_model[tokens]] for tokens in tokenized_docs]

# The parser and NER components are disabled because only lemmas and
# part-of-speech tags are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

# Lemmatize each document and keep only tokens whose POS tag is allowed.
# nlp.pipe streams the documents through spaCy in batches and yields one Doc
# per input text, in input order.
lemmatized_words = [
    [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    for doc in nlp.pipe(" ".join(tokens) for tokens in trigrams)
]

In [None]:
# Preview: the first 50 lemmas kept for the first document
first_doc_lemmas = lemmatized_words[0]
print(first_doc_lemmas[:50])

In [None]:
# Map every lemma to an integer id.
id2word = Dictionary(documents=lemmatized_words)

# Prune the vocabulary: drop tokens that appear in fewer than 10 documents or
# in more than 40% of them, then close any gaps left in the id sequence.
id2word.filter_extremes(no_below=10, no_above=0.4)
id2word.compactify()

# Bag-of-words representation: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, lemmatized_words))

### Create LDA model and print out topics

In [None]:
num_topics = 7

# Training hyperparameters, gathered in one place so they are easy to scan.
# random_state is fixed so repeated runs use the same seed.
lda_params = dict(
    num_topics=num_topics,
    random_state=1,
    chunksize=30,
    passes=40,
    alpha=0.5,
    eta=0.91,
    eval_every=1,
    per_word_topics=True,
    workers=2,
)

lda_model = LdaMulticore(corpus=corpus, id2word=id2word, **lda_params)

In [None]:
# Show the 10 highest-weighted words of every topic. The topic count comes
# from num_topics rather than a repeated literal, so this cell stays in sync
# with the model configuration if the number of topics is changed.
lda_model.print_topics(num_topics=num_topics, num_words=10)

By inspecting the top keywords of each topic, we assign each one a descriptive label:
- Topic 0 : "Culture"
- Topic 1 : "UK"
- Topic 2 : "Crimes"
- Topic 3 : "Situational"
- Topic 4 : "Immigrants"
- Topic 5 : "Relationships"
- Topic 6 : "Politics"

### Coherence Score

In [None]:
# Score the trained model with the 'c_v' coherence measure, computed from the
# lemmatized texts and the pruned dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=lemmatized_words,
    dictionary=id2word,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()
coherence_score

In [None]:
# Get a list of vectors of topic probabilities: one dense vector of length
# num_topics per document, ordered by topic id.
topic_vecs = []
for bow in corpus:
    # get_document_topics returns sparse (topic_id, probability) pairs. Even
    # with minimum_probability=0.0, gensim applies a tiny positive floor, so
    # topics with negligible weight can be missing from the list. Look each
    # topic up by id and default to 0.0 rather than indexing by position,
    # which would raise IndexError or shift values into the wrong topic.
    topic_probs = dict(lda_model.get_document_topics(bow, minimum_probability=0.0))
    topic_vecs.append([topic_probs.get(topic_id, 0.0) for topic_id in range(num_topics)])

In [None]:
# Attach the per-document topic probabilities to the main frame, one column
# per topic. Column names are listed in topic-id order (topic 0 first).
topic_columns = ['Culture', 'UK', 'Crimes', 'Situational', 'Immigrants', 'Relationships', 'Politics']

# Reuse df's index so the new columns line up row for row when concatenated.
LDA_probs = pd.DataFrame(topic_vecs, index=df.index, columns=topic_columns)
df = pd.concat([df, LDA_probs], axis="columns")

In [None]:
# Persist the frame, including the topic-probability columns, without
# writing the row index.
df.to_csv(path_or_buf="frame3.csv", index=False)

In [5]:
import pandas as pd

In [8]:
# Reload the processed dataset that already carries the topic columns.
# NOTE(review): this is a hard-coded absolute Windows path and will only
# resolve on the original author's machine - consider a configurable,
# project-relative data directory.
df = pd.read_csv(
    filepath_or_buffer=r"D:\PROJECTS\transnlp\data\processed\processed_content_with_topics.csv"
)

In [9]:
# Preview the first five rows of the reloaded frame.
df.head(5)

Unnamed: 0,S No.,Tag,URL,Raw Transcript,Transcript,CleanTag,Year,Names,Title,runtime,...,word_count,diversity,diversity_ratio,Culture,UK,Crimes,Situational,Immigrants,Relationships,Politics
0,0,Michelle Buteau: Welcome to Buteaupia (2020) ...,https://scrapsfromtheloft.com/comedy/michelle-...,['Michelle Buteau’s Netflix special Welcome to...,michelle buteaus netflix special welcome to bu...,Michelle Buteau: Welcome to Buteaupia (2020),2020.0,Michelle Buteau,Welcome to Buteaupia,58.0,...,3222,833,0.258535,0.110807,0.132198,0.006003,0.727972,0.020906,0.00097,0.001144
1,1,Theo Von: No Offense (2016) | Transcript,https://scrapsfromtheloft.com/comedy/theo-von-...,['Theo Von: No Offense was recorded at the Civ...,theo von no offense was recorded at the civic ...,Theo Von: No Offense (2016),2016.0,Theo Von,No Offense,67.0,...,3777,1215,0.321684,0.28331,0.000627,0.003217,0.596115,0.115468,0.000599,0.000663
2,2,Nate Bargatze’s Nashville Christmas (2024) | T...,https://scrapsfromtheloft.com/comedy/nate-barg...,['Nate Bargatze’s Nashville Christmas is a hea...,nate bargatzes nashville christmas is a heartw...,Nate Bargatze’s Nashville Christmas (2024),2024.0,Nate Bargatze’s,Nashville Christmas,61.0,...,2451,890,0.363117,0.185778,0.001202,0.000758,0.630959,0.179455,0.000914,0.000934
3,3,"Your Friend, Nate Bargatze (2024) | Transcript",https://scrapsfromtheloft.com/comedy/your-frie...,"['Your Friend, Nate Bargatze (2024)\nGenre: Co...",your friend nate bargatze comedy standupdirec...,"Your Friend, Nate Bargatze (2024)",2024.0,Nate Bargatze,"Your Friend,",63.0,...,2684,755,0.281297,0.038446,0.001057,0.003672,0.933593,0.020873,0.001378,0.00098
4,4,Ronny Chieng: Love to Hate It (2024) | Transcript,https://scrapsfromtheloft.com/comedy/ronny-chi...,"['[tuning]', '[gentle Hawaiian music playing o...",tuning gentle hawaiian music playing over radi...,Ronny Chieng: Love to Hate It (2024),2024.0,Ronny Chieng,Love to Hate It,65.0,...,3640,1197,0.328846,0.000747,0.00055,0.001193,0.463722,0.079922,0.453276,0.000589
