In [51]:
import pandas as pd
from bertopic import BERTopic as bt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
import re

In [52]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahes\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mahes\AppData\Roaming\nltk_data...


True

In [53]:
stop_words = set(stopwords.words('english'))

In [82]:
data = pd.read_csv('s1-s5_emotions.csv')

In [83]:
data = data.reset_index().rename(columns={'index': 'timestamp'})

In [84]:
data.head()

Unnamed: 0,timestamp,author,episode_number,episode_title,quote,quote_order,season,top_emotion_1,prob_1,top_emotion_2,prob_2
0,0,Monica,1,Monica Gets A Roommate,There's nothing to tell! He's just some guy I ...,0,1,relaxed,0.455074,content,0.193351
1,1,Joey,1,Monica Gets A Roommate,"C'mon, you're going out with the guy! There's ...",1,1,tense,0.199686,surprised,0.155179
2,2,Chandler,1,Monica Gets A Roommate,"All right Joey, be nice. So does he have a hum...",2,1,confused,0.389611,aroused,0.197605
3,3,Phoebe,1,Monica Gets A Roommate,"Wait, does he eat chalk?",3,1,confused,0.439212,surprised,0.289001
4,4,Phoebe,1,Monica Gets A Roommate,"Just, 'cause, I don't want her to go through w...",4,1,surprised,0.202174,confused,0.194774


In [85]:
timestamps = data['season'].tolist()

In [93]:
quotes = data['quote']

In [94]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = [word for word in text.split() if word not in stop_words and len(word) > 2]
    chars = data['author'].unique().tolist()
    for char in chars:
        text = text.replace(char.lower(), '')
    return ' '.join(tokens)

In [95]:
quotes = quotes.apply(preprocess_text)

In [96]:
vectorizer_model = CountVectorizer(ngram_range=(1,3), stop_words = 'english')

In [38]:
# Applying regex on the quotes
quotes = quotes.str.replace(r'[^\w\s]', '') # remove punctuation
quotes = quotes.str.replace(r'\d+', '') # remove numbers
quotes = quotes.str.replace(r'\n', '') # remove newlines
quotes = quotes.str.lower() # lowercase
quotes = quotes.str.strip() # remove leading/trailing spaces
# remove character names from quotes
chars = data['author'].unique().tolist()
chars = [char.lower() for char in chars]
for char in chars:
    quotes = quotes.str.replace(char, '')

In [97]:
quotes = quotes.tolist()

In [98]:
dynamic_topic_model = bt(verbose=True, vectorizer_model=vectorizer_model, n_gram_range=(1,3), min_topic_size=5, nr_topics='auto')

In [99]:
topics, probs = dynamic_topic_model.fit_transform(quotes)

2024-11-02 17:22:17,825 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/725 [00:00<?, ?it/s]

2024-11-02 17:22:31,563 - BERTopic - Embedding - Completed ✓
2024-11-02 17:22:31,565 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-02 17:22:43,887 - BERTopic - Dimensionality - Completed ✓
2024-11-02 17:22:43,887 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-02 17:22:52,117 - BERTopic - Cluster - Completed ✓
2024-11-02 17:22:52,118 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-02 17:22:53,918 - BERTopic - Representation - Completed ✓
2024-11-02 17:22:53,926 - BERTopic - Topic reduction - Reducing number of topics
2024-11-02 17:22:55,970 - BERTopic - Topic reduction - Reduced number of topics from 744 to 519


In [100]:
topics_over_time = dynamic_topic_model.topics_over_time(quotes, timestamps = data['season'], nr_bins=10)

5it [00:19,  3.81s/it]


In [101]:
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"rachel, little, joey, chandler, know",1308,0.996
1,0,"ross, monica, listens, chandler, joey",568,0.996
2,1,"partay andr, partay, andr, kay, naa kay kay",326,0.996
3,2,"hey hey, hey hey hey, hey, hello hey, hey hello",77,0.996
4,3,"yeah yeah yeah, yeah yeah, yeah, yeah yep, yea...",50,0.996
...,...,...,...,...
1919,471,"woman standing, woman standing glares, walks r...",1,4.600
1920,474,"wont hell, wont hell hole, wont wont hell, arr...",1,4.600
1921,475,"fine happy, fine happy anniversary, happy anni...",1,4.600
1922,489,"yeah wanted make, wanted make dramatic, dramat...",1,4.600


In [75]:
topics_over_time.iloc[4]

Topic                                                     3
Words        hey hey, hey hey hey, hey, hello hey, howd hey
Frequency                                                23
Timestamp                                           -23.179
Name                    3_hey hey_hey hey hey_hey_hello hey
Name: 4, dtype: object

In [102]:
dynamic_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5539,-1_rachel_joey_chandler_yknow,"[rachel, joey, chandler, yknow, know, gonna, o...","[chandler, chandler, chandler]"
1,0,1810,0_ross_monica_chandler_joey,"[ross, monica, chandler, joey, rachel, listens...","[ross, ross, ross]"
2,1,1246,1_kay_kay kay_kay hon_andr naa kay,"[kay, kay kay, kay hon, andr naa kay, naa kay ...","[, naa, kay]"
3,2,386,2_hey hey_hey hey hey_hey_hello hey,"[hey hey, hey hey hey, hey, hello hey, hey hel...","[hey, hey hey, hey hey hey]"
4,3,295,3_yeah yeah yeah_yeah yeah_yeah_yeah yeah yep,"[yeah yeah yeah, yeah yeah, yeah, yeah yeah ye...","[yeah yeah, yeah yeah, yeah yeah yeah]"
...,...,...,...,...,...
514,513,5,513_lens_switch_good maybe switch_connected sw...,"[lens, switch, good maybe switch, connected sw...","[good maybe switch back, lens lens ill right b..."
515,514,5,514_credit_credit card_credit cards_card,"[credit, credit card, credit cards, card, card...",[know probably happened someone musta stolen c...
516,515,5,515_ass talks_ass talks like_barely know talk_...,"[ass talks, ass talks like, barely know talk, ...","[come guys havent talked, cant tell cant guy b..."
517,516,5,516_copa_cabana_french_france,"[copa, cabana, french, france, toast, avec moi...",[well sometimes think selling practice could m...


In [103]:
dynamic_topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)