In [1]:
import os

import torch
import transformers
import pandas as pd
import numpy as np
import nbformat
from nltk.corpus import stopwords
import spacy
import matplotlib.pyplot as plt

# For BERTopic
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Leet topic
from leet_topic import leet_topic

# For Topic Modeling Evaluation
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single place to point the notebook at the stage-2 CSV files. Set the
# ML7641_DATA_DIR environment variable to run on another machine; the default
# is the directory the notebook was originally written against.
DATA_DIR = os.environ.get(
    "ML7641_DATA_DIR",
    "/Users/npop/Code/Projects/ML7641_Project/Official_Datasets/stage_2",
)

movie_data = pd.read_csv(os.path.join(DATA_DIR, "movie_data_valid.csv"))
reddit_comments = pd.read_csv(os.path.join(DATA_DIR, "reddit_comments_valid.csv"))
youtube_comments = pd.read_csv(os.path.join(DATA_DIR, "youtube_comments_valid.csv"))

In [None]:
# Preview the first rows of the Reddit comments frame as a quick check on the load
reddit_comments.head()

# Text Preprocessing

In [3]:
# De-duplicate on the (body, post_date) pair, keeping the first occurrence
reddit_comments.drop_duplicates(subset=['body', 'post_date'], inplace=True)
# Both fields are required downstream, so discard any row missing either one
reddit_comments.dropna(subset=['body', 'post_date'], inplace=True)

In [4]:
# Remove comments that were deleted by the user or removed by the moderators.
# The pattern is a raw string: in a normal string literal "\[" is an invalid
# escape sequence, which Python reports with a warning at compile time.
placeholder_pattern = r'\[deleted\]|\[removed\]'
deleted_comments = reddit_comments[reddit_comments['body'].str.contains(placeholder_pattern)].index.to_list()
reddit_comments.drop(index=deleted_comments, inplace=True)

In [5]:
# There were 9870 comments that had links in them. We will simply remove the links and keep the rest of the text

# http(s) URL: optional "www.", host characters, a dot plus a short suffix, then any trailing path/query characters
url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
# Index labels of every comment whose body contains at least one match of the pattern
has_link = reddit_comments['body'].str.contains(url_pattern)
comments_with_links = reddit_comments[has_link].index.to_list()

def remove_links(reddit_comments=reddit_comments, ids=comments_with_links) -> None:
    """Strip URLs from the ``body`` of the selected comments, in place.

    Args:
        reddit_comments: DataFrame with a string ``body`` column. The default is
            the notebook-level frame as bound when this cell was run.
        ids: Index labels of the rows to rewrite. The default is the list in
            ``comments_with_links`` as bound when this cell was run.

    Returns:
        None. ``reddit_comments`` is modified in place.
    """
    import re

    if len(ids) == 0:
        # Nothing selected, so there is nothing to rewrite
        return None

    link_re = re.compile(url_pattern)

    def sub_link(text: str) -> str:
        return link_re.sub("", text)

    # Read from the same rows that are written to (`ids`), so a caller-supplied
    # selection is honoured instead of always using the global list.
    reddit_comments.loc[ids, "body"] = reddit_comments.loc[ids, "body"].apply(sub_link)
    return None

In [None]:
# Strip URLs in place; called with no arguments, so the defaults bound when remove_links was defined are used
remove_links()

In [6]:
# Index labels of comments whose body is shorter than 3 characters
body_lengths = reddit_comments['body'].str.len()
too_short = reddit_comments.loc[body_lengths < 3].index.to_list()

In [7]:
# Drop, in place, the rows whose index labels were collected in `too_short`
reddit_comments.drop(index=too_short, inplace=True)

In [None]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each text, keeping only tokens with an allowed part of speech.

    Args:
        texts: Iterable of strings, e.g. a list or a pandas Series of comment bodies.
        allowed_postags: Coarse part-of-speech tags (compared against
            ``token.pos_``) to keep. The default list is only read, never mutated.

    Returns:
        list[str]: One string per input text, in input order, made of the lemmas
        of the kept tokens joined by single spaces. A text with no kept tokens
        yields an empty string.
    """
    # Parser and NER are disabled because only POS tags and lemmas are read below
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    keep = frozenset(allowed_postags)
    # nlp.pipe feeds the texts through the pipeline as a stream instead of
    # making one separate nlp(...) call per text
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in keep)
        for doc in nlp.pipe(texts)
    ]

In [None]:
# Lemmatize every comment body; the result is a list with one entry per row, in the frame's current row order.
# NOTE(review): `lemmatized_comments` is not referenced by any later cell visible here — the topic models use the raw bodies.
lemmatized_comments = lemmatization(reddit_comments['body'])

In [8]:
# Renumber the rows from 0 and discard the old labels (drop=True) so the index is contiguous again after the row removals above
reddit_comments.reset_index(drop=True, inplace=True)

In [11]:
# Summarise column dtypes and non-null counts of the cleaned frame
reddit_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236228 entries, 0 to 236227
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   body                236228 non-null  object 
 1   post_date           236228 non-null  object 
 2   upvotes             236228 non-null  float64
 3   parent_id           236228 non-null  object 
 4   top_level_id        236228 non-null  object 
 5   post_title          236228 non-null  object 
 6   post_id             236228 non-null  object 
 7   movie_title         236228 non-null  object 
 8   movie_release_date  236228 non-null  object 
 9   movie_actors        236228 non-null  object 
 10  valid_post_date     236228 non-null  bool   
dtypes: bool(1), float64(1), object(9)
memory usage: 18.2+ MB


In [9]:
# Rows without a parent id get the "Top Level" marker.
# The filled column is assigned back to the frame: calling fillna(inplace=True)
# on a selected column is chained in-place mutation, which newer pandas warns
# about and which does not update the frame under Copy-on-Write.
reddit_comments['parent_id'] = reddit_comments['parent_id'].fillna("Top Level")

In [10]:
# Independent copy of the rows whose parent_id contains the "Top Level" marker
is_top_level = reddit_comments['parent_id'].str.contains("Top Level")
reddit_top_level = reddit_comments.loc[is_top_level].copy()

# BERTopic

In [12]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"  # To avoid warning

# The model requires documents as List[str], so both selections are converted to plain lists
comments_full = reddit_comments['body'].tolist()
comments_toplevel = reddit_top_level['body'].tolist()

### Pre-calculate Embeddings

In [13]:
# Encode every comment once up front; the same `embeddings` are passed to fit_transform
# and reused by the visualisation and evaluation cells further down
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(comments_full, show_progress_bar=True)

INFO:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:Use pytorch device_name: mps
Batches: 100%|██████████| 7383/7383 [03:13<00:00, 38.08it/s] 


### Custom UMAP & HDBSCAN Parameters

In [14]:
# By default, UMAP has stochastic behavior, set random_state for reproducibility
# This instance projects to 5 components with the cosine metric and is handed to BERTopic below as `umap_model`
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Clustering model handed to BERTopic below as `hdbscan_model`; min_cluster_size=150 sets the smallest group accepted as a cluster
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom')

### Custom Vectorizer

In [23]:
# Bag-of-words vectorizer handed to BERTopic below: English stop words removed, unigrams and bigrams,
# and min_df=2 drops terms found in fewer than two of the documents the vectorizer is fitted on
vectorizer = CountVectorizer(stop_words='english', min_df=2, ngram_range=(1, 2))

### Additional Topic Representations

In [17]:
# KeyBERT-inspired representation with default settings, passed to BERTopic below as `representation_model`
key_rep = KeyBERTInspired()

### Training

In [24]:
# Assemble the topic model from the components configured in the cells above;
# top_n_words=10 is the number of words requested per topic
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    representation_model=key_rep,
    top_n_words=10,
)

# Fit on the full comment list, supplying the pre-computed embeddings as the second argument.
# Later cells treat `topics` as one label per document, with -1 marking outliers.
topics, probs = topic_model.fit_transform(comments_full, embeddings)

In [31]:
# First 30 rows of the per-topic summary table
# (optional export, left disabled: .to_csv("/Users/npop/Desktop/topic_model.csv"))
topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,130784,-1_character_movie_characters_bad,"[character, movie, characters, bad, film, does...","[Have you not seen Celery Man?, It's not very ..."
1,0,8516,0_teaser trailer_trailer trailer_watch trailer...,"[teaser trailer, trailer trailer, watch traile...","[Yeah, and what about ""Teaser trailer"" ?, Was ..."
2,1,4631,1_good actor_great actor_actor_director,"[good actor, great actor, actor, director, act...","[Same director as Inside Out and Up so..., He'..."
3,2,4037,2_good actress_actress_actresses_cast,"[good actress, actress, actresses, cast, actin...",[Implying Twilight had a bad leading actress? ...
4,3,4005,3_movie movie_movie_film_good movie,"[movie movie, movie, film, good movie, films, ...","[In what movie?, I will see this movie ( ;) ) ..."
5,4,3880,4_song trailer_trailer song_music trailer_soun...,"[song trailer, trailer song, music trailer, so...","[What is the song in the trailer?, song?, \n[L..."
6,5,2740,5_downvoted_right_yeah right_downvote,"[downvoted, right, yeah right, downvote, downv...","[He's definitely being downvoted, that was the..."
7,6,2480,6_choo choo_choo_fookin prawns_ho ho,"[choo choo, choo, fookin prawns, ho ho, ho, pr...",[Oh ho ho hohoho hoooooooooooooooooooooooooooo...
8,7,2295,7_book book_second book_book_book movie,"[book book, second book, book, book movie, lov...","[The book! You should read the book!, They sh..."
9,8,2166,8_shit pants_intensifies_pants_jeans,"[shit pants, intensifies, pants, jeans, underw...","[I just shit my pants. , Me too. They make me..."


In [None]:
# Plot an overview of the fitted topics
topic_model.visualize_topics()

In [None]:
# 2-D projection of the sentence embeddings, used only for the document scatter plot.
# random_state is fixed (as for the clustering UMAP) so the layout is the same on every run;
# without it UMAP is stochastic and the figure changes between executions.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
topic_model.visualize_documents(comments_full, reduced_embeddings=reduced_embeddings, hide_annotations=True)

In [None]:
# Bar charts of the top 5 words (n_words=5) for a hand-picked subset of topic ids
topic_model.visualize_barchart(topics=[0, 1, 2, 3, 4, 7, 17, 22], n_words=5, width=300)

In [None]:
# Heatmap limited to 20 topics (top_n_topics=20)
topic_model.visualize_heatmap(top_n_topics=20)

### Quantitative Evaluation

In [35]:
# Outlier documents (topic -1) are not a real cluster, so `X` and `labels`
# are built from the remaining documents only.
umap_embeddings = topic_model.umap_model.transform(embeddings)
# Positions of the non-outlier documents; the labels are then looked up by position
indices = [position for position, label in enumerate(topics) if label != -1]
labels = [topics[position] for position in indices]
X = umap_embeddings[np.array(indices)]

In [36]:
# Silhouette score over the non-outlier documents only, using the `X` / `labels` built in the previous cell
silhouette_score(X, labels)

0.6133194

In [None]:
# Display everything get_topics() returns for the fitted model
topic_model.get_topics()

#### Coherence

In [None]:
# Reference corpus for the coherence measure. The comments are tokenised with
# the analyzer of the topic model's own CountVectorizer so that multi-word
# topic terms (the vectorizer emits bigrams) exist as single tokens here too.
analyzer = topic_model.vectorizer_model.build_analyzer()
documents = [analyzer(comment) for comment in comments_full]
dictionary = Dictionary(documents)

# Top words per topic. A separate name is used so the per-document `topics`
# list returned by fit_transform is not overwritten. Words that are absent
# from the dictionary (including empty strings) cannot be scored and are
# dropped; a topic left with no words is skipped instead of being passed on.
topic_words = {}
for topic_id in topic_model.get_topics():
    words = [word for word, _ in topic_model.get_topic(topic_id) if word in dictionary.token2id]
    if words:
        topic_words[topic_id] = words

# One CoherenceModel over all topics: the corpus statistics are gathered once
# and a score is returned per topic, in the order the topics were supplied.
topic_ids = list(topic_words)
cm = CoherenceModel(
    topics=[topic_words[topic_id] for topic_id in topic_ids],
    texts=documents,
    dictionary=dictionary,
    coherence='c_v',  # Use 'c_v' or other coherence measures
)
coherence_scores = dict(zip(topic_ids, cm.get_coherence_per_topic()))

# Average coherence score
average_coherence = sum(coherence_scores.values()) / len(coherence_scores)
print("Average Coherence Score:", average_coherence)


# LeetTopic

In [None]:
# Run LeetTopic over the comment bodies and write the result to demo.html.
# spacy_model is the English pipeline, matching the English comments and the
# model loaded in lemmatization(); "hr_core_news_sm" is spaCy's Croatian pipeline.
leet_df, topic_data = leet_topic.LeetTopic(reddit_comments,
                                            document_field="body",
                                            html_filename="demo.html",
                                            extra_fields=["hdbscan_labels"],
                                            spacy_model="en_core_web_sm",
                                            max_distance=.45)