Datasource: https://github.com/anirudhshenoy/text-classification-small-datasets/tree/master/datasets

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import numpy as np
import string
import torch
import nltk

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load your data
#df = pd.read_csv('../Data/YT_title_test_data.csv')
df = df = pd.read_csv('../Data/YT_title_test_data_500.csv')

# Drop rows with missing titles and preprocess
df.dropna(subset=['title'], inplace=True)
df.rename(columns={'title': 'Comment'}, inplace=True)
df.shape

(500, 2)

In [3]:
# Set up stop words and lemmatizer
stop_words = set(stopwords.words('english'))
custom_stop_words = set(string.digits + string.punctuation)
additional_stop_words = ['the', 'and', 'was', 'were', 'with', 'a', 'my', '``']
stop_words.update(custom_stop_words)
stop_words.update(additional_stop_words)
lemmatizer = WordNetLemmatizer()

In [4]:
# Function to preprocess text
def preprocess_text(text):
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator).lower()  # Remove punctuation and lowercase
    word_tokens = word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in word_tokens
        if word not in stop_words
    )

# Preprocess and encode comments
df['Comment'] = df['Comment'].apply(preprocess_text)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def encode_comments(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output.last_hidden_state.mean(dim=1)
    return embeddings

embeddings = encode_comments(df['Comment'].tolist())

In [20]:
# Perform KMeans clustering
num_topics = 20
kmeans = KMeans(n_clusters=num_topics, random_state=0).fit(embeddings.numpy())
df['topic'] = kmeans.labels_

# Prepare texts and dictionary for coherence calculation
texts = [comment.split() for comment in df['Comment'].tolist()]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Find representative comments and topic words
def get_representative_comments(topic_idx, n_representative=10):
    indices = np.where(df['topic'] == topic_idx)[0]
    centroid = kmeans.cluster_centers_[topic_idx]
    distances = cosine_similarity([centroid], embeddings[indices].numpy())
    closest_indices = np.argsort(distances[0])[:n_representative]
    return [df.iloc[indices[i]]['Comment'] for i in closest_indices]

def get_most_frequent_words(comments, top_n=5):
    words = [
        word
        for comment in comments
        for word in word_tokenize(comment.lower())
        if word not in stop_words and word not in string.punctuation
    ]
    return [word for word, freq in Counter(words).most_common(top_n)]

def get_most_frequent_words_for_topic(topic_idx, top_n=5):
    topic_comments = df[df['topic'] == topic_idx]['Comment'].tolist()  # Get all comments for the topic
    words = [
        word
        for comment in topic_comments
        for word in word_tokenize(comment.lower())
        if word not in stop_words and word not in string.punctuation
    ]
    return [word for word, freq in Counter(words).most_common(top_n)]

topic_words = [get_most_frequent_words_for_topic(i) for i in range(num_topics)]

#topic_words = [get_most_frequent_words(get_representative_comments(i)) for i in range(num_topics)]

#coherance score issue: https://github.com/piskvorky/gensim/issues/3328
# Calculate coherence score
cm = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = cm.get_coherence()
print(f'Coherence Score: {coherence_score}')

Coherence Score: 0.34728591483563503
