In [1]:
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print(device)

cuda


In [2]:
import torch
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import string
from collections import Counter 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Ensure GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Step 1: Load the dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Step 2: Create a DataFrame
df = pd.DataFrame({
    'Text': newsgroups.data,
    'Category': [newsgroups.target_names[label] for label in newsgroups.target]
})

df.dropna(subset=['Text'], inplace=True)

# Step 3: Count words and filter by word count
df['WordCount'] = df['Text'].apply(lambda x: len(x.split()))
lower_threshold = df['WordCount'].quantile(0.1)
upper_threshold = df['WordCount'].quantile(0.9)
df = df[(df['WordCount'] > lower_threshold) & (df['WordCount'] < upper_threshold)]

# Randomly sample 500 rows (use the actual dataset size if it's smaller than 500)
#df = df.sample(n=min(10000, len(df)), random_state=42)

df.rename(columns={'Text': 'Comment'}, inplace=True)

In [5]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to encode comments into embeddings
def encode_comments_in_batches(texts, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
        embeddings = model_output.last_hidden_state.mean(dim=1).cpu()
        all_embeddings.append(embeddings)
    # Concatenate all embeddings
    return torch.cat(all_embeddings, dim=0)

# Optional: Minimal text cleaning (if necessary)
def clean_text(text):
    return text.replace('\n', ' ').strip()

# Apply minimal cleaning
cleaned_texts = [clean_text(text) for text in df['Comment']]
# Encode comments
embeddings = encode_comments_in_batches(cleaned_texts, batch_size=16)



In [30]:
# Step 4: Perform clustering on embeddings
num_topics = 40  # Set the number of topics
kmeans = KMeans(n_clusters=num_topics, random_state=0).fit(embeddings.numpy())
df['topic'] = kmeans.labels_

# Step 5: Calculate coherence score
# Transform text to bag-of-words format for coherence calculation
texts = [comment.split() for comment in df['Comment'].tolist()]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Function to get representative comments for a topic
def get_representative_comments(topic_idx, n_representative=150):
    indices = np.where(df['topic'] == topic_idx)[0]
    centroid = kmeans.cluster_centers_[topic_idx]
    distances = cosine_similarity([centroid], embeddings[indices].numpy())
    closest_indices = np.argsort(distances[0])[:n_representative]
    return [df.iloc[indices[i]]['Comment'] for i in closest_indices]

# Function to find most frequent words in comments
def get_most_frequent_words(comments, top_n=15):
    words = [word for comment in comments for word in comment.split()
             if word not in string.punctuation]
    most_common = [word for word, freq in Counter(words).most_common(top_n)]
    return most_common

# Create lists of most frequent words representing each topic
topic_words = [get_most_frequent_words(get_representative_comments(i)) for i in range(num_topics)]

# Calculate coherence score
cm = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = cm.get_coherence()
print(f'Coherence Score: {coherence_score}')

Coherence Score: 0.3516313814445421


top 5

topic 3 = 0.5119179329036139

topic 5 = 0.4898835056685732

topic 10 = 0.5281536171510373

topic 15 = 0.5174468094419998

topic 20 = 0.5182541863526895

topic 25 = 0.5206877165695638

topic 30 = 0.5206104688853044

topic 35 = 0.5220296994075871

topic 40 = 0.5216745726384806

topic 45 = 0.5201528389645848


top 10

topic 3 = 0.34896567061446193

topic 5 = 0.3798411251147066

topic 10 = 0.3882506355244847

topic 15 = 0.4012699827936591

topic 20 = 0.4006519914742827

topic 25 = 0.4045064945610071

topic 30 = 0.40204047823245587

topic 35 = 0.405453692537197

topic 40 = 0.40523946667548855

topic 45 = 0.4037661455376878


top 15


topic 5 = 0.3252013509109818

topic 10 = 0.33315294315792754

topic 20 = 0.34223336469297483

topic 30 = 0.3479745209290377

topic 40 = 0.3516313814445421
