Datasource: https://github.com/anirudhshenoy/text-classification-small-datasets/tree/master/datasets

In [11]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import numpy as np
import string
import torch
import nltk

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Load your data
df = pd.read_csv('../Data/Final_Synthetic_Patient_Feedback_Dataset.csv')

# Drop rows with missing titles and preprocess
df.dropna(subset=['Patient Feedback'], inplace=True)
df.rename(columns={'Patient Feedback': 'Comment'}, inplace=True)
df.shape

(450, 1)

In [13]:
# Set up stop words and lemmatizer
stop_words = set(stopwords.words('english'))
custom_stop_words = set(string.digits + string.punctuation)
additional_stop_words = ['the', 'and', 'was', 'were', 'with', 'a', 'my', '``']
stop_words.update(custom_stop_words)
stop_words.update(additional_stop_words)
lemmatizer = WordNetLemmatizer()

In [14]:
# Function to preprocess text
def preprocess_text(text):
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator).lower()  # Remove punctuation and lowercase
    word_tokens = word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in word_tokens
        if word not in stop_words
    )

# Preprocess and encode comments
df['Comment'] = df['Comment'].apply(preprocess_text)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def encode_comments(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output.last_hidden_state.mean(dim=1)
    return embeddings

embeddings = encode_comments(df['Comment'].tolist())

In [15]:
# Define the range of num_topics and top_n values
num_topics_list = [3, 5, 10, 15, 20]
top_n_list = [5, 10, 15, 20]

# Stop words and punctuation removal
stop_words = set(stopwords.words("english"))

# Store coherence scores
coherence_scores = []

for num_topics in num_topics_list:
    kmeans = KMeans(n_clusters=num_topics, random_state=0, n_init=10).fit(embeddings)
    df["topic"] = kmeans.labels_

    # Prepare texts and dictionary for coherence calculation
    texts = [comment.split() for comment in df["Comment"].tolist()]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    def get_most_frequent_words_for_topic(topic_idx, top_n=5):
        """Get the most frequent words for a given topic."""
        topic_comments = df[df["topic"] == topic_idx]["Comment"].tolist()
        words = [
            word
            for comment in topic_comments
            for word in word_tokenize(comment.lower())
            if word not in stop_words and word not in string.punctuation
        ]
        return [word for word, freq in Counter(words).most_common(top_n)]

    for top_n in top_n_list:
        topic_words = [get_most_frequent_words_for_topic(i, top_n) for i in range(num_topics)]

        # Calculate coherence score
        cm = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence="c_v")
        coherence_score = cm.get_coherence()

        # Store results
        coherence_scores.append({
            "num_topics": num_topics,
            "top_n": top_n,
            "coherence": coherence_score
        })

# Convert results into a dataframe for better visualization
df_coherence = pd.DataFrame(coherence_scores)

In [16]:
# Display the dataframe
print(df_coherence)

    num_topics  top_n  coherence
0            3      5   0.905850
1            3     10   0.704385
2            3     15   0.375007
3            3     20   0.313978
4            5      5   0.947034
5            5     10   0.585847
6            5     15   0.419717
7            5     20   0.325357
8           10      5   0.904887
9           10     10   0.614634
10          10     15   0.465492
11          10     20   0.400925
12          15      5   0.848865
13          15     10   0.573744
14          15     15   0.445961
15          15     20   0.381808
16          20      5   0.892667
17          20     10   0.608621
18          20     15   0.471359
19          20     20   0.420186
