In [2]:
import pandas as pd

In [8]:
# prompt: read the patient_review_data.csv and store  in a df
# Read the CSV file into a DataFrame.
# Private data source
df = pd.read_csv('../Data/patient_review_data.csv')
df['Comment']

0      The nurses were exceptionally compassionate an...
1      I appreciated the clear communication from the...
2        The cleanliness of the hospital was remarkable.
3           My pain was managed well throughout my stay.
4                  I felt rushed during my consultation.
                             ...                        
295    Experiencing anger from a caregiver was both s...
296    The awful experience I had with the staff has ...
297    The delay in treatment had a harmful effect on...
298    I hate to say it, but the service here has det...
299    Witnessing cops within the hospital premises w...
Name: Comment, Length: 300, dtype: object

In [None]:
df = pd.read_csv('book_title_test_data.csv')
# Keep only the first 500 rows in df
df = df.iloc[:500]

# Rename 'title' column to 'Comment'
df.rename(columns={'title': 'Comment'}, inplace=True)
df['Comment']

0      15 Highly Important Questions About Adulthood,...
1      250 Nuns Just Cycled All The Way From Kathmand...
2      Australian comedians "could have been shot" du...
3      Lycos launches screensaver to increase spammer...
4      Fußball-Bundesliga 2008–09: Goalkeeper Butt si...
                             ...                        
495    '90s Cartoons With Human Eyes May Ruin Your Ch...
496    Wait, WTF Is "I Saw Mommy Kissing Santa Claus"...
497           First Women Win Seats in Kuwait Parliament
498                   Were You A Scene Kid Or An Emo Kid
499    Opposition leader takes early lead in Sierra L...
Name: Comment, Length: 500, dtype: object

In [9]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

import nltk
from collections import Counter
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import string

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to encode comments into embeddings
def encode_comments(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output.last_hidden_state.mean(dim=1)
    return embeddings

# Encode comments
embeddings = encode_comments(df['Comment'].tolist())


In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
import torch

In [13]:
# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Set of English stopwords and initialize lemmatizer
stop_words = set(stopwords.words('english'))
custom_stop_words = ['the', 'and', 'was', 'were', 'with', 'a', 'my', 'to']
stop_words.update(custom_stop_words)

lemmatizer = WordNetLemmatizer()

def encode_comments(texts):
    # Function to remove stopwords
    def remove_stopwords(text):
        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
        return " ".join(filtered_text)

    # Function to remove stopwords and lemmatize
    import string

    def preprocess_text(text):
        # Remove punctuation from text
        translator = str.maketrans('', '', string.punctuation)
        text = text.translate(translator).lower()  # Lowercasing the text here

        word_tokens = word_tokenize(text)
        filtered_and_lemmatized_text = [
            lemmatizer.lemmatize(word)
            for word in word_tokens
            if word not in stop_words
        ]
        return " ".join(filtered_and_lemmatized_text)

    # Remove stopwords from each comment
    cleaned_texts = [preprocess_text(text) for text in texts]

    # Tokenization and encoding
    encoded_input = tokenizer(cleaned_texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output.last_hidden_state.mean(dim=1)
    return embeddings

# Encode comments
embeddings = encode_comments(df['Comment'].tolist())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sajjadislam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# Perform clustering on embeddings (example: K-Means)
num_topics = 3  # Set the number of topics
kmeans = KMeans(n_clusters=num_topics, random_state=0).fit(embeddings.numpy())
df['topic'] = kmeans.labels_

# For Coherence Score, transform BERT embeddings to bag-of-words
texts = [comment.split() for comment in df['Comment'].tolist()]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Use KMeans centroids to find closest comments for each topic
def get_representative_comments(topic_idx, n_representative=10):
    indices = np.where(df['topic'] == topic_idx)[0]
    centroid = kmeans.cluster_centers_[topic_idx]
    distances = cosine_similarity([centroid], embeddings[indices].numpy())
    closest_indices = np.argsort(distances[0])[:n_representative]
    return [df.iloc[indices[i]]['Comment'] for i in closest_indices]

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [15]:
def get_most_frequent_words(comments, top_n=3):
    # Tokenize words, convert to lower case, and filter out stop words and punctuation
    words = [word for comment in comments
             for word in word_tokenize(comment.lower())
             if word not in stop_words and word not in string.punctuation]

    # Find the most common words
    most_common = [word for word, freq in Counter(words).most_common(top_n)]
    return most_common

# Create lists of most frequent words representing each topic
topic_words = [get_most_frequent_words(get_representative_comments(i)) for i in range(num_topics)]
topic_words

[['insulted', 'staff', 'hipaa'],
 ['staff', 'unresolved', 'behavior'],
 ['time', 'care', 'doctors']]

In [16]:
from gensim.models.coherencemodel import CoherenceModel

cm = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v')
#cm = CoherenceModel(topics=topic_words, dictionary=dictionary, coherence='u_mass')
coherence_score = cm.get_coherence()
print(f'Coherence Score: {coherence_score}')


Coherence Score: 0.6358999695610196
