## Comments Analysis

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

# Directory that holds the precomputed embedding .npy file loaded below.
# NOTE(review): both paths are machine-specific absolute locations; anyone
# re-running this notebook on another machine has to edit them here.
EMBED_PATH = "E:/"
# Directory that holds merged_comments.csv and the processed-comments text file.
DATA_PATH = "C:/Users/doosti/Dropbox (Chapman)/Research/Research Projects/Fitness/Data/"

In [3]:
# Load the data
# The processed-text file is joined to the CSV purely by position: line i of
# the text file is assigned to the i-th row that has a non-null comment_text.
# NOTE(review): this assumes both files were produced from the same rows in
# the same order -- confirm against the preprocessing step.
with open(os.path.join(DATA_PATH, "processed_comments_102423.txt"), "r", encoding="utf-8") as f:
    processed_docs = f.readlines()
comments = pd.read_csv(os.path.join(DATA_PATH, "merged_comments.csv"))
comments = comments[comments.comment_text.notnull()].copy()

# Fail early, with an explicit message, when the two sources are not
# row-aligned (pandas would otherwise raise a less specific ValueError).
if len(processed_docs) != len(comments):
    raise ValueError(
        "processed_comments_102423.txt has {} lines but merged_comments.csv has {} "
        "non-null comments; the files are not row-aligned".format(
            len(processed_docs), len(comments)
        )
    )

# Strip digit runs from every processed comment. The raw string avoids the
# invalid "\d" escape warning that a plain string literal triggers.
comments['processed_text'] = [re.sub(r"\d+", "", x.strip()) for x in processed_docs]
# Number of comma-separated tokens in each processed comment.
comments['length'] = comments.processed_text.apply(lambda x: len(x.split(',')))
# Keep only comments with more than 10 tokens.
comments['include'] = comments.length > 10
comments = comments[comments.include].copy()
print(comments.shape)

(221979, 11)


In [11]:
# Load the embeddings
# Alternative embedding files, kept for reference (uncomment one to switch):
#embed_file = "bert_embeddings_221979docs_sentence_lowercase_071123.npy"
#embed_file = "bert_embeddings_221979docs_sentence_original_071123.npy"
embed_file = "bert_embeddings_221979docs_sentence_tokens_071123.npy"

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code from an untrusted file. If the array is a
# plain numeric array, allow_pickle=False should be sufficient -- confirm
# before changing.
embeddings = np.load(os.path.join(EMBED_PATH, embed_file), allow_pickle=True)
# Printed shape is (rows, embedding dimension).
print(embeddings.shape)

(221979, 768)


In [12]:
# KMeans clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit the model.
# n_init is passed explicitly: leaving it unset emits a FutureWarning because
# the library default is changing from 10 to 'auto'. Pinning it to 10 keeps
# the current behaviour and makes the clustering stable across versions.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42).fit(embeddings)
# One cluster id per embedding row.
labels = kmeans.labels_
print(labels.shape)

# Get the silhouette score (last expression, so the notebook displays it).
# This is computed over every row and relies on pairwise distances, so it is
# slow on the full embedding matrix; silhouette_score accepts sample_size /
# random_state if an estimate on a subsample is acceptable.
silhouette_score(embeddings, labels)


  super()._check_params_vs_input(X, default_n_init=10)


(221979,)


In [None]:
# Visualize the clusters
from sklearn.decomposition import PCA

# random_state is fixed so the 2-D projection is reproducible: with the
# default svd_solver='auto', scikit-learn may pick the randomized solver for
# an input of this size, and that solver is stochastic unless seeded.
pca = PCA(n_components=2, random_state=42)

# Fit the model and project the embeddings onto the first two components.
X = pca.fit_transform(embeddings)
print(X.shape)

# Plot the clusters: one point per embedding row, coloured by KMeans label.
plt.figure(figsize=(10,10))
plt.scatter(X[:,0], X[:,1], c=labels, cmap='tab20')
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("PCA projection of comment embeddings (colour = KMeans cluster)")
plt.show()

# Get the top words in each cluster
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Get the top words in each cluster
def get_top_words(X, labels, n=10):
    """Return the ``n`` highest-weighted TF-IDF terms for each cluster.

    Parameters
    ----------
    X : iterable of str
        Documents, one string per row, in the same order as ``labels``.
    labels : array-like
        Cluster id of each document; must have one entry per document.
    n : int, default 10
        Number of terms to return per cluster.

    Returns
    -------
    list of list of str
        One list of terms per cluster, in the order of ``np.unique(labels)``,
        each sorted from highest to lowest summed TF-IDF weight.

    Raises
    ------
    ValueError
        If the number of documents differs from the number of labels.
    """
    labels = np.asarray(labels)

    # Term counts first, then TF-IDF re-weighting of those counts.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(X)
    if counts.shape[0] != len(labels):
        raise ValueError(
            "got {} documents but {} labels".format(counts.shape[0], len(labels))
        )
    weights = TfidfTransformer().fit_transform(counts)

    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is its replacement.
    words = vectorizer.get_feature_names_out()

    top_words = []
    for cluster in np.unique(labels):
        # Row positions of the documents assigned to this cluster.
        idx = np.flatnonzero(labels == cluster)
        # Sum TF-IDF weights over the cluster's documents, as a flat 1-D
        # array (works for both sparse matrices and sparse arrays).
        cluster_words = np.asarray(weights[idx, :].sum(axis=0)).ravel()
        # Column indices of the n largest summed weights, descending.
        top_idx = np.argsort(cluster_words)[::-1][:n]
        # str() keeps the result a list of plain Python strings.
        top_words.append([str(words[i]) for i in top_idx])
    return top_words

# Compute the ten highest-weighted terms per cluster and list them by cluster.
top_words = get_top_words(comments.processed_text, labels, n=10)
for cluster_id, cluster_terms in enumerate(top_words):
    print("Cluster {}: {}".format(cluster_id, cluster_terms))

In [None]:
# t-SNE
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)

# Embed the vectors into two dimensions (fit and transform in one step).
X = tsne.fit_transform(embeddings)
print(X.shape)

# Scatter the 2-D embedding, one point per row, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='tab20')
plt.show()

In [4]:
# Inspect the first retained processed comment (a comma-separated token string).
comments.processed_text.iloc[0]

'person,read,comment,wish,great,success,health,love,happiness,lot,positive,energy'