# Clustering

In [None]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

import seaborn as sns
sns.set(color_codes=True)
%config InlineBackend.figure_formats = ['retina']

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

from gensim.models import Phrases
from gensim.models.phrases import Phraser

import spacy

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the input table for clustering from the working directory.
df = pd.read_csv("frame3.csv")

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Topic-probability columns used as clustering features throughout the notebook.
topics = ['Culture', 'UK', 'Crimes', 'Situational', 'Immigrants', 'Relationships', 'Politics']

# Average every topic column over ALL rows so the chart matches its
# "Entire Dataset" title (no rating_type filter is applied).
topic_means = df[topics].mean()

ax = sns.barplot(x=topic_means.index, y=topic_means)
ax.set_xticklabels(topics, rotation=40, ha='right')
ax.set_title('Mean Topic Probabilities Across The Entire Dataset')
# NOTE(review): ylim caps the axis at 0.5; any topic whose mean exceeds that is clipped — confirm.
ax.set(xlabel='Topics', ylabel='Mean Percentage per Transcript', ylim=(0, 0.5))

# K-means

In [None]:
# Build the feature matrix: the topic columns, standardised column-wise
# with StandardScaler. The last line displays its shape.
X = StandardScaler().fit_transform(df[topics])
X.shape

In [None]:
# Sweep k = 2..14: fit K-means on X for each k, recording the average
# silhouette score in temp_dict (k -> [score]) and the inertia in inertias.
temp_dict, inertias = {}, []
for n_clusters in range(2, 15):
    # random_state=1 pins the random generator seed so reruns give the same clustering.
    clusterer = KMeans(n_clusters=n_clusters, random_state=1).fit(X)
    cluster_labels = clusterer.labels_

    # silhouette_score averages the silhouette value over all samples,
    # giving a view of the density and separation of the formed clusters.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    temp_dict[n_clusters] = [silhouette_avg]

    inertia = clusterer.inertia_
    print("\tThe inertia is :", inertia)
    inertias.append(inertia)

In [None]:
# Plot the average silhouette score against the number of clusters.
sns.set(font_scale=1.2)
sns.set_style('ticks')

# temp_dict maps k -> [score]; transposing gives one row per k with the score in column 0.
s_scores = pd.DataFrame(temp_dict).T
ax = sns.lineplot(x=s_scores.index, y=s_scores[0], color='teal')
# Take the ticks from the evaluated k values so none is dropped
# (a fixed range(2, 14) would omit the tick for k = 14).
ax.set_xticks(list(s_scores.index))
ax.set_ylabel('Silhouette score')
ax.set_xlabel('Clusters')
ax.figure.tight_layout()

In [None]:
# Elbow plot: inertia for each cluster count tried in the sweep (2..14).
k_values = range(2, 15)
ax = sns.lineplot(x=k_values, y=inertias, color='teal')
ax.set(ylabel='SSE (inertia)', xlabel='Clusters')
# Fit the labels inside the figure area, then render.
ax.figure.tight_layout()
plt.show()

In [None]:
# Final K-means with 7 clusters on the current X; store each row's label on df.
# NOTE(review): X is rebound further down the notebook, so this cell relies on
# being run before that rebinding.
clusterer = KMeans(random_state=10, n_clusters=7)
clusterer.fit(X)
df['cluster_LDA'] = clusterer.labels_

In [None]:
# One 4x4-inch bar chart per cluster_LDA label, showing the mean of each
# topic column over the rows assigned to that cluster.
# The labels come from the data, so the loop does not repeat the hard-coded cluster count.
for cluster in sorted(df['cluster_LDA'].unique()):
    fig, ax1 = plt.subplots(1, 1, figsize=(4, 4))

    cluster_means = df.loc[df['cluster_LDA'] == cluster, topics].mean()
    # Draw on the axes created above rather than relying on the implicit current axes.
    ax = sns.barplot(x=cluster_means.index, y=cluster_means, ax=ax1)
    ax.set_xticklabels(topics, rotation=40, ha='right')
    ax.set_title(f'cluster: {cluster}')

In [None]:
# Number of rows assigned to each cluster_LDA label.
df['cluster_LDA'].value_counts()

# TF-IDF

### Clean text (build bigrams and trigrams, lemmatize, and keep only selected parts of speech)

In [None]:
# Tokenise every document once. Phrases expects an iterable of token lists;
# given raw strings it would iterate over characters and learn no word-level phrases.
# NOTE(review): assumes df.words holds whitespace-separated strings — confirm.
tokenized_docs = [text.split() for text in df.words]

# Build bigram and trigram Phrases objects on the token lists
bigram_phrases = Phrases(tokenized_docs, min_count=10)
trigram_phrases = Phrases(bigram_phrases[tokenized_docs], min_count=5)

# Create Phraser model objects for faster processing
bigram_model = Phraser(bigram_phrases)
trigram_model = Phraser(trigram_phrases)
trigrams = [trigram_model[bigram_model[tokens]] for tokens in tokenized_docs]

# Lemmatise with spaCy (parser and NER disabled) and keep only tokens whose
# part-of-speech tag is in allowed_postags.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
# nlp.pipe streams the documents through the pipeline in batches; output order matches input order.
lemmatized_words = [
    [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    for doc in nlp.pipe(" ".join(sent) for sent in trigrams)
]

In [None]:
# Build the TF-IDF matrix used as input to K-means.
def identity_tokenizer(text):
    """Return the input unchanged, so pre-tokenised documents pass through as-is."""
    return text


# lowercase=False leaves the tokens' case untouched; min_df / max_df are the
# vectorizer's document-frequency limits.
tfidf_options = dict(
    tokenizer=identity_tokenizer,
    lowercase=False,
    min_df=10,
    max_df=0.4,
)
tfidf = TfidfVectorizer(**tfidf_options)

# Rebinds X, which earlier in the notebook held the scaled topic matrix.
X = tfidf.fit_transform(lemmatized_words)
X.shape

In [None]:
# Cluster with different n's and score each with the silhouette metric.
# Results: temp_dict maps n -> [average silhouette], inertias collects the inertia per n.
temp_dict, inertias = {}, []
for n_clusters in range(2, 15):
    clusterer = KMeans(n_clusters=n_clusters, random_state=1).fit(X)
    cluster_labels = clusterer.labels_

    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    temp_dict[n_clusters] = [silhouette_avg]

    inertia = clusterer.inertia_
    print("\tThe inertia is :", inertia)
    inertias.append(inertia)

In [None]:
# Silhouette score per cluster count: one row per n, with the score in column 0.
s_scores = pd.DataFrame.from_dict(temp_dict, orient='index')
ax = sns.lineplot(x=s_scores.index, y=s_scores[0], color='teal')
# Tick every fourth cluster count (2, 6, 10, 14).
ax.set_xticks(range(2, 15, 4))
ax.set(ylabel='Silhouette score', xlabel='Clusters')
ax.figure.tight_layout()

In [None]:
# Elbow plot: inertia for each cluster count tried in the sweep (2..14).
k_values = range(2, 15)
ax = sns.lineplot(x=k_values, y=inertias, color='teal')
ax.set(ylabel='SSE (inertia)', xlabel='Clusters')
ax.figure.tight_layout()

In [None]:
# Final K-means with 7 clusters on the current X; store each row's label on df.
clusterer = KMeans(random_state=10, n_clusters=7)
clusterer.fit(X)
df['cluster_tfidf'] = clusterer.labels_

In [None]:
# One 4x4-inch bar chart per cluster_tfidf label, showing the mean of each
# topic column over the rows assigned to that cluster.
# The labels come from the data, so the loop does not repeat the hard-coded cluster count.
for cluster in sorted(df['cluster_tfidf'].unique()):
    fig, ax1 = plt.subplots(1, 1, figsize=(4, 4))

    cluster_means = df.loc[df['cluster_tfidf'] == cluster, topics].mean()
    # Draw on the axes created above rather than relying on the implicit current axes.
    ax = sns.barplot(x=cluster_means.index, y=cluster_means, ax=ax1)
    ax.set_xticklabels(topics, rotation=40, ha='right')
    ax.set_title(f'cluster: {cluster}')

In [None]:
# Number of rows assigned to each cluster_tfidf label.
df['cluster_tfidf'].value_counts()

In [None]:
# Write df to frame4.csv in the working directory, without the row index.
output_csv = "frame4.csv"
df.to_csv(output_csv, index=False)

In [5]:
import pandas as pd

In [9]:
# Reload a clustered dataset from disk, replacing the in-memory df.
# NOTE(review): this is an absolute, machine-specific Windows path and differs from
# the "frame4.csv" written above — confirm which file is intended and make it relative.
clustered_csv = r"D:\PROJECTS\transnlp\data\processed\processed_content_with_clusters.csv"
df = pd.read_csv(clustered_csv)

In [10]:
# Print the column names, non-null counts and dtypes of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   S No.                 500 non-null    int64  
 1   Tag                   500 non-null    object 
 2   URL                   500 non-null    object 
 3   Raw Transcript        500 non-null    object 
 4   Transcript            500 non-null    object 
 5   CleanTag              500 non-null    object 
 6   Year                  465 non-null    float64
 7   Names                 500 non-null    object 
 8   Title                 480 non-null    object 
 9   runtime               434 non-null    float64
 10  rating                425 non-null    float64
 11  language              500 non-null    object 
 12  preprocessed_content  500 non-null    object 
 13  rating_type           425 non-null    object 
 14  f_words               500 non-null    int64  
 15  s_words               5

In [11]:
# Show the first five rows of the reloaded frame.
df.head(n=5)

Unnamed: 0,S No.,Tag,URL,Raw Transcript,Transcript,CleanTag,Year,Names,Title,runtime,...,diversity_ratio,Culture,UK,Crimes,Situational,Immigrants,Relationships,Politics,cluster_LDA,cluster_tfidf
0,0,Michelle Buteau: Welcome to Buteaupia (2020) ...,https://scrapsfromtheloft.com/comedy/michelle-...,['Michelle Buteau’s Netflix special Welcome to...,michelle buteaus netflix special welcome to bu...,Michelle Buteau: Welcome to Buteaupia (2020),2020.0,Michelle Buteau,Welcome to Buteaupia,58.0,...,0.258535,0.110807,0.132198,0.006003,0.727972,0.020906,0.00097,0.001144,3,4
1,1,Theo Von: No Offense (2016) | Transcript,https://scrapsfromtheloft.com/comedy/theo-von-...,['Theo Von: No Offense was recorded at the Civ...,theo von no offense was recorded at the civic ...,Theo Von: No Offense (2016),2016.0,Theo Von,No Offense,67.0,...,0.321684,0.28331,0.000627,0.003217,0.596115,0.115468,0.000599,0.000663,5,6
2,2,Nate Bargatze’s Nashville Christmas (2024) | T...,https://scrapsfromtheloft.com/comedy/nate-barg...,['Nate Bargatze’s Nashville Christmas is a hea...,nate bargatzes nashville christmas is a heartw...,Nate Bargatze’s Nashville Christmas (2024),2024.0,Nate Bargatze’s,Nashville Christmas,61.0,...,0.363117,0.185778,0.001202,0.000758,0.630959,0.179455,0.000914,0.000934,5,3
3,3,"Your Friend, Nate Bargatze (2024) | Transcript",https://scrapsfromtheloft.com/comedy/your-frie...,"['Your Friend, Nate Bargatze (2024)\nGenre: Co...",your friend nate bargatze comedy standupdirec...,"Your Friend, Nate Bargatze (2024)",2024.0,Nate Bargatze,"Your Friend,",63.0,...,0.281297,0.038446,0.001057,0.003672,0.933593,0.020873,0.001378,0.00098,5,3
4,4,Ronny Chieng: Love to Hate It (2024) | Transcript,https://scrapsfromtheloft.com/comedy/ronny-chi...,"['[tuning]', '[gentle Hawaiian music playing o...",tuning gentle hawaiian music playing over radi...,Ronny Chieng: Love to Hate It (2024),2024.0,Ronny Chieng,Love to Hate It,65.0,...,0.328846,0.000747,0.00055,0.001193,0.463722,0.079922,0.453276,0.000589,6,3
