# NLP Grouping

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer 
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD

In [None]:
# Load the cleaned dataset CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('./ready_for_grouping/cleaned_data_31OCT.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Text column that will be vectorized and clustered.
# NOTE(review): assumes 'primary_cause' contains only strings (no NaN) — confirm,
# since the vectorizers below are fitted on it directly.
X = df['primary_cause']

In [None]:
# TF-IDF vectorizer that drops scikit-learn's built-in English stop words;
# every other setting is left at its default.
tf = TfidfVectorizer(stop_words='english')

In [None]:
# Fit the vectorizer on the text and build the TF-IDF document-term matrix
# (one row per entry of X).
X_transformed_tf = tf.fit_transform(X)

In [None]:
# Inspect the vocabulary terms learned by the TF-IDF vectorizer.
tf.get_feature_names_out()

In [None]:
# Dense DataFrame view of the TF-IDF matrix with one column per vocabulary term.
# todense() materialises the whole matrix in memory.
words_tf = pd.DataFrame(X_transformed_tf.todense(),columns=tf.get_feature_names_out())

In [None]:
# Total TF-IDF weight of each term across all rows, largest first.
words_tf.sum().sort_values(ascending=False)

In [None]:
# Sweep k = 2..20 for k-means on the TF-IDF matrix. For every k we keep the
# inertia (for the elbow plot) and the silhouette score of the resulting labels.
inertia_list_tf, score_list_tf = [], []
for k in range(2, 21):
    # fit() returns the estimator itself, so construction and fitting can chain.
    kmeans = KMeans(n_clusters=k, random_state=44).fit(X_transformed_tf)
    inertia_list_tf.append(kmeans.inertia_)
    score_list_tf.append(
        metrics.silhouette_score(X_transformed_tf, kmeans.labels_)
    )

In [None]:
# Elbow plot: k-means inertia for each k in 2..20 on the TF-IDF features.
plt.plot(range(2, 21), inertia_list_tf, marker='o')
plt.xlabel('number of Clusters')
# The plotted quantity is inertia, so label the axis accordingly
# (it was previously labelled 'Score').
plt.ylabel('Inertia')
plt.title('inertia');

In [None]:
# Silhouette score for each k in 2..20 on the TF-IDF features.
plt.plot(range(2, 21), score_list_tf, marker='o')
plt.title('Silhouette Scores')
plt.ylabel('Score')
# Trailing semicolon suppresses the notebook's echo of the returned Text object.
plt.xlabel('number of Clusters');

In [None]:
# Raw-count vectorizer over unigrams and bigrams (ngram_range=(1, 2)), with
# scikit-learn's built-in English stop words removed.
cvt = CountVectorizer(stop_words='english',ngram_range=(1,2))

In [None]:
# Fit the count vectorizer on the same text and build the count matrix.
X_transformed_cvt = cvt.fit_transform(X)

In [None]:
# Find the best k for k-means on the CountVectorizer (unigram + bigram) features.
# For each k in 2..20, record the inertia (for the elbow plot) and the
# silhouette score of the fitted labels.
score_list_cvt = []
inertia_list_cvt = []
for k in range(2, 21):
    kmeans = KMeans(n_clusters=k, random_state=44)
    kmeans.fit(X_transformed_cvt)
    inertia_list_cvt.append(kmeans.inertia_)
    # Score the labels against the matrix the model was fitted on. Using the
    # TF-IDF matrix here would evaluate the clusters in a different feature
    # space and make the scores meaningless for this vectorizer.
    score_list_cvt.append(metrics.silhouette_score(X_transformed_cvt, kmeans.labels_))

In [None]:
# Elbow plot: k-means inertia for each k in 2..20, read from inertia_list_cvt.
plt.plot(range(2, 21), inertia_list_cvt, marker='o')
plt.xlabel('number of Clusters')
# The plotted quantity is inertia, so label the axis accordingly
# (it was previously labelled 'Score').
plt.ylabel('Inertia')
plt.title('inertia');

In [None]:
# Silhouette score for each k in 2..20, read from score_list_cvt.
plt.plot(range(2, 21), score_list_cvt, marker='o')
plt.title('Silhouette Scores')
plt.ylabel('Score')
# Trailing semicolon suppresses the notebook's echo of the returned Text object.
plt.xlabel('number of Clusters');

In [None]:
# Dimensionality reduction with TruncatedSVD (a.k.a. LSA). Unlike PCA it does
# not centre the data, so it can work on sparse matrices directly.
# random_state is pinned (same seed as the KMeans runs) because the default
# randomized solver would otherwise give different components on every run.
ts = TruncatedSVD(n_components=100, random_state=44)

In [None]:
# Fit the SVD on the count matrix and project it onto the reduced components.
X_transformed_cvt_ts = ts.fit_transform(X_transformed_cvt)

In [None]:
# Repeat the k = 2..20 sweep on the SVD-reduced count features.
# NOTE: this reuses (and overwrites) score_list_cvt / inertia_list_cvt, so the
# plots that follow show the SVD results, not the raw CountVectorizer ones.
score_list_cvt = []
inertia_list_cvt = []
for k in range(2, 21):
    kmeans = KMeans(n_clusters=k, random_state=44)
    kmeans.fit(X_transformed_cvt_ts)
    inertia_list_cvt.append(kmeans.inertia_)
    # Score the labels against the matrix the model was fitted on. Using the
    # TF-IDF matrix here would evaluate the clusters in a different feature
    # space and make the scores meaningless for the reduced features.
    score_list_cvt.append(metrics.silhouette_score(X_transformed_cvt_ts, kmeans.labels_))

In [None]:
# Elbow plot: k-means inertia for each k in 2..20, read from inertia_list_cvt
# (at this point refilled by the sweep on the SVD-reduced features).
plt.plot(range(2, 21), inertia_list_cvt, marker='o')
plt.xlabel('number of Clusters')
# The plotted quantity is inertia, so label the axis accordingly
# (it was previously labelled 'Score').
plt.ylabel('Inertia')
plt.title('inertia');

In [None]:
# Silhouette score for each k in 2..20, read from score_list_cvt
# (at this point refilled by the sweep on the SVD-reduced features).
plt.plot(range(2, 21), score_list_cvt, marker='o')
plt.title('Silhouette Scores')
plt.ylabel('Score')
# Trailing semicolon suppresses the notebook's echo of the returned Text object.
plt.xlabel('number of Clusters');

In [None]:
# Final model. The author judged the best grouping to come from the
# TfidfVectorizer features with k = 5; the seed matches the one used in the sweeps.
km_final = KMeans(n_clusters=5,random_state=44)

In [None]:
# Fit the final 5-cluster model on the TF-IDF matrix.
km_final.fit(X_transformed_tf)

In [None]:
# Sanity check: number of cluster labels produced (compare with the row count of df).
len(km_final.labels_)

In [None]:
# Sanity check: the row count of df should equal the number of labels.
df.shape

In [None]:
# Attach each row's cluster id as a new column. labels_ follows the row order of
# the matrix the model was fitted on, which was built from df['primary_cause'],
# so the positional assignment lines up with df.
df['text_group'] = km_final.labels_

In [None]:
# Cluster sizes: how many rows fell into each text group.
df['text_group'].value_counts()

In [None]:
# Write the labelled dataset to disk; the path is relative to the notebook's
# working directory.
# NOTE(review): index=False is not passed, so the DataFrame index is written as
# an extra leading column — confirm downstream readers expect that.
df.to_csv('./grouped/cleaned_data_text_grouped.csv')