In [11]:
import pandas as pd

# Read the sample CSV into a DataFrame and preview its first two rows.
url = "sample_text_1000.csv"
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=2)


Unnamed: 0,text
0,Proper rest speeds up recovery
1,Reading books improves vocabulary and knowledge


<h3 align="center" style="color: blue;">Text Preprocessing</h3>


## Step 1 : Tokenization

In [12]:
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk



# Lowercase the raw text before tokenizing so that every token is lowercase.
lowercase_text = df["text"].str.lower()

# Split each lowercased text into a list of word tokens and keep it next to the
# original text; only the last expression of a cell is displayed, so the preview
# is done once, here.
df['tokens'] = lowercase_text.apply(word_tokenize)
df['tokens'].head(5)

0                 [proper, rest, speeds, up, recovery]
1    [reading, books, improves, vocabulary, and, kn...
2    [reading, books, improves, vocabulary, and, kn...
3    [a, balanced, diet, is, important, for, good, ...
4    [cricket, fans, celebrated, the, victory, loudly]
Name: tokens, dtype: object

## Step 2: Stopword Removal

In [13]:
from nltk.corpus import stopwords


# Set membership keeps the per-token stopword lookup cheap.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(words):
    """Return the tokens of ``words`` that are purely alphabetic and not stopwords."""
    kept = []
    for word in words:
        if not word.isalpha():
            continue
        if word in stop_words:
            continue
        kept.append(word)
    return kept


df['filtered_words'] = df['tokens'].apply(_drop_stopwords)

print('after stopword removal : ', df['filtered_words'])
df.head(3)

after stopword removal :  0                       [proper, rest, speeds, recovery]
1      [reading, books, improves, vocabulary, knowledge]
2      [reading, books, improves, vocabulary, knowledge]
3              [balanced, diet, important, good, health]
4           [cricket, fans, celebrated, victory, loudly]
                             ...                        
995                         [resort, breathtaking, view]
996       [new, smartphone, features, advanced, cameras]
997    [classroom, discussions, encourage, critical, ...
998    [educational, institutions, must, adapt, techn...
999             [hotel, offered, excellent, hospitality]
Name: filtered_words, Length: 1000, dtype: object


Unnamed: 0,text,tokens,filtered_words
0,Proper rest speeds up recovery,"[proper, rest, speeds, up, recovery]","[proper, rest, speeds, recovery]"
1,Reading books improves vocabulary and knowledge,"[reading, books, improves, vocabulary, and, kn...","[reading, books, improves, vocabulary, knowledge]"
2,Reading books improves vocabulary and knowledge,"[reading, books, improves, vocabulary, and, kn...","[reading, books, improves, vocabulary, knowledge]"


## Step 3 : Stemming

In [14]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()


def _stem_tokens(words):
    """Apply the Porter stemmer to every token in ``words`` and return the stems as a list."""
    return list(map(ps.stem, words))


df['stemmed_words'] = df['filtered_words'].apply(_stem_tokens)

df.head(3)

Unnamed: 0,text,tokens,filtered_words,stemmed_words
0,Proper rest speeds up recovery,"[proper, rest, speeds, up, recovery]","[proper, rest, speeds, recovery]","[proper, rest, speed, recoveri]"
1,Reading books improves vocabulary and knowledge,"[reading, books, improves, vocabulary, and, kn...","[reading, books, improves, vocabulary, knowledge]","[read, book, improv, vocabulari, knowledg]"
2,Reading books improves vocabulary and knowledge,"[reading, books, improves, vocabulary, and, kn...","[reading, books, improves, vocabulary, knowledge]","[read, book, improv, vocabulari, knowledg]"


<h3 align="center" style="color: blue;">Vectorization</h3>


In [15]:
# Keep only what is left after removing the raw text and the intermediate
# preprocessing columns.
intermediate_columns = ["text", "tokens", "filtered_words"]
X = df.drop(columns=intermediate_columns)

X.head(2)

Unnamed: 0,stemmed_words
0,"[proper, rest, speed, recoveri]"
1,"[read, book, improv, vocabulari, knowledg]"


In [16]:
## converting stemmed words into text

# Derive X from df in a single chain instead of mutating X in place and then
# dropping "stemmed_words" from it: the cell can now be re-run without a
# KeyError on the already-dropped column.
X = (
    df.drop(columns=["text", "tokens", "filtered_words", "stemmed_words"])
      # Join each list of stems into one space-separated string for the vectorizer.
      .assign(stem_text=df["stemmed_words"].str.join(" "))
)
X.head(2)

Unnamed: 0,stem_text
0,proper rest speed recoveri
1,read book improv vocabulari knowledg


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary / IDF weights from the stemmed texts and turn them into
# a TF-IDF matrix in one step.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(raw_documents=X["stem_text"])


<h3 align="center" style="color: blue;">Model Training</h3>


In [18]:
from sklearn.cluster import KMeans

# Fit K-Means on the TF-IDF matrix; random_state is fixed so the run is repeatable.
model = KMeans(
    n_clusters=23,
    init="k-means++",
    random_state=42,
).fit(X_tfidf)

# Attach each text's cluster label and show how many texts each cluster received.
X['cluster'] = model.labels_
cluster_sizes = X['cluster'].value_counts()
cluster_sizes


cluster
16    248
7      72
12     66
11     55
5      45
2      45
15     43
4      41
18     40
13     35
1      29
8      27
10     27
6      25
3      25
20     24
14     23
17     23
22     22
0      22
21     22
19     21
9      20
Name: count, dtype: int64

<h3 align="center" style="color: blue;"> Accuracy Analysis</h3>


In [19]:
from sklearn.metrics import silhouette_score

# Silhouette score of the fitted cluster labels, computed on the TF-IDF matrix.
score = silhouette_score(X=X_tfidf, labels=model.labels_)
print("Silhouette Score:", score)

Silhouette Score: 0.5144055368646414


## Insights 
1. We tried several values of k and chose one whose silhouette score exceeds 0.5, which is considered a very good score.
2. Cluster sizes are reasonably balanced (20–72 texts each), except for cluster 16, which holds 248 of the 1,000 texts.
3. This means K=23 is not overfitting.

## View sample texts from each cluster

In [20]:
# Walk the clusters in ascending label order (groupby sorts its keys) and print
# up to five stemmed texts from each, in their original row order.
for c, members in X.groupby('cluster')['stem_text']:
    print(f"\n===== Cluster {c} =====\n")
    sample_texts = members.head(5)
    print("\n".join(sample_texts))



===== Cluster 0 =====

beach destin peac beauti
beach destin peac beauti
beach destin peac beauti
beach destin peac beauti
beach destin peac beauti

===== Cluster 1 =====

virtual realiti becom realist
virtual realiti becom realist
virtual realiti becom realist
virtual realiti becom realist
virtual realiti becom realist

===== Cluster 2 =====

trip itinerari includ mani excit spot
footbal match excit full energi
trip itinerari includ mani excit spot
trip itinerari includ mani excit spot
trip itinerari includ mani excit spot

===== Cluster 3 =====

resort breathtak view
resort breathtak view
resort breathtak view
resort breathtak view
resort breathtak view

===== Cluster 4 =====

student need proper guidanc succeed
student need proper guidanc succeed
student need proper guidanc succeed
student need proper guidanc succeed
assign help student practic concept

===== Cluster 5 =====

hydrat essenti bodi function
hydrat essenti bodi function
cloud comput essenti modern applic
hydrat essenti