<a href="https://colab.research.google.com/github/souradipta93/NLP/blob/main/word_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re

In [2]:
# Read 'bbctext.csv' (path is relative to the working directory) into a DataFrame.
data1 = pd.read_csv('bbctext.csv')
# Show (rows, columns) as a quick sanity check that the load succeeded.
data1.shape

(2225, 2)

In [3]:
# List the column labels; later cells rely on 'category' and 'text'.
data1.columns

Index(['category', 'text'], dtype='object')

In [4]:
import nltk
#nltk.download()

In [7]:
# Load NLTK's English stop-word list. Nothing is removed here; the actual
# filtering happens later when the corpus is cleaned and vectorised.
import nltk
# Make sure the stop-word corpus is available locally before reading it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stored as a set, which the cleaning loop uses for membership tests.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#Adding custom stop words
# Extra words to ignore on top of NLTK's list: generic filler words plus
# markup/URL fragments ('br', 'div', 'http', 'imgur', ...).
new_words = [
    "some", "one", "like", "time", "br", "could", "good", "even", "get", "would",
    "make", "really", "see", "well", "much", "great", "first", "people", "also", "bad",
    "show", "way", "thing", "made", "go", "think", "know", "look", "many", "seen",
    "http", "com", "code", "gt", "lt", "li", "ol", "ul", "image", "stack", "imgur", "div",
]
# union() builds a new set and leaves duplicates out, so re-running this cell
# yields the same stop_words every time.
stop_words = stop_words.union(new_words)

In [9]:
#Lemmatisation
from nltk.stem.wordnet import WordNetLemmatizer

In [10]:
# Coerce every entry to str so the regex-based cleaning below never receives a
# non-string value (such as a missing field read as NaN).
data1['text'] = data1['text'].astype(str)

In [12]:
# WordNet data is required by the lemmatizer.
nltk.download('wordnet')

# Build the lemmatizer and the regex once; both are loop-invariant.
lm = WordNetLemmatizer()
non_letters = re.compile('[^a-zA-Z]')

# One cleaned, space-joined string per input document, in row order.
corpus = []
# Iterate over the column values directly instead of data1['text'][i], which is
# a label lookup and only works while the index is the default 0..n-1 range.
for raw_text in data1['text']:
    # Replace everything that is not an ASCII letter with a space, lowercase,
    # then split on whitespace into tokens.
    tokens = non_letters.sub(' ', raw_text).lower().split()
    # Drop stop words first, then lemmatize what is left.
    kept = [lm.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(" ".join(kept))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [13]:
# Spot-check a single cleaned document to eyeball the preprocessing result.
corpus[300]

'star pay tribute actor davis hollywood star including spike lee burt reynolds oscar nominee alan alda paid tribute actor ossie davis funeral new york veteran star ossie davis known civil right activist died miami age february friend family including actress ruby dee wife year gathered riverside church saturday present service former u president bill clinton singer harry belafonte gave eulogy president united state said mr clinton gave gave year old found dead last weekend hotel room florida making film police said appeared died natural cause davis acting debut starring sidney poiter frequently collaborated director spike lee starring seven lee film including jungle fever right malcolm x attallah shabazz daughter activist malcolm x recalled famous eulogy delivered davis father funeral harlem come bid farewell finest hope said quoting man knew uncle ossie ditto ossie hero still said aviator star alan alda family friend forty year ossie beauty want badly someday dignity little anyway add

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
## Applying tf-idf vectoriser to the corpus

from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the tf-idf model (it is only constructed here; fitting happens in
# the next cell).
#   * stop_words: scikit-learn documents this parameter as 'english', a list or
#     None, and recent releases reject a set, so pass a list. sorted() also
#     makes the order deterministic.
#   * ngram_range=(1, 2): use both unigrams and bigrams as features.
#   * min_df=0.0005: a float is read as a proportion of documents, so terms
#     below that document frequency are dropped from the vocabulary.
tfv = TfidfVectorizer(
    stop_words=sorted(stop_words),
    ngram_range=(1, 2),
    min_df=0.0005,
)

In [16]:
# Tokenize the corpus, learn the vocabulary and idf weights, and return the
# document-term tf-idf matrix in a single step.
tfmat = tfv.fit_transform(corpus)
# (number of documents, number of features)
tfmat.shape

(2225, 76608)

In [17]:
# Map column index -> term. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back only on older releases. tolist() keeps feature_names a plain list,
# as it was with the old API.
if hasattr(tfv, "get_feature_names_out"):
    feature_names = tfv.get_feature_names_out().tolist()
else:
    feature_names = tfv.get_feature_names()

In [18]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfmat)

clusters = km.labels_.tolist()

CPU times: user 1min 12s, sys: 15.6 s, total: 1min 27s
Wall time: 1min 10s


In [19]:
# Put the raw text, its category label and the assigned k-means cluster side
# by side so the clustering can be compared with the categories.
df = data1[['text', 'category']].assign(cluster=clusters)
df.head()

Unnamed: 0,text,category,cluster
0,tv future in the hands of viewers with home th...,tech,4
1,worldcom boss left books alone former worldc...,business,2
2,tigers wary of farrell gamble leicester say ...,sport,1
3,yeading face newcastle in fa cup premiership s...,sport,1
4,ocean s twelve raids box office ocean s twelve...,entertainment,0


In [20]:
# Number of documents assigned to each cluster, largest first.
df['cluster'].value_counts()

2    614
1    520
4    391
3    350
0    350
Name: cluster, dtype: int64

In [21]:
# Cross-tabulate cluster against category. normalize='index' turns each row
# into shares that sum to 1, i.e. the category mix inside every cluster.
cluster_category_share = pd.crosstab(
    index=df['cluster'],
    columns=df['category'],
    values=df['cluster'],
    aggfunc='count',
    normalize='index',
)
# Round for display only.
ct_cluster = cluster_category_share.round(3)
ct_cluster

category,business,entertainment,politics,sport,tech
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.989,0.0,0.003,0.009
1,0.0,0.008,0.027,0.96,0.006
2,0.816,0.049,0.086,0.018,0.031
3,0.009,0.0,0.989,0.0,0.003
4,0.015,0.015,0.01,0.0,0.959


In [22]:
# For every centroid, sort the feature indices by weight (ascending along the
# feature axis), then flip each row so the heaviest features come first.
ascending_feature_ids = km.cluster_centers_.argsort(axis=1)
order_centroids = ascending_feature_ids[:, ::-1]
order_centroids

array([[24535,  4574,  6175, ..., 47733, 47732,     0],
       [27088, 21377, 49810, ..., 46772, 46771, 38303],
       [57441,  7157, 12844, ..., 36241, 36240, 76607],
       [43304, 36219, 20710, ..., 46403, 46402,     0],
       [42365, 49024, 27088, ..., 43590, 43589, 38303]])

In [23]:
# Wrap the ranked feature indices in a DataFrame: one row per cluster, with
# column k holding the index of that cluster's k-th heaviest feature.
df_centroids = pd.DataFrame(order_centroids)
df_centroids.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,76568,76569,76570,76571,76572,76573,76574,76575,76576,76577,76578,76579,76580,76581,76582,76583,76584,76585,76586,76587,76588,76589,76590,76591,76592,76593,76594,76595,76596,76597,76598,76599,76600,76601,76602,76603,76604,76605,76606,76607
0,24535,4574,6175,64387,5119,47056,725,43602,75793,57441,1843,10907,24211,62119,63136,70457,770,18416,12482,45353,45636,43177,51620,69126,44673,70806,43723,62157,56663,7427,32651,7728,8141,56742,74731,51838,30923,7718,10093,10360,...,47696,47694,47693,47692,47690,47689,47688,47727,47729,47730,47746,47759,47758,47757,47756,47755,47754,47753,47752,47751,47750,47749,47748,47747,47745,47731,47744,47743,47742,47741,47740,47739,47738,47737,47736,47735,47734,47733,47732,0
1,27088,21377,49810,74562,57441,40458,67406,15655,24721,12000,33392,75793,10514,73178,49656,61634,59734,75324,12114,4770,11019,34294,57049,59817,36569,28305,70584,29662,37197,62344,26313,72616,42169,3689,46279,46528,76111,62364,74505,53287,...,46734,46733,46732,46731,46730,46729,46728,46727,46764,46766,46812,46767,46811,46810,46808,46807,46806,46805,46804,46798,46797,46796,46795,46794,46793,46791,46790,46789,46788,46787,46783,46782,46781,46780,46778,46776,46774,46772,46771,38303
2,57441,7157,12844,75793,25036,5180,40137,58589,29374,61115,20321,43304,51415,46241,53635,51966,28688,20229,11201,21985,14894,2629,19001,8828,14727,44673,42737,76483,64610,16525,56402,1745,65105,36569,70806,32921,22615,29216,34993,24821,...,36226,36225,36222,36263,36221,36220,63434,36218,36217,36216,36215,63435,36213,36212,36236,36237,36238,36239,36262,63432,36260,36258,36257,36256,36255,36254,36253,36252,36251,36249,36248,36247,36246,36245,36244,36243,36242,36241,36240,76607
3,43304,36219,20710,57441,47998,6847,69261,28688,42036,8424,38835,43316,67282,31659,51507,51506,43320,10644,58933,43244,49464,37114,13804,59927,37748,52630,43381,9357,8050,34442,73025,68922,69030,27630,27635,69031,14847,35705,31190,70806,...,46370,46368,46367,46365,46362,46360,46359,46356,46397,46400,46431,46401,46428,46427,46426,46425,46424,46423,46422,46421,46420,46419,46418,46417,46416,46415,46414,46413,46412,46411,46410,46409,46408,46407,46406,46405,46404,46403,46402,0
4,42365,49024,27088,67518,57441,72036,13299,60591,62855,44419,43602,18267,62251,41600,8254,44514,72822,72645,46441,48439,71810,66648,17973,25036,43304,59695,39431,44673,70457,33916,52021,26974,33241,3274,41842,24459,16170,12844,73829,49810,...,43569,43568,43567,43565,43564,43562,43561,43560,43558,43557,43555,43554,43585,43587,43637,43588,43629,43626,43623,43620,43619,43618,43616,43615,43614,43613,43612,43610,43607,43603,43601,43600,43599,43598,43595,43592,43591,43590,43589,38303


In [24]:
# (number of clusters, number of features)
df_centroids.shape

(5, 76608)

In [25]:
# How many of the highest-weighted terms to list for each cluster.
TOP_N_TERMS = 6

# Each row of df_centroids holds feature indices ranked from the heaviest
# centroid weight down, so the first TOP_N_TERMS columns are the cluster's top
# terms. Prints one line per cluster, terms separated by spaces.
for ranked_ids in df_centroids.iloc[:, :TOP_N_TERMS].to_numpy():
    print(*(feature_names[term_id] for term_id in ranked_ids))

film award best star band oscar
game england player win said match
said bn company year firm bank
mr labour election said party blair
mobile phone game technology said user


In [26]:
# Show the cluster-vs-category shares again, next to the top terms printed
# above, so each cluster's vocabulary can be matched to its dominant category.
ct_cluster

category,business,entertainment,politics,sport,tech
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.989,0.0,0.003,0.009
1,0.0,0.008,0.027,0.96,0.006
2,0.816,0.049,0.086,0.018,0.031
3,0.009,0.0,0.989,0.0,0.003
4,0.015,0.015,0.01,0.0,0.959
