# Q6. Clustering 

In [20]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import string
from matplotlib import pyplot as plt
import numpy as np
from scipy.spatial import distance
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from nltk.stem import WordNetLemmatizer


## Reading all files

In [21]:
# Directory that holds the article files.
path = "Datasets/Question-6/dataset"

# Sorted file names found in `path`; later cells iterate over this.
# NOTE: the name `list` shadows the Python builtin; it is kept because the
# following cells refer to it by this name.
list = sorted(os.listdir(path))


## Extracting labels

In [22]:
# Number of distinct labels expected in the file names.
k = 5

# The label is the text between the first '_' and the following '.' in each
# file name (i.e. the second '_'-separated field, up to its first '.').
labels = [file_name.split('_')[1].split('.')[0] for file_name in list]

# Count how many files carry each label 1..k.
count_dict = {label_id: 0 for label_id in range(1, k + 1)}
for label in labels:
    count_dict[int(label)] += 1

print("Total lables:- ", len(labels), "\n", count_dict)

Total lables:-  1725 
 {1: 410, 2: 286, 3: 317, 4: 411, 5: 301}


## Reading each article 

In [23]:
# Read every file listed in `list` into `corpus` (one string per file).
# Files are opened in binary mode and decoded as UTF-8; bytes that cannot be
# decoded are dropped (errors='ignore').
corpus = []
for file_name in list:
    # `with` closes the handle even when read()/decode() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(os.path.join(path, file_name), mode='rb') as file:
        corpus.append(file.read().decode('utf-8', errors='ignore'))

### Removing numbers and punctuations:
Numbers: values such as 12,160 or 160M don't carry any particular information for this task, so we remove them.

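Punctuation: in general, punctuation marks should be handled according to their importance. For example, “.”, “,” and “?” are important punctuation marks that are often retained while others are removed. In this notebook, however, the cell below replaces every non-alphabetic character (digits and all punctuation) with a space.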

In [24]:
# Matches any single character that is not an ASCII letter.
regex = re.compile('[^a-zA-Z]')

# Replace every non-letter (digits, punctuation, newlines, ...) with a space,
# document by document, writing the result back into `corpus`.
for doc_idx in range(len(list)):
    corpus[doc_idx] = regex.sub(' ', corpus[doc_idx])


### Removing small words :-
Words of three characters or fewer rarely carry significant information, so we drop every word whose length is 3 or less.

In [25]:
# Keep only words longer than three characters. Each kept word is prefixed
# with a single space, so a non-empty result starts with a space.
for doc_idx in range(len(list)):
    words = corpus[doc_idx].split(' ')
    corpus[doc_idx] = "".join(" " + word for word in words if len(word) > 3)

### Stemming:-
The goal of stemming is to reduce inflectional forms, and sometimes derivationally related forms, of a word to a common base form. In other words, stemming helps us reduce the overall number of terms to a smaller set of “root” terms.

For example: Organizer, organizes, organization, organized all these get reduced to a root term, maybe “organiz”.

In [26]:
# One Porter stemmer is enough for the whole corpus.
stemmer = PorterStemmer()

# Tokenize each document, stem every token, and rebuild the document as a
# string in which each stem is preceded by a single space.
for doc_idx in range(len(list)):
    tokens = word_tokenize(corpus[doc_idx])
    corpus[doc_idx] = "".join(" " + stemmer.stem(token) for token in tokens)
    


### Lemmatizer

In [27]:
# One WordNet lemmatizer is enough for the whole corpus.
lemmatizer = WordNetLemmatizer()

# Tokenize each (already stemmed) document, lemmatize every token, and rebuild
# the document as a string in which each token is preceded by a single space.
for doc_idx in range(len(list)):
    tokens = word_tokenize(corpus[doc_idx])
    corpus[doc_idx] = "".join(" " + lemmatizer.lemmatize(token) for token in tokens)

### Removing stop words:-
When data analysis needs to be data driven at the word level, the commonly occurring words (stop-words) should be removed. One can either create a long list of stop-words or one can use predefined language specific libraries.

In [28]:
# scikit-learn's built-in English stop-word collection (a frozenset, as the
# printed output shows); it is handed to the vectorizer in a later cell.
my_stop_words = text.ENGLISH_STOP_WORDS
# Show the full collection for inspection.
print(my_stop_words)

frozenset({'around', 'not', 'was', 'while', 'toward', 'more', 'less', 'each', 'also', 'now', 'before', 'serious', 'thin', 'twelve', 'etc', 'whoever', 'made', 'besides', 'cry', 'often', 'every', 'two', 'however', 'someone', 'only', 'hence', 'must', 'front', 'them', 'who', 'almost', 'moreover', 'third', 'latter', 'found', 'onto', 'thus', 'why', 'do', 'eg', 'fifteen', 'for', 'formerly', 'on', 'upon', 'full', 'cant', 'whole', 'as', 'can', 'us', 'cannot', 'sixty', 'but', 'then', 'other', 'anyhow', 'inc', 'everything', 'him', 'meanwhile', 'myself', 'own', 'would', 'these', 'move', 'sometime', 'hundred', 'when', 'beside', 'whereas', 'are', 'eight', 'any', 'something', 'whenever', 'co', 'thru', 'most', 'during', 'which', 'within', 'mine', 'with', 'became', 'hereby', 'you', 'none', 'please', 'will', 'therefore', 'after', 'alone', 'such', 'how', 'beyond', 'thick', 'much', 'nowhere', 'fifty', 'hereupon', 'else', 'first', 'here', 'system', 'thereupon', 'de', 'anyone', 'perhaps', 'again', 'four', '

# Vectorizer

## TfidfVectorizer:-

It transforms the text into feature vectors that are used as input to the estimator.

The vocabulary is a dictionary that maps each token (word) to its feature index, i.e. its column in the resulting matrix.

In [40]:
# Build TF-IDF features for the preprocessed corpus.
#
# `stop_words` is passed as a plain list: recent scikit-learn releases validate
# this parameter and reject a frozenset. sorted() is used to build the list
# because the builtin `list` is shadowed earlier in this notebook.
# NOTE(review): the corpus is stemmed before this point, so stop words whose
# stem differs from the dictionary form may not be filtered — confirm intent.
vectorizer = TfidfVectorizer(stop_words=sorted(my_stop_words))
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    # tolist() turns the returned array into plain Python strings so that
    # `feature` stays a list, as before.
    feature = vectorizer.get_feature_names_out().tolist()
else:
    feature = vectorizer.get_feature_names()

# Dense (documents x terms) array; the K-means cells index rows of it.
X = X.toarray()
print(X.shape)
print(feature, len(feature))

(1725, 16275)
['aaa', 'aadc', 'aaliyah', 'aamir', 'aara', 'aaron', 'abacu', 'abandon', 'abat', 'abba', 'abbasi', 'abbey', 'abbott', 'abdellatif', 'abdomin', 'abduct', 'abdul', 'abdullah', 'abdullatif', 'abeb', 'abensur', 'aberdeen', 'aberr', 'aberystwyth', 'abet', 'abey', 'abeyi', 'abid', 'abigail', 'abil', 'abish', 'abiyot', 'abl', 'abnorm', 'abolish', 'abolit', 'abort', 'abortionist', 'abov', 'abraham', 'abramovich', 'abroad', 'abrupt', 'abruptli', 'absa', 'absenc', 'absent', 'absente', 'absolut', 'absorb', 'abstain', 'abstract', 'absurd', 'abtahi', 'abund', 'abundantli', 'aburiz', 'abus', 'abut', 'abuzz', 'abysm', 'academ', 'academi', 'acapulco', 'acasuso', 'acceler', 'accent', 'accept', 'access', 'accessori', 'accid', 'accident', 'acclaim', 'acclim', 'acclimatis', 'accolad', 'accommod', 'accomod', 'accompani', 'accomplic', 'accomplish', 'accord', 'accosi', 'account', 'accoust', 'accredit', 'accret', 'accumul', 'accur', 'accura', 'accuraci', 'accus', 'accustom', 'ace', 'aceh', 'achi

# K-Means
K-means is an iterative algorithm that tries to partition the dataset into K pre-defined, distinct, non-overlapping subgroups (clusters), where each data point belongs to only one group. It tries to make the intra-cluster data points as similar as possible while also keeping the clusters as different (far apart) as possible. It assigns data points to a cluster such that the sum of the squared distances between the data points and the cluster’s centroid (the arithmetic mean of all the data points that belong to that cluster) is at a minimum. The less variation we have within clusters, the more homogeneous (similar) the data points are within the same cluster.

### Selecting initial k:

In [61]:
# n: number of rows of X (documents); c: number of columns of X (features).
n, c = X.shape[0], X.shape[1]
# Number of clusters to form.
k = 5

## Selecting random centers:

In [74]:
# Pick k distinct rows of X at random as the initial centroids.
# The row count and k come from the data rather than the previously
# hard-coded 1725 and 5, so the cell keeps working if the corpus changes.
# NOTE: no random seed is set, so the chosen rows differ between runs.
index = np.random.choice(X.shape[0], k, replace=False)
# Indexing with an index array yields a new array, so updating `centers`
# later does not modify X.
centers = X[index]
print(centers)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [75]:
# Hand-written K-means: 20 rounds of "assign every sample to its nearest
# centroid, then move each centroid to the mean of its assigned samples".
# dict_centroid maps a cluster index (0..k-1) to the samples assigned to it in
# the current round.
dict_centroid = {}

for itr in range(20):

    # Start every round with empty clusters.
    for j in range(k):
        dict_centroid[j] = []

    # Assignment step: Euclidean distance from each sample to every centroid;
    # ties go to the lowest cluster index (list.index returns the first match).
    for i in range(n):
        distance_array = [distance.euclidean(X[i], centers[j]) for j in range(k)]
        minpos = distance_array.index(min(distance_array))
        dict_centroid[minpos].append(X[i])

    print("Iteration ", itr, end="-> ")

    # Update step: print each cluster size and recompute its centroid.
    for j in range(k):
        print(len(dict_centroid[j]), end=' ')
        # An empty cluster has no mean: np.mean would return NaN, overwrite
        # the centroid with NaN and corrupt every later distance. Keep the
        # previous centroid in that case.
        if dict_centroid[j]:
            centers[j] = np.mean(np.asarray(dict_centroid[j]), axis=0)

    print("")


Iteration  0-> 458 320 219 295 433 
Iteration  1-> 471 415 228 209 402 
Iteration  2-> 437 432 276 174 406 
Iteration  3-> 433 365 351 166 410 
Iteration  4-> 431 322 392 162 418 
Iteration  5-> 439 294 404 159 429 
Iteration  6-> 444 275 414 152 440 
Iteration  7-> 448 265 415 151 446 
Iteration  8-> 449 260 416 150 450 
Iteration  9-> 448 255 416 149 457 
Iteration  10-> 446 250 417 148 464 
Iteration  11-> 447 243 417 148 470 
Iteration  12-> 450 237 417 148 473 
Iteration  13-> 450 233 417 147 478 
Iteration  14-> 450 228 417 146 484 
Iteration  15-> 448 226 417 145 489 
Iteration  16-> 449 222 417 144 493 
Iteration  17-> 448 221 417 144 495 
Iteration  18-> 448 221 417 144 495 
Iteration  19-> 448 221 417 144 495 


### Final Centroids:

In [45]:
# Display the centroid array as it stands after the K-means iterations.
centers

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00092793, 0.        , 0.        , ..., 0.        , 0.00069542,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00018538, 0.00033722, 0.00106842, ..., 0.00024994, 0.        ,
        0.00049913]])

# Prediction

In [76]:
# Assign every sample to its nearest final centroid (Euclidean distance).
# Cluster indices are shifted by one so predictions run from 1 to k.
prediction_labels = []

for i in range(n):
    distance_array = [distance.euclidean(X[i], centers[j]) for j in range(k)]
    # First position holding the smallest distance.
    minpos = distance_array.index(min(distance_array))
    prediction_labels.append(minpos + 1)
   
    

# Performance

In [77]:
from sklearn.metrics.cluster import homogeneity_score

# Homogeneity of the predicted clusters with respect to the labels taken from
# the file names; the value is the cell's displayed output.
homogeneity_score(labels_true=labels, labels_pred=prediction_labels)

0.7349135101081509