# Text Clustering using K-Means

### 2017 Dec Shilpa Jain

## Define a function that implements the K-Means algorithm

In [45]:
import gensim
from gensim import similarities

import random

# Number of assign/update passes that k_means() runs over the data.
num_iterations = 200

def k_means(vecs, V, K, max_iter=None):
    """Cluster sparse vectors into K groups with a similarity-based K-Means.

    Similarities come from a gensim ``SparseMatrixSimilarity`` index built
    over ``vecs``; each vector is assigned to the center it is most similar
    to, and each center is then recomputed with ``compute_center``.

    Args:
        vecs: Sequence of sparse vectors, each a list of ``(term_id, value)``
            pairs. Must support ``len()`` and integer indexing.
        V: Number of features (dictionary size), passed to the index as its
            second argument.
        K: Number of clusters to create; must be at least 1.
        max_iter: Maximum number of assign/update passes. Defaults to the
            module-level ``num_iterations`` when omitted.

    Returns:
        A list of K lists; ``clusters[k]`` holds the indices into ``vecs``
        of the vectors assigned to cluster ``k``. A cluster may be empty.

    Raises:
        ValueError: If ``vecs`` is empty or ``K`` is smaller than 1.
    """
    if K < 1:
        raise ValueError("K must be at least 1")
    N = len(vecs)
    if N == 0:
        raise ValueError("vecs must contain at least one vector")
    if max_iter is None:
        max_iter = num_iterations

    # build an index from the input vectors
    index = similarities.SparseMatrixSimilarity(vecs, V)

    # the center of the first cluster is randomly chosen
    centers = [vecs[random.randint(0, N - 1)]]

    # Each further center is the vector whose summed similarity to all
    # centers chosen so far is smallest, i.e. the one farthest from them.
    # The running totals are extended by one center per round instead of
    # re-querying the index for every earlier center.
    totals = [0.0] * N
    for _ in range(1, K):
        latest = index[centers[-1]]
        for n in range(N):
            totals[n] = totals[n] + latest[n]
        # min() returns the first minimum, matching a stable ascending sort
        farthest = min(range(N), key=totals.__getitem__)
        centers.append(vecs[farthest])

    # iteratively re-assign vectors to clusters
    clusters = []
    previous = None
    for _ in range(max_iter):
        # similarity of every vector to every current center
        sims = [index[center] for center in centers]

        clusters = [[] for _ in range(K)]
        for n in range(N):
            # max() returns the first maximum, so ties go to the lowest k
            best = max(range(K), key=lambda k: sims[k][n])
            clusters[best].append(n)

        # Identical assignments yield identical centers, so every later
        # pass would reproduce this result; stop early.
        if clusters == previous:
            break
        previous = clusters

        # update the cluster centers
        centers = [compute_center(vecs, members) for members in clusters]

    return clusters
    
def compute_center(vecs, cluster):
    """Return the mean of the sparse vectors selected by ``cluster``.

    Args:
        vecs: Sequence of sparse vectors, each an iterable of
            ``(term_id, value)`` pairs.
        cluster: Iterable of indices into ``vecs`` (must support ``len()``).

    Returns:
        A list of ``(term_id, mean_value)`` pairs sorted by ``term_id``.
        Values are summed over the selected vectors and divided by the
        number of selected vectors; terms absent from every selected vector
        are omitted. An empty ``cluster`` yields an empty list.
    """
    totals = {}
    for n in cluster:
        for term_id, value in vecs[n]:
            if term_id in totals:
                totals[term_id] = totals[term_id] + value
            else:
                totals[term_id] = value

    size = len(cluster)

    # term ids are unique dict keys, so sorting the items orders by term id
    return [(term_id, total / size) for term_id, total in sorted(totals.items())]

## Read the input file, a CSV containing 5 documents, one per row

In [24]:

import sys
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Stand-in ``__iter__`` that always returns 0.

    It is attached to the object-storage response body when that object has
    no ``__iter__`` of its own, so that pandas accepts it as file-like.
    """
    return 0

# @hidden_cell
# The following code reads textdata.csv from IBM Cloud Object Storage.
# Credentials are read from the environment instead of being hard-coded, so
# the notebook can be shared safely. Set IBM_COS_API_KEY_ID and
# IBM_COS_SERVICE_INSTANCE_ID before running this cell (a missing variable
# raises KeyError naming it).
# NOTE(review): any key that was previously committed in this cell should be
# revoked and rotated.
import os

client_8760db995d144d1cab8bb99f8e30e4d7 = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=os.environ['IBM_COS_API_KEY_ID'],
    ibm_service_instance_id=os.environ['IBM_COS_SERVICE_INSTANCE_ID'],
    ibm_auth_endpoint="https://iam.ng.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_8760db995d144d1cab8bb99f8e30e4d7.get_object(
    Bucket='textanalyticscourse291b8c2b2fd34be0aaeaf42d0c9becc5',
    Key='textdata.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"):
    body.__iter__ = types.MethodType(__iter__, body)

df_data_1 = pd.read_csv(body)
# last expression of the cell: shows the first rows in the notebook
df_data_1.head()



Unnamed: 0,Text
0,"The ""Big Brother"" of Singapore football will b..."
1,Mahfizur Rahman watched his friends turn to cr...
2,"The going has been tough, but the Football Ass..."
3,Having pushed reigning world and European cham...
4,SINGAPORE - Registration for the Standard Char...


## Convert each document into a list of tokens and collect the token lists in a list

In [40]:
import nltk
# Tokenize every document in the 'Text' column, in row order:
# docs[i] is the list of tokens for row i of df_data_1.
docs = [nltk.word_tokenize(doc_text) for doc_text in df_data_1['Text']]
print(docs)
    

[['The', '``', 'Big', 'Brother', "''", 'of', 'Singapore', 'football', 'will', 'be', 'back', ',', 'but', 'not', 'immediately', ',', 'and', 'not', 'for', 'long', '.', 'In', 'an', 'exclusive', 'interview', 'with', 'The', 'New', 'Paper', ',', 'Persib', 'Bandung', 'striker', 'Noh', 'Alam', 'Shah', 'said', 'he', 'has', 'agreed', 'to', 'sign', 'a', 'short-term', 'deal', 'with', 'former', 'club', 'Tampines', 'Rovers', 'until', 'the', 'end', 'of', 'the', 'season', '.', 'But', 'the', '31-year-old', 'said', ':', '``', 'Beyond', 'that', ',', 'I', 'feel', 'my', 'future', 'is', 'still', 'in', 'Indonesia', '.', '``', 'I', 'feel', 'really', 'appreciated', 'here', '.', 'Four', 'Indo', 'clubs', 'already', 'made', 'me', 'offers', 'for', 'the', 'next', 'season', ',', 'which', 'starts', 'next', 'January', '.', "''", 'The', 'move', 'to', 'Singapore', 'still', 'hinges', 'on', 'whether', 'Tampines', 'can', 'secure', 'his', 'medical', 'documents', 'and', 'International', 'Transfer', 'Certificate', 'from', 'the

In [42]:
from nltk.corpus import stopwords
from nltk.stem.porter import *
# Shared Porter stemmer instance; stemwords() calls its stem() method.
stemmer=PorterStemmer()
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models


def tolower(docs):
    """Return a copy of ``docs`` with every token lower-cased.

    ``docs`` is an iterable of token lists; the input is not modified.
    """
    lowered = []
    for doc in docs:
        lowered.append([token.lower() for token in doc])
    return lowered
    
def fetchdictionary(docs):
    """Build and return a gensim ``corpora.Dictionary`` from tokenized ``docs``."""
    return corpora.Dictionary(docs)

def removestop(docs):
    """Return ``docs`` with NLTK's English stop words removed from each document.

    ``docs`` is an iterable of token lists. Matching is exact, so tokens
    should already be lower-cased for the stop list to apply.
    """
    # A set gives O(1) membership tests; the stop list is built once per call.
    stop_words = set(stopwords.words('english'))
    return [[w for w in doc if w not in stop_words] for doc in docs]

def stemwords(docs):
    """Return ``docs`` with every token replaced by ``stemmer.stem(token)``.

    ``docs`` is an iterable of token lists; ``stemmer`` is the module-level
    stemmer instance.
    """
    stemmed_docs = []
    for doc in docs:
        stemmed_docs.append([stemmer.stem(token) for token in doc])
    return stemmed_docs

def convertToVec(docs, dictionary):
    """Convert each document in ``docs`` with ``dictionary.doc2bow``.

    Returns a list with one ``doc2bow`` result per document, in input order.
    """
    bow_vectors = []
    for doc in docs:
        bow_vectors.append(dictionary.doc2bow(doc))
    return bow_vectors

def buildindex(docs, num_features=None):
    """Build a gensim ``SparseMatrixSimilarity`` index over ``docs``.

    Args:
        docs: Corpus of sparse vectors, each a list of ``(term_id, value)``
            pairs.
        num_features: Number of features (dictionary size) of the vector
            space. When omitted it is derived from the corpus as the largest
            term id plus one, instead of relying on a fixed constant that
            may not match the dictionary.

    Returns:
        The similarity index.
    """
    if num_features is None:
        # Materialize so the corpus can be scanned here and still be indexed.
        docs = list(docs)
        num_features = 1 + max(
            (term_id for doc in docs for term_id, _ in doc), default=-1)
    return similarities.SparseMatrixSimilarity(docs, num_features=num_features)

def createtdif(docs):
    """Fit and return a gensim ``models.TfidfModel`` on the corpus ``docs``."""
    return models.TfidfModel(docs)

In [44]:
# Preprocess the tokenized documents step by step; each step replaces docs.
# Lower-case every token
docs=tolower(docs)
# Remove stop words
docs=removestop(docs)
# Perform stemming
docs=stemwords(docs)

# Create dictionary from the preprocessed documents
dictionary=fetchdictionary(docs)
print (dictionary)
# token -> id mapping exposed by the dictionary
token_to_id=dictionary.token2id
# Convert each document to a sparse vector via the dictionary
vecs=convertToVec(docs,dictionary)
print (vecs)
# Build index for finding similarity
index=buildindex(vecs)

# Fit a TF-IDF model on the document vectors
tdif=createtdif(vecs)
print (tdif)

Dictionary(734 unique tokens: ['main', 'better', 'menac', 'averag', 'thailand']...)
[[(0, 3), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 3), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 2), (15, 14), (16, 2), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 4), (23, 6), (24, 1), (25, 1), (26, 1), (27, 1), (28, 3), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 3), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 3), (50, 1), (51, 1), (52, 5), (53, 1), (54, 1), (55, 1), (56, 8), (57, 3), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 2), (64, 1), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 2), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 3), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 4), (94, 1), (95, 7), (96, 1), (97, 1), (98, 30), (99, 1), (100, 1), (101, 1), (1

## Call the K-Means function defined earlier to create clusters. The first parameter, vecs, is the list of sparse document vectors; the second is the number of unique tokens in the dictionary; and the last is the number of clusters you want to create.

In [46]:
# Use the dictionary's size for the feature count rather than a hard-coded
# number, so the call stays correct when the input documents change.
clusters = k_means(vecs, len(dictionary), 2)

## Print Clusters to see the document list

In [48]:
# Unpack the two clusters and show the document indices assigned to each.
cluster1, cluster2 = clusters[0], clusters[1]
for members in (cluster1, cluster2):
    print(members)

[0, 1, 2, 3]
[4]
