# Data

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
data = pd.read_csv('toi_2010_18.csv')

In [3]:
len(data)

1228

In [4]:
data.head()

Unnamed: 0,DATE,AUTHOR,IMAGE,TEXT,KEYWORDS,SUMMARY
0,8-1-2010,[],https://static.toiimg.com/photo/msid-5423538/5...,"Junali Devi, a widow living in Assam, is eager...","['hivaids', 'hiv', 'website', 'positive', 'spe...","Junali Devi, a widow living in Assam, is eager..."
1,11-1-2010,['Karthika Gopalakrishnan'],https://static.toiimg.com/photo/msid-47529300/...,CHENNAI: Researchers working towards a cure fo...,"['hiv', 'schooley', 'hopeful', 'research', 're...",These cells have a second molecule called CCR5...
2,12-1-2010,[],https://static.toiimg.com/photo/msid-47529300/...,LUCKNOW: After running from the pillar to post...,"['hiv', 'ngos', 'money', 'shelter', 'poonam', ...",The hapless woman was thrown out by her husban...
3,30-1-2010,[],https://static.toiimg.com/photo/msid-47529300/...,"BANGALORE: Nutrition, education and property r...","['hivaids', 'hiv', 'positive', 'aids', 'childr...","BANGALORE: Nutrition, education and property r..."
4,1-2-2010,['Kounteya Sinha'],https://static.toiimg.com/photo/msid-47529300/...,NEW DELHI: A vaccine to protect HIV patients f...,"['hiv', 'mv', 'aids', 'tb', 'hivinfected', 'pa...",NEW DELHI: A vaccine to protect HIV patients f...


# K-means Clustering

In [5]:
corpus = data['TEXT']

In [6]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

In [7]:
true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [8]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
labels = model.predict(X)

In [9]:
for i in range(true_k):
    
    print('\n')
    print("Cluster %d:" % i)    
    print('\n')
    
    for ind in order_centroids[i, :10]:
        print( '%s' % terms[ind])



Cluster 0:


hiv
patients
art
aids
said
people
treatment
positive
centres
health


Cluster 1:


children
hiv
school
said
positive
parents
child
students
people
aids


Cluster 2:


hiv
sex
aids
cases
women
said
state
prevalence
number
positive


Cluster 3:


hospital
police
hiv
woman
said
positive
patient
husband
court
medical


Cluster 4:


blood
transfusion
banks
hiv
cbi
hospital
bank
test
court
probe


In [10]:
len(labels)

1228

In [11]:
kmeans = pd.DataFrame(labels)
data.insert((data.shape[1]),'kmeans',kmeans)

In [12]:
data.head()

Unnamed: 0,DATE,AUTHOR,IMAGE,TEXT,KEYWORDS,SUMMARY,kmeans
0,8-1-2010,[],https://static.toiimg.com/photo/msid-5423538/5...,"Junali Devi, a widow living in Assam, is eager...","['hivaids', 'hiv', 'website', 'positive', 'spe...","Junali Devi, a widow living in Assam, is eager...",1
1,11-1-2010,['Karthika Gopalakrishnan'],https://static.toiimg.com/photo/msid-47529300/...,CHENNAI: Researchers working towards a cure fo...,"['hiv', 'schooley', 'hopeful', 'research', 're...",These cells have a second molecule called CCR5...,0
2,12-1-2010,[],https://static.toiimg.com/photo/msid-47529300/...,LUCKNOW: After running from the pillar to post...,"['hiv', 'ngos', 'money', 'shelter', 'poonam', ...",The hapless woman was thrown out by her husban...,1
3,30-1-2010,[],https://static.toiimg.com/photo/msid-47529300/...,"BANGALORE: Nutrition, education and property r...","['hivaids', 'hiv', 'positive', 'aids', 'childr...","BANGALORE: Nutrition, education and property r...",1
4,1-2-2010,['Kounteya Sinha'],https://static.toiimg.com/photo/msid-47529300/...,NEW DELHI: A vaccine to protect HIV patients f...,"['hiv', 'mv', 'aids', 'tb', 'hivinfected', 'pa...",NEW DELHI: A vaccine to protect HIV patients f...,0


In [13]:
data.to_csv('kmean.csv')