In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")


In [4]:
# Load the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv('data/generic.csv')

In [5]:
# Shuffle the rows and renumber the index from 0. The fixed seed keeps the
# shuffled order (and everything derived from it) repeatable when the notebook
# is re-run from a fresh kernel.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0.1,Unnamed: 0,text,class,id
0,87,@CaesarsAC looking forward to our first stay! ...,travel,967
1,479,A nice vegetable lasagne.\nFantastic prize! Th...,win,1359
2,478,Enter to #win A City Through Time and A Street...,win,1358
3,209,Just a bit has changed in 4 years of marriage ...,love,1089
4,212,God will wreck your plans when He sees that yo...,love,1092


In [6]:
# (rows, columns) of the shuffled frame.
df.shape

(500, 4)

In [7]:
from sklearn import preprocessing

# Replace the class names in df['class'] with integer codes. The values are
# cast to str first so the encoder always sees a single, uniform dtype.
label_encoding = preprocessing.LabelEncoder()
label_encoding.fit(df['class'].astype(str))
df['class'] = label_encoding.transform(df['class'].astype(str))

df.head()

Unnamed: 0.1,Unnamed: 0,text,class,id
0,87,@CaesarsAC looking forward to our first stay! ...,3,967
1,479,A nice vegetable lasagne.\nFantastic prize! Th...,4,1359
2,478,Enter to #win A City Through Time and A Street...,4,1358
3,209,Just a bit has changed in 4 years of marriage ...,0,1089
4,212,God will wreck your plans when He sees that yo...,0,1092


In [8]:
# Encoded class column, used as the reference labels when scoring clusterings.
df_labels = df.loc[:, 'class']

df_labels.sample(n=10)

98     4
122    4
167    2
96     1
157    1
260    1
417    1
154    2
203    0
204    3
Name: class, dtype: int64

In [9]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model on `data` and print a one-row score table.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return a fitted estimator
        that exposes the cluster assignments as ``labels_``.
    data : array-like
        Feature matrix handed to `clustering_model` and to the silhouette
        computation.
    labels : array-like
        Reference class labels the cluster assignments are compared against.

    Prints
    ------
    A header line, a 50-dash separator and one tab-separated row with
    homogeneity, completeness, V-measure, adjusted Rand index, adjusted mutual
    information and silhouette score, each with three decimals. The silhouette
    column shows ``nan`` when the score is undefined for the clustering.

    Returns
    -------
    None
    """
    model = clustering_model(data)
    predicted = model.labels_

    # silhouette_score raises ValueError unless the number of distinct cluster
    # labels is between 2 and n_samples - 1. Some clusterers (e.g. DBSCAN or
    # MeanShift with an unsuitable parameter) put every sample in one cluster,
    # so report NaN for that column instead of aborting the whole report.
    n_found = len(set(predicted))
    if 2 <= n_found <= len(predicted) - 1:
        silhouette = metrics.silhouette_score(data, predicted)
    else:
        silhouette = float('nan')

    print('homo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(50 * '-')

    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          %(metrics.homogeneity_score(labels, predicted),
            metrics.completeness_score(labels, predicted),
            metrics.v_measure_score(labels, predicted),
            metrics.adjusted_rand_score(labels, predicted),
            metrics.adjusted_mutual_info_score(labels, predicted),
            silhouette))

In [10]:
def k_means(data, n_clusters=5, max_iter=1000, random_state=None):
    """Fit K-Means on `data` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 5
        Number of clusters to form.
    max_iter : int, default 1000
        Iteration cap passed to ``KMeans``.
    random_state : int, RandomState instance or None, default None
        Seed for centroid initialisation. Pass an int to get the same
        clustering on every run; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted model (cluster assignments are in ``labels_``).
    """
    model = KMeans(n_clusters=n_clusters,
                   max_iter=max_iter,
                   random_state=random_state).fit(data)

    return model

In [11]:
def agglomerative_fn(data, n_clusters=5):
    """Run agglomerative clustering with `n_clusters` clusters on `data`.

    Returns whatever ``AgglomerativeClustering.fit`` returns, i.e. the fitted
    estimator.
    """
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    return clusterer.fit(data)

In [12]:
def dbscan_fn(data, eps=0.45, min_samples=4):
    """Run DBSCAN on `data` with the given `eps` and `min_samples`.

    Returns the result of ``DBSCAN.fit``, i.e. the fitted estimator.
    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    return clusterer.fit(data)

In [13]:
def mean_shift_fn(data, bandwidth=0.85):
    """Run MeanShift on `data` with the given kernel `bandwidth`.

    Returns the result of ``MeanShift.fit``, i.e. the fitted estimator.
    """
    clusterer = MeanShift(bandwidth=bandwidth)
    return clusterer.fit(data)

In [14]:
def birch_fn(data, n_clusters=5):
    """Run Birch on `data`, asking for `n_clusters` final clusters.

    Returns the result of ``Birch.fit``, i.e. the fitted estimator.
    """
    clusterer = Birch(n_clusters=n_clusters)
    return clusterer.fit(data)

In [15]:
def affinity_propagation_fn(data, damping=0.6, max_iter=1000):
    """Run AffinityPropagation on `data` with the given `damping` and `max_iter`.

    Returns the result of ``AffinityPropagation.fit``, i.e. the fitted
    estimator.
    """
    clusterer = AffinityPropagation(damping=damping, max_iter=max_iter)
    return clusterer.fit(data)

In [16]:
def mini_batch_kmeans_fn(data, n_clusters=5, max_iter=1000, batch_size=20,
                         random_state=None):
    """Fit mini-batch K-Means on `data` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 5
        Number of clusters to form.
    max_iter : int, default 1000
        Iteration cap passed to ``MiniBatchKMeans``.
    batch_size : int, default 20
        Mini-batch size. The default is the value this function previously
        hard-coded.
    random_state : int, RandomState instance or None, default None
        Seed for initialisation and batch sampling. Pass an int to get the
        same clustering on every run; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    sklearn.cluster.MiniBatchKMeans
        The fitted model (cluster assignments are in ``labels_``).
    """
    model = MiniBatchKMeans(n_clusters=n_clusters,
                            max_iter=max_iter,
                            batch_size=batch_size,
                            random_state=random_state).fit(data)
    return model

### TF-IDF

In [17]:
# Turn the raw text column into a sparse TF-IDF document-term matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
df_tfidf = tfidf_vectorizer.fit_transform(df['text'])

In [18]:
# Show the repr of the TF-IDF document-term matrix built in the previous cell.
df_tfidf

<500x3130 sparse matrix of type '<class 'numpy.float64'>'
	with 6112 stored elements in Compressed Sparse Row format>

In [19]:
# K-Means on the densified TF-IDF matrix, scored against the encoded classes.
build_model(clustering_model=k_means, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.613	0.679	0.645	0.567	0.641	0.015


In [20]:
# Agglomerative clustering on the densified TF-IDF matrix.
build_model(clustering_model=agglomerative_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.176	0.324	0.228	0.086	0.217	0.010


In [21]:
# DBSCAN on the densified TF-IDF matrix.
build_model(clustering_model=dbscan_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.016	0.282	0.031	0.001	0.020	0.008


In [22]:
# MeanShift on the densified TF-IDF matrix.
build_model(clustering_model=mean_shift_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.998	0.260	0.413	0.001	0.012	0.021


In [23]:
# Birch on the densified TF-IDF matrix.
build_model(clustering_model=birch_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.299	0.407	0.345	0.166	0.337	0.005


In [24]:
# AffinityPropagation on the densified TF-IDF matrix.
build_model(clustering_model=affinity_propagation_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.685	0.283	0.401	0.163	0.321	0.040


In [25]:
# Mini-batch K-Means on the densified TF-IDF matrix.
build_model(clustering_model=mini_batch_kmeans_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.350	0.609	0.444	0.190	0.437	0.004


### Doc2Vec

In [27]:
# Project-local helpers; their source is not part of this notebook.
from d2v_pre import d2v_pre
from getEmbeddings import getEmbeddings
import numpy as np  # already imported in the first cell; harmless repeat

# NOTE(review): presumably preprocesses the raw CSV and writes
# 'data/generic_doc2vec.csv', which is read right below — confirm in d2v_pre.
d2v_pre('data/generic.csv')

# NOTE(review): x looks like the document embeddings and y the matching
# labels (they are passed as data / labels to doc2vec_build_model) — confirm.
x,y = getEmbeddings('data/generic_doc2vec.csv')

In [28]:
def doc2vec_build_model(clustering_model, data, labels):
    """Cluster `data` and print how well the clusters agree with `labels`.

    `clustering_model` is called as ``clustering_model(data)`` and must return
    a fitted estimator exposing ``labels_``. The function prints a header, a
    50-dash separator and one tab-separated row holding homogeneity,
    completeness, V-measure, adjusted Rand index and adjusted mutual
    information (three decimals each). Nothing is returned.
    """
    fitted = clustering_model(data)

    column_names = ('homo', 'compl', 'v-meas', 'ARI', 'AMI')
    print('\t'.join(column_names))
    print('-' * 50)

    # Same metrics, same order as the header columns above.
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    print('\t'.join('%.3f' % scorer(labels, fitted.labels_) for scorer in scorers))

In [29]:
# K-Means on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=k_means, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.013	0.013	0.013	0.003	0.003


In [30]:
# Agglomerative clustering on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=agglomerative_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.008	0.009	0.008	-0.002	-0.002


In [31]:
# DBSCAN on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=dbscan_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.000	1.000	0.000	0.000	0.000


In [32]:
# MeanShift on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=mean_shift_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.000	1.000	0.000	0.000	0.000


In [33]:
# Birch on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=birch_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.000	1.000	0.000	0.000	0.000


In [34]:
# AffinityPropagation on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=affinity_propagation_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.130	0.057	0.079	0.002	0.008


In [35]:
# Mini-batch K-Means on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=mini_batch_kmeans_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.010	0.076	0.018	-0.000	-0.003
