In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load the labelled sports texts (the displayed frame has 'text', 'class' and 'id' columns).
df = pd.read_csv(filepath_or_buffer='data/sports.csv')

In [4]:
# Shuffle all rows (frac=1 keeps every row) and renumber the index from 0.
# A fixed seed keeps the row order, and therefore the previews shown below,
# identical across Restart & Run All instead of changing on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,text,class,id
0,PLEASE PLEASE PLEASE #HOCKEY,hockey,1364
1,Prediction Record for Feb. 3rd: 2-1\nSeason Re...,hockey,1297
2,Having been turned away by @wirralgolfclub we ...,golf,939
3,#TableTennis For Pawel Grela V Adam Duch on Th...,tabletennis,1032
4,My oh my Patrick Reed is not a happy camper. #...,golf,1104


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(2500, 3)

In [6]:
from sklearn import preprocessing

# Encoder that maps each distinct class name to an integer id.
label_encoding = preprocessing.LabelEncoder()

# Overwrite the textual 'class' column with its integer codes. Values are cast
# to str first, so every entry is encoded as text regardless of its dtype.
df['class'] = label_encoding.fit_transform(df['class'].astype(str))

# Preview the frame with the encoded column.
df.head()

Unnamed: 0,text,class,id
0,PLEASE PLEASE PLEASE #HOCKEY,2,1364
1,Prediction Record for Feb. 3rd: 2-1\nSeason Re...,2,1297
2,Having been turned away by @wirralgolfclub we ...,1,939
3,#TableTennis For Pawel Grela V Adam Duch on Th...,4,1032
4,My oh my Patrick Reed is not a happy camper. #...,1,1104


In [7]:
# Ground-truth labels: the integer-encoded 'class' column, later passed as
# `labels` to build_model.
df_labels = df.loc[:, 'class']

# Peek at ten randomly drawn labels.
df_labels.sample(n=10)

540     3
1454    1
1160    2
1723    0
1413    1
2092    0
611     0
1537    4
736     3
1974    2
Name: class, dtype: int64

In [8]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model and print a one-row table of quality scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return an object exposing
        ``labels_`` (one predicted cluster id per row of ``data``).
    data : array-like
        Feature matrix handed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Ground-truth class ids used by homogeneity, completeness, V-measure,
        ARI and AMI.

    Returns
    -------
    None
        Scores are printed tab-separated with three decimals.

    Notes
    -----
    ``metrics.silhouette_score`` raises ``ValueError`` unless the number of
    distinct predicted labels is between 2 and ``n_samples - 1``. Clusterers
    that collapse everything into one group (or isolate every sample) would
    therefore abort the whole report; in that case the silhouette column is
    printed as ``nan`` and the label-based scores are still shown.
    """
    model = clustering_model(data)
    predicted = model.labels_

    print('homo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(50 * '-')

    # Silhouette is only defined for 2 <= n_labels <= n_samples - 1.
    distinct_labels = len(set(predicted))
    if 2 <= distinct_labels <= len(predicted) - 1:
        silhouette = metrics.silhouette_score(data, predicted)
    else:
        silhouette = float('nan')

    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (metrics.homogeneity_score(labels, predicted),
             metrics.completeness_score(labels, predicted),
             metrics.v_measure_score(labels, predicted),
             metrics.adjusted_rand_score(labels, predicted),
             metrics.adjusted_mutual_info_score(labels, predicted),
             silhouette))

In [9]:
def k_means(data, n_clusters=5, max_iter=1000, random_state=None):
    """Fit K-means on ``data`` and return the result of ``KMeans(...).fit``.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 5
        Number of clusters, forwarded to ``KMeans``.
    max_iter : int, default 1000
        Iteration cap, forwarded to ``KMeans``.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``. ``None`` keeps the previous unseeded
        behaviour; pass an int to make centroid initialisation repeatable.
    """
    return KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(data)

In [10]:
def agglomerative_fn(data, n_clusters=5):
    """Run agglomerative clustering on ``data``.

    ``n_clusters`` is forwarded to ``AgglomerativeClustering``; the value
    returned by its ``fit(data)`` call is handed back to the caller.
    """
    estimator = AgglomerativeClustering(n_clusters=n_clusters)
    return estimator.fit(data)

In [11]:
def dbscan_fn(data, eps=0.45, min_samples=4):
    """Run DBSCAN on ``data`` and return the result of its ``fit`` call.

    ``eps`` and ``min_samples`` are forwarded to ``DBSCAN`` unchanged.
    """
    return DBSCAN(eps=eps, min_samples=min_samples).fit(data)

In [12]:
def mean_shift_fn(data, bandwidth=0.85):
    """Run mean-shift clustering on ``data``.

    ``bandwidth`` is forwarded to ``MeanShift``; the value returned by its
    ``fit(data)`` call is handed back to the caller.
    """
    mean_shift = MeanShift(bandwidth=bandwidth)
    fitted = mean_shift.fit(data)
    return fitted

In [13]:
def birch_fn(data, n_clusters=5):
    """Run Birch clustering on ``data`` and return the result of ``fit``.

    ``n_clusters`` is forwarded to ``Birch`` unchanged.
    """
    birch = Birch(n_clusters=n_clusters)
    return birch.fit(data)

In [14]:
def affinity_propagation_fn(data, damping=0.6, max_iter=1000):
    """Run affinity propagation on ``data`` and return the result of ``fit``.

    ``damping`` and ``max_iter`` are forwarded to ``AffinityPropagation``.
    """
    propagation = AffinityPropagation(damping=damping, max_iter=max_iter)
    return propagation.fit(data)

In [15]:
def mini_batch_kmeans_fn(data, n_clusters=5, max_iter=1000, batch_size=20,
                         random_state=None):
    """Fit mini-batch K-means on ``data`` and return the result of ``fit``.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 5
        Number of clusters, forwarded to ``MiniBatchKMeans``.
    max_iter : int, default 1000
        Iteration cap, forwarded to ``MiniBatchKMeans``.
    batch_size : int, default 20
        Mini-batch size. Previously hard-coded to 20; the default keeps that
        value so existing calls behave the same.
    random_state : int or None, default None
        Seed forwarded to ``MiniBatchKMeans``. ``None`` keeps the previous
        unseeded behaviour; pass an int for repeatable results.
    """
    return MiniBatchKMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        batch_size=batch_size,
        random_state=random_state,
    ).fit(data)

### TFIDF

In [16]:
# TF-IDF vectoriser with scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the 'text' column and build the document-term matrix.
df_tfidf = tfidf_vectorizer.fit_transform(raw_documents=df['text'])

In [17]:
# Display the TF-IDF matrix summary (shape, dtype, number of stored elements).
df_tfidf

<2500x10666 sparse matrix of type '<class 'numpy.float64'>'
	with 35853 stored elements in Compressed Sparse Row format>

In [18]:
# K-means (k_means defaults) on the densified TF-IDF matrix, scored against df_labels.
build_model(clustering_model=k_means, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.718	0.801	0.757	0.594	0.757	0.022


In [19]:
# Agglomerative clustering (agglomerative_fn defaults) on the densified TF-IDF matrix.
build_model(clustering_model=agglomerative_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.218	0.521	0.308	0.088	0.306	0.020


In [31]:
# DBSCAN (dbscan_fn defaults: eps=0.45, min_samples=4) on the densified TF-IDF matrix.
build_model(clustering_model=dbscan_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.008	0.229	0.016	0.000	0.012	0.004


In [32]:
# Mean shift (mean_shift_fn default bandwidth=0.85) on the densified TF-IDF matrix.
build_model(clustering_model=mean_shift_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
1.000	0.210	0.347	0.002	0.034	0.049


In [33]:
# Birch (birch_fn defaults) on the densified TF-IDF matrix.
build_model(clustering_model=birch_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.372	0.548	0.443	0.222	0.442	0.019


In [34]:
# Affinity propagation (affinity_propagation_fn defaults) on the densified TF-IDF matrix.
build_model(clustering_model=affinity_propagation_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.730	0.216	0.333	0.037	0.272	0.050


In [35]:
# Mini-batch K-means (mini_batch_kmeans_fn defaults) on the densified TF-IDF matrix.
build_model(clustering_model=mini_batch_kmeans_fn, data=df_tfidf.toarray(), labels=df_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.231	0.570	0.329	0.103	0.327	0.009


### Doc2Vec

In [22]:
# Helpers imported from modules that are not defined in this notebook.
from getEmbeddings import getEmbeddings
from d2v_pre import d2v_pre

# NOTE(review): presumably preprocesses the raw CSV and writes
# 'data/sports_doc2vec.csv', which is read on the next statement. Confirm in d2v_pre.
d2v_pre('data/sports.csv')

# x and y are passed to doc2vec_build_model as (data, labels) in the cells below.
x,y = getEmbeddings('data/sports_doc2vec.csv')

In [23]:
def doc2vec_build_model(clustering_model, data, labels):
    """Fit a clustering model and print its label-based quality scores.

    ``clustering_model`` is called as ``clustering_model(data)`` and must
    return an object with a ``labels_`` attribute. Homogeneity, completeness,
    V-measure, ARI and AMI are computed against ``labels`` and printed as one
    tab-separated row with three decimals. No silhouette score is reported
    here. Returns ``None``.
    """
    fitted = clustering_model(data)

    print('homo\tcompl\tv-meas\tARI\tAMI')
    print('-' * 50)

    # Same column order as the header printed above.
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    row = ['%.3f' % score(labels, fitted.labels_) for score in scorers]
    print('\t'.join(row))

In [24]:
# K-means (k_means defaults) on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=k_means, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.044	0.048	0.046	0.028	0.044


In [25]:
# Agglomerative clustering (agglomerative_fn defaults) on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=agglomerative_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.040	0.047	0.043	0.024	0.041


In [26]:
# DBSCAN (dbscan_fn defaults) on the Doc2Vec embeddings.
# NOTE(review): the recorded scores (homogeneity 0, completeness 1) are consistent
# with every sample receiving the same label; eps/min_samples likely need retuning here.
doc2vec_build_model(clustering_model=dbscan_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
-0.000	1.000	-0.000	0.000	-0.000


In [27]:
# Mean shift (mean_shift_fn default bandwidth=0.85) on the Doc2Vec embeddings.
# NOTE(review): the recorded scores (homogeneity 0, completeness 1) are consistent
# with a single cluster; the bandwidth likely needs retuning for these embeddings.
doc2vec_build_model(clustering_model=mean_shift_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
-0.000	1.000	-0.000	0.000	-0.000


In [28]:
# Birch (birch_fn defaults) on the Doc2Vec embeddings.
# NOTE(review): the recorded scores (homogeneity 0, completeness 1) are consistent
# with a single cluster; confirm whether Birch's defaults suit these embeddings.
doc2vec_build_model(clustering_model=birch_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
-0.000	1.000	-0.000	0.000	-0.000


In [29]:
# Affinity propagation (affinity_propagation_fn defaults) on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=affinity_propagation_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.106	0.036	0.054	0.002	0.020


In [30]:
# Mini-batch K-means (mini_batch_kmeans_fn defaults) on the Doc2Vec embeddings.
doc2vec_build_model(clustering_model=mini_batch_kmeans_fn, data=x, labels=y)

homo	compl	v-meas	ARI	AMI
--------------------------------------------------
0.039	0.065	0.048	0.031	0.046
