In [None]:
#!/usr/bin/env python
# coding: utf-8

# 

In [None]:

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import shuffle
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 5)

RAND_INIT = 123456

In [None]:
def get_alg(name, k, random_state, max_iter, extra_params=None):
    """Build an unfitted clustering estimator selected by a short name.

    Parameters
    ----------
    name : str
        ``'km'`` for ``KMeans``, ``'gm'`` for ``GaussianMixture``,
        ``'db'`` for ``DBSCAN``.
    k : int
        Number of clusters (``'km'``) or mixture components (``'gm'``).
        Not used for ``'db'``.
    random_state, max_iter
        Forwarded to ``KMeans`` / ``GaussianMixture``. Not used for ``'db'``.
    extra_params : dict, optional
        Extra keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    The newly constructed (unfitted) estimator.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported short names.
    """
    # A ``None`` default avoids sharing one mutable dict between calls.
    params = dict(extra_params) if extra_params else {}

    if name == 'km':
        return KMeans(n_clusters=k, random_state=random_state, max_iter=max_iter, **params)
    if name == 'gm':
        return GaussianMixture(n_components=k, random_state=random_state, max_iter=max_iter, **params)
    if name == 'db':
        # DBSCAN is built with a fixed eps of 2.4; k, random_state and max_iter are not passed to it.
        return DBSCAN(eps=2.4, **params)

    # ValueError is a subclass of Exception, so existing ``except Exception`` handlers still match.
    raise ValueError('unknown alg')


In [None]:
def norm_X(df):
    """Return the feature columns of ``df`` standardised column by column.

    Every column from position 4 onwards is converted to float, centred on
    its mean and divided by its (population, ``ddof=0``) standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns from index 4 onwards are numeric features.

    Returns
    -------
    numpy.ndarray
        2-D float array with one row per row of ``df`` and one column per
        feature column. Constant columns come back as all zeros.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    real_X = df.iloc[:, 4:].to_numpy(dtype=float)

    col_mean = real_X.mean(axis=0)
    col_std = real_X.std(axis=0)
    # A constant column has std 0; divide by 1 instead so it maps to 0
    # rather than NaN/inf, which the clustering estimators cannot handle.
    col_std[col_std == 0] = 1.0

    return (real_X - col_mean) / col_std


def tfidf_X(df):
    """Return the TF-IDF transform of the feature columns of ``df``.

    The columns from position 4 onwards are passed as a count matrix to a
    ``TfidfTransformer`` constructed with its default settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns from index 4 onwards hold the counts.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array of TF-IDF weights.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    counts = df.iloc[:, 4:].to_numpy()
    return TfidfTransformer().fit_transform(counts).toarray()


def norm_tfidf_X(df):
    """Return the TF-IDF transform of the feature columns, then standardised.

    The columns from position 4 onwards are TF-IDF transformed with a
    default ``TfidfTransformer``; each resulting column is then centred on
    its mean and divided by its (population, ``ddof=0``) standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns from index 4 onwards hold the counts.

    Returns
    -------
    numpy.ndarray
        Dense 2-D float array. Columns that are constant after the TF-IDF
        step come back as all zeros.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    counts = df.iloc[:, 4:].to_numpy()
    real_X = TfidfTransformer().fit_transform(counts).toarray()

    col_mean = real_X.mean(axis=0)
    col_std = real_X.std(axis=0)
    # A constant column has std 0; divide by 1 instead so it maps to 0
    # rather than NaN/inf.
    col_std[col_std == 0] = 1.0

    return (real_X - col_mean) / col_std



In [None]:
def cls_explore(df, y):
    """Display a per-cluster interest profile and print each cluster's top interests.

    A copy of ``df`` gets a ``'cluster'`` column set to ``y``. Every column
    between position 4 and that new column is treated as an interest. For
    each cluster and interest the score is::

        (cluster sum / sum over all clusters) / cluster size * 100000

    The score table, with the cluster size in a ``'cluster'`` column, is
    displayed in full. Then one line per cluster is printed with the (up to)
    five highest-scoring interests.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns from index 4 onwards are interest columns.
        It is not modified.
    y : array-like
        One cluster label per row of ``df``, in the same row order.

    Returns
    -------
    None
    """
    df_clusters = df.copy()
    df_clusters['cluster'] = y
    interests = list(df_clusters.columns[4:-1])

    # Sizes and sums are computed separately: the nested-dict renaming form of
    # ``agg`` ({'cluster': {'size': len}}) is rejected by pandas >= 1.0.
    grouped = df_clusters.groupby('cluster')
    sizes = grouped.size()
    sums = grouped[interests].sum()

    # Share of each interest's total that falls in the cluster, per member.
    scores = sums.div(sums.sum(axis=0), axis=1).div(sizes, axis=0) * 100000

    res = scores.copy()
    res['cluster'] = sizes

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        try:
            from IPython import display
        except ImportError:
            # Outside IPython/Jupyter fall back to the plain-text repr.
            print(res)
        else:
            display.display(res)

    # Rank the interest columns only. The previous version ranked the size
    # column too and skipped the first entry, assuming size was always the
    # largest value in the row, which does not hold for small clusters.
    for label, row in scores.iterrows():
        top = row.nlargest(5).index
        # Print the real cluster label (e.g. -1 for DBSCAN noise), not the row position.
        print('Кластер {} : {}'.format(label, ' '.join(top)))

In [None]:
def k_plot(X, alg_name, quality, k_start, k_end, random_state, max_iter, sample_size=10000):
    """Plot a clustering quality criterion against the number of clusters.

    For every ``k`` in ``[k_start, k_end]`` an estimator is built with
    ``get_alg``, fitted on ``X``, and scored by ``quality`` on a shuffled
    subsample of at most ``sample_size`` rows.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    alg_name : str
        Short estimator name understood by ``get_alg``.
    quality : callable
        Called as ``quality(samples, labels)``; its result is stored as the
        criterion value for that ``k``.
    k_start, k_end : int
        Inclusive range of cluster counts to evaluate.
    random_state, max_iter
        Forwarded to ``get_alg``.
    sample_size : int, optional
        Maximum number of shuffled rows passed to ``quality`` (default 10000).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    ks = range(k_start, k_end + 1)
    criteria = np.empty(len(ks))

    for i, k in enumerate(ks):
        model = get_alg(alg_name, k=k, random_state=random_state, max_iter=max_iter)
        # fit_predict exists on KMeans, GaussianMixture and DBSCAN, whereas
        # DBSCAN has no predict(), so fit(X).predict(X) failed for 'db'.
        labels = model.fit_predict(X)
        # The shuffle is seeded with the module-level RAND_INIT, so every k
        # is scored on the same subsample of rows.
        x_shuffled, labels_shuffled = shuffle(X, labels, random_state=RAND_INIT)
        criteria[i] = quality(x_shuffled[:sample_size], labels_shuffled[:sample_size])

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(ks, criteria)
    ax.set_title("$J(k)$")
    ax.set_ylabel("Criteria $J$")
    ax.set_xlabel("Number of clusters $k$")
    ax.grid()
    plt.show()

In [None]:

def quality(x, y):
    """Silhouette score of labels ``y`` for samples ``x`` on a shuffled subsample.

    ``x`` and ``y`` are shuffled together using the module-level ``RAND_INIT``
    seed, and the score is computed on the first 10000 shuffled rows (or on
    all rows when there are fewer).
    """
    shuffled_x, shuffled_y = shuffle(x, y, random_state=RAND_INIT)
    sample_x = shuffled_x[:10000]
    sample_y = shuffled_y[:10000]
    return silhouette_score(sample_x, sample_y)



# Load the raw data set and discard eight rows identified by position.
# NOTE(review): the reason these particular rows are removed is not recorded
# here; presumably they are outliers or bad records. TODO confirm.
df_sns = pd.read_csv('snsdata.csv', sep=',')

# The positions are listed from highest to lowest, so removing one never
# shifted the position of the next. A single drop of all eight is equivalent.
rows_to_drop = [24337, 14187, 13224, 11195, 4950, 4129, 1777, 1169]
df_sns = df_sns.drop(df_sns.index[rows_to_drop])

In [None]:

# Feature matrix used by every model below: the columns of df_sns from
# position 4 onwards, standardised per column by norm_X.
X = norm_X(df_sns)

from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows as X_test; X is rebound to the remaining 75%.
# train_test_split shuffles by default, so after this cell the rows of X no
# longer line up with the rows of df_sns.
# NOTE(review): because X is overwritten, re-running this cell without first
# re-running the cell that builds X splits the already-split training set.
X, X_test = train_test_split(
    X, test_size=0.25, random_state=RAND_INIT)


In [None]:
# Two transposed peeks at the raw frame. Only the second expression is the
# cell's last value, so only it is rendered as output.
df_sns.iloc[:10].T.iloc[22:]
df_sns.iloc[:5].T.iloc[18:25]


In [None]:
# Fit an 8-cluster KMeans on the training split (timed with the %time magic),
# then assign a cluster label to every held-out row.
kmeans = get_alg('km', k=8, random_state=RAND_INIT, max_iter=300)
get_ipython().run_line_magic('time', 'kmeans.fit(X)')
y = kmeans.predict(X_test)



In [None]:
# Timed silhouette score (via quality) of the labels currently in y on the held-out split.
get_ipython().run_line_magic('time', 'quality(X_test, y)')



In [None]:
# Fit an 8-component Gaussian mixture on the training split (timed with the
# %time magic), then assign a component label to every held-out row.
# Note that y is overwritten, replacing the KMeans labels.
gm = get_alg('gm', k=8, random_state=RAND_INIT, max_iter=300)
get_ipython().run_line_magic('time', 'gm.fit(X)')
y = gm.predict(X_test)


In [None]:
# Timed silhouette score (via quality) of the labels currently in y on the held-out split.
get_ipython().run_line_magic('time', 'quality(X_test, y)')


In [None]:
# Cluster (at most) the first 30000 training rows with DBSCAN; the label -1 marks noise.
db = DBSCAN(eps=2.4)
get_ipython().run_line_magic('time', 'db.fit(X[:30000])')
y = db.labels_

# X is the shuffled training split, so its rows do not line up with df_sns.
# Repeating the same split (same length, test_size and seed) on row positions
# recovers which df_sns row each row of X came from.
train_pos, _ = train_test_split(
    np.arange(len(df_sns)), test_size=0.25, random_state=RAND_INIT)
assert len(train_pos) == X.shape[0], "X is not the training split produced above"

# Copy before adding a column so we do not assign into a slice of df_sns.
represent = df_sns.iloc[train_pos[:len(y)], :4].copy()
represent['cluster'] = y

# Cluster sizes, largest first. groupby().size() replaces the nested-dict
# renaming form of agg, which pandas >= 1.0 rejects.
rg = represent.groupby('cluster').size().rename('size').to_frame()
rg = rg.sort_values(by=['size'], ascending=False).reset_index()
print(rg)

print(represent)

In [None]:
# Profile the clusters in y. On a top-to-bottom run y holds the DBSCAN labels
# from the previous cell; any labelling of the training rows of X, in order,
# works here (e.g. kmeans.labels_).
# X is the shuffled training split, so repeat the same split on row positions
# to pick the matching df_sns rows instead of passing the whole frame.
train_pos, _ = train_test_split(
    np.arange(len(df_sns)), test_size=0.25, random_state=RAND_INIT)
cls_explore(df_sns.iloc[train_pos[:len(y)]], y)


In [None]:
# Timed silhouette score on the first 10000 rows of X and the first 10000 labels in y
# (unshuffled; assumes y labels the rows of X in order).
get_ipython().run_line_magic('time', 'silhouette_score(X[:10000], y[:10000])')


In [None]:
# Timed sweep: silhouette score of KMeans on X for k = 2..16, plotted against k.
get_ipython().run_line_magic('time',
                             "k_plot(X=X,     alg_name='km',     quality=silhouette_score,     k_start=2,     k_end=16,     random_state=RAND_INIT,     max_iter=300 )")


In [None]:
# Timed sweep: silhouette score of GaussianMixture on X for k = 2..16, plotted against k.
get_ipython().run_line_magic('time',
                             "k_plot(X=X,     alg_name='gm',     quality=silhouette_score,     k_start=2,     k_end=16,     random_state=RAND_INIT,     max_iter=300 )")


In [None]:
# db
# NOTE(review): the two numbers below are presumably scores recorded from
# earlier runs of this cell; which configuration produced each is not stated. TODO confirm.
# 0.45324793458963125
# 0.017228049478748612
# Timed quality (shuffled silhouette) of the first 10000 rows of X and labels in y.
get_ipython().run_line_magic('time', 'quality(X[:10000], y[:10000])')

In [None]:
# Persist the training feature matrix and the KMeans training labels.
# np.save writes the binary .npy format, so the files must be opened in
# binary mode; text mode ('w') raises TypeError on Python 3.
with open('idf.data', 'wb') as f:
    np.save(f, X)
with open('idf.labels', 'wb') as f:
    np.save(f, kmeans.labels_)