## We will tackle the clustering of survey data that consists mainly of binary (0/1) values.
---
* Find number of significant components
* Calc correlation without noise
* PCA loadings
  * Investigation of column-side aggregation
* PCA scores
  * Investigation of number of clusters

In [None]:
!pip install git+https://github.com/darecophoenixx/wordroid.sblo.jp

In [None]:
import os

# Create the folder that save_img() writes figures into.
# exist_ok=True makes the cell safe to re-run (a plain `mkdir img` errors when
# the folder already exists) and avoids relying on a shell alias.
os.makedirs('img', exist_ok=True)

In [None]:
%matplotlib inline
from IPython.display import SVG, Image
from tensorflow.keras.utils import model_to_dot

In [None]:
from feature_eng import m01f

In [None]:
import os.path
import sys
import re
import itertools
import csv
import datetime
import pickle
import random
from collections import defaultdict, Counter
import gc

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import gensim

from sklearn.metrics import f1_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
from sklearn import mixture
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

import gensim
from tensorflow.keras.preprocessing.sequence import skipgrams
import tensorflow as tf

In [None]:
def hexbin(x, y, color, **kwargs):
    """Draw a hexagonal-bin density plot of ``x`` against ``y``.

    Used below as a ``PairGrid`` mapping function. The bins are shaded with a
    light sequential colormap built from ``color``; all remaining keyword
    arguments are forwarded unchanged to ``plt.hexbin``.
    """
    plt.hexbin(x, y, cmap=sns.light_palette(color, as_cmap=True), **kwargs)
def scatter(x, y, color, **kwargs):
    """Plot ``x`` against ``y`` using small dot markers.

    Used below as a ``PairGrid`` mapping function. ``color`` and any extra
    keyword arguments are accepted but not forwarded to matplotlib, so the
    points are drawn with the default styling.
    """
    dot_style = {'marker': '.'}
    plt.scatter(x, y, **dot_style)

In [None]:
# Base folder for outputs ('' means the current working directory).
tgt_dir = ''
# Prefix put in front of every saved image name.
filename = 'ClusteringOfQuantitativeSurveyData'

# Running number of the images saved so far; incremented by save_img().
img_cnt = 0
def save_img():
    """Save the current matplotlib figure as the next numbered JPEG.

    Every call increments the module-level counter ``img_cnt``, prints the
    target path and writes the figure to
    ``<tgt_dir>/img/<filename>_img_NNN.jpeg``.

    The ``img`` folder is created on demand, so saving no longer fails with
    ``FileNotFoundError`` when the folder has not been created beforehand.
    """
    global img_cnt
    img_cnt += 1
    img_dir = os.path.join(tgt_dir, 'img')
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(img_dir, exist_ok=True)
    img_name = 'img_{0:03}.jpeg'.format(img_cnt)
    img_name = os.path.join(img_dir, filename+'_'+img_name)
    print(img_name)
    plt.savefig(img_name)

# Load Sample Data
---

In [None]:
# Load the sample data; the first CSV column is used as the row index.
X_df = pd.read_csv('../input/sample-data-wordanddoc2vec/sample007.csv', index_col=0)
print(X_df.shape)
# Preview the first rows (rendered as the cell output).
X_df.head()

In [None]:
# Show the data matrix as an image. It is transposed, so the DataFrame columns
# run along the vertical axis and the rows along the horizontal axis.
plt.figure(figsize=(15, 15))
plt.imshow(X_df.values.T)
save_img()

In [None]:
# Heatmap of the raw data on a fixed [-1, 1] diverging colour scale
# (per-cell annotations are left switched off here).
f, ax = plt.subplots(figsize=(4, 15))
ax = sns.heatmap(
    X_df.values,
    ax=ax,
    cmap='coolwarm', vmin=-1, vmax=1,
)
save_img()

# Find number of significant components
---
* Compare the eigenvalues of the observed data with those of a random data matrix of the same size as the original.

In [None]:
# Run the component-count diagnostic on the raw values. The return value is
# bound to `_` so that it is not echoed as the cell output.
# NOTE(review): presumably the helper prints/plots its own result — confirm in m01f.
_ = m01f.find_ncomponents_pca(X_df.values)

# The number of significant components is [4]
---

In [None]:
# Number of significant components chosen above; reused by the PCA and
# noise-removal steps in the rest of the notebook.
n_sig = 4

# Calc correlation without noise
---
```python
def calc_cor_nonoise(c, n_sig=3):
    try:
        w, v = np.linalg.eig(c)
    except Exception as e:
        print(e)
        c1 = cor_smooth(c)
        w, v = np.linalg.eig(c1)
        print('"Matrix was not positive definite, smoothing was done"')
    m = v[:,:n_sig].dot(np.diag(w[:n_sig])).dot(v[:,:n_sig].T)
    d = np.sqrt(np.diag(m)).reshape((v.shape[0],1))
    m = m / d / d.T
    return m
```

In [None]:
# Rebuild the correlation matrix of X_df from its leading n_sig components
# (see the calc_cor_nonoise snippet shown above).
cor_nonoise = m01f.calc_cor_nonoise(X_df.corr(), n_sig=n_sig)
# Echo the shape as a sanity check.
cor_nonoise.shape

## Show correlation

In [None]:
# Annotated heatmap of the noise-reduced correlation matrix.
f, ax = plt.subplots(figsize=(17, 15))
ax = sns.heatmap(
    cor_nonoise,
    ax=ax,
    cmap='coolwarm', vmin=-1, vmax=1,
    annot=True, annot_kws={'size': 8}, fmt='.2f',
)
save_img()

## Plot correlation of original data

In [None]:
# Annotated heatmap of the plain correlation matrix of the original data,
# for comparison with the noise-reduced version.
f, ax = plt.subplots(figsize=(17, 15))
ax = sns.heatmap(
    X_df.corr(),
    ax=ax,
    cmap='coolwarm', vmin=-1, vmax=1,
    annot=True, annot_kws={'size': 8}, fmt='.2f',
)
save_img()

# PCA loadings
---

## original data

In [None]:
# Fit a PCA with n_sig components on the original data.
pca = PCA(n_components=n_sig)
pca.fit(X_df)

In [None]:
# Pair plot of the PCA loadings: one row per original variable, one column
# per component. Histograms on the diagonal, dots below, hexbins above.
sns.set_context('paper')
df = pd.DataFrame(pca.components_.T)
g = (
    sns.PairGrid(df, height=2.5)
    .map_diag(plt.hist, edgecolor="w")
    .map_lower(scatter)
    .map_upper(hexbin)
)
save_img()

## Loadings of correlation without noise
```python
def calc_loadings_nonoise(c):
    w, v = np.linalg.eigh(c)
    idx = np.argsort(w)[::-1]
    w, v = w[idx], v[:,idx]
    return v
```

In [None]:
# Loadings derived from the noise-reduced correlation matrix, followed by the
# same pair plot layout as for the PCA loadings.
loadings_nonoise = m01f.calc_loadings_nonoise(cor_nonoise, n_sig=n_sig)
print(loadings_nonoise.shape)
sns.set_context('paper')
df = pd.DataFrame(loadings_nonoise)
g = (
    sns.PairGrid(df, height=2.5)
    .map_diag(plt.hist, edgecolor="w")
    .map_lower(scatter)
    .map_upper(hexbin)
)
save_img()

# Investigation of column-side aggregation
---

In [None]:
%%time
# Run m01f.mclust on the noise-reduced loadings and plot its result, which is
# used below to choose the number of variable groups.
# NOTE(review): presumably a model-selection sweep over cluster counts — confirm in m01f.
df = pd.DataFrame(loadings_nonoise)
res = m01f.mclust(df)

m01f.plot_mclust(res, figsize=(7,7))
save_img()

## Specify the number of groups
* The appropriate number of groups seems to be [5]

In [None]:
'''specify the number of groups (n_components=)'''
# Fit a 5-component Gaussian mixture to the loadings; n_init=30 keeps the best
# of 30 k-means-initialised runs.
# NOTE: no random_state is passed, so group numbering can differ between runs.
gm = mixture.GaussianMixture(n_components=5, init_params='kmeans', n_init=30)
gm.fit(loadings_nonoise)
# Hard group label per variable = component with the highest posterior probability.
np.argmax(gm.predict_proba(loadings_nonoise), axis=1)

## Probability that each variable belongs to each group

In [None]:
'''Probability of which group each variable belongs to'''
# Posterior membership probabilities: one row per variable (labelled with the
# X_df column names), one column per mixture component.
df_z = pd.DataFrame(gm.predict_proba(loadings_nonoise), index=X_df.columns)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0.
# A fixed-point format string gives the same one-decimal display and works on
# both old and new pandas versions.
df_z.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1).format('{:.1f}')

## Plot loadings without noise (by group)

In [None]:
# Pair plot of the noise-reduced loadings, coloured by the mixture component
# with the highest posterior probability for each variable.
sns.set_context('paper')
df = pd.DataFrame(loadings_nonoise)
df['cls'] = ['cls' + str(k) for k in gm.predict_proba(loadings_nonoise).argmax(axis=1)]
sns.pairplot(df, hue='cls', markers='o', diag_kind='hist', height=2.5)
save_img()

## Factor Analysis

In [None]:
# Fit a 5-factor model on the original data for comparison.
# NOTE(review): 5 is hard-coded here (same as the mixture above) rather than n_sig — confirm intent.
fa = FactorAnalysis(n_components=5)
fa.fit(X_df)
# Echo the shape of the fitted components as a sanity check.
fa.components_.shape

In [None]:
# Factor loadings as a table: one row per variable, one column per factor.
df = pd.DataFrame(fa.components_.T, index=X_df.columns)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0.
# A fixed-point format string gives the same two-decimal display and works on
# both old and new pandas versions.
df.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1).format('{:.2f}')

# Clustering
---

## Calc X_df without noise
```python
def calc_mat_nonoise(mat, n_sig=3):
    ss = StandardScaler()
    ss.fit(mat)
    x_sc = ss.transform(mat)
    u, s, vh = np.linalg.svd(x_sc)
    x_sc2 = u[:,:n_sig].dot(np.diag(s[:n_sig] * s.sum() / s[:n_sig].sum())).dot(vh[:n_sig])
    return ss.inverse_transform(x_sc2)
```

In [None]:
# Noise-reduced version of the data matrix built from n_sig components
# (see the calc_mat_nonoise snippet shown above), wrapped in a DataFrame that
# reuses the row and column labels of X_df.
mat_nonoise = m01f.calc_mat_nonoise(X_df.values, n_sig=n_sig)
X_df_nonoise = pd.DataFrame(mat_nonoise, index=X_df.index, columns=X_df.columns)
# Echo the shape as a sanity check.
X_df_nonoise.shape

## Correlation of X_df_nonoise

In [None]:
# Annotated heatmap of the correlation matrix of the noise-reduced data.
f, ax = plt.subplots(figsize=(17, 15))
ax = sns.heatmap(
    X_df_nonoise.corr(),
    ax=ax,
    cmap='coolwarm', vmin=-1, vmax=1,
    annot=True, annot_kws={'size': 8}, fmt='.2f',
)
save_img()

## Calc scores

In [None]:
# Score of each row = cosine similarity between the noise-reduced row and each
# loading vector (the loadings are transposed so each vector becomes a row).
scores_nonoise = cosine_similarity(X_df_nonoise.values, loadings_nonoise.T)
# Echo the shape as a sanity check.
scores_nonoise.shape

In [None]:
# Pair plot of the noise-reduced scores: histograms on the diagonal, dots
# below, hexbins above.
sns.set_context('paper')
df = pd.DataFrame(scores_nonoise)
g = (
    sns.PairGrid(df, height=2.5)
    .map_diag(plt.hist, edgecolor="w")
    .map_lower(scatter)
    .map_upper(hexbin)
)
save_img()

In [None]:
# Overlay row scores ('row') and column loadings ('col') in one pair plot,
# using at most the first five dimensions of each.
df1 = pd.DataFrame(scores_nonoise[:, :5]).assign(cls='row')
df2 = pd.DataFrame(loadings_nonoise[:, :5]).assign(cls='col')
df = pd.concat([df1, df2])
sns.set_context('paper')
sns.pairplot(df, hue='cls', markers=['.', 's'], diag_kind='hist', height=2.5)
save_img()

## Plot PCA scores of original data

In [None]:
# Pair plot of the PCA scores of the original data, for comparison with the
# noise-reduced scores.
sns.set_context('paper')
df = pd.DataFrame(pca.transform(X_df))
g = (
    sns.PairGrid(df, height=2.5)
    .map_diag(plt.hist, edgecolor="w")
    .map_lower(scatter)
    .map_upper(hexbin)
)
save_img()

## Investigation of number of clusters

In [None]:
%%time
# Run m01f.mclust on the row scores and plot its result, which is used below
# to choose the number of row clusters.
# NOTE(review): presumably a model-selection sweep over cluster counts — confirm in m01f.
res = m01f.mclust(scores_nonoise)
m01f.plot_mclust(res, figsize=(7,7))
save_img()

# The number of clusters: [5]
---

In [None]:
# Fit a 5-component Gaussian mixture to the row scores; n_init=30 keeps the
# best of 30 k-means-initialised runs. This rebinds `gm`, replacing the
# mixture fitted on the loadings earlier.
# NOTE: no random_state is passed, so cluster numbering can differ between runs.
gm = mixture.GaussianMixture(n_components=5, init_params='kmeans', n_init=30)
gm.fit(scores_nonoise)
# Hard cluster label per row = component with the highest posterior probability.
np.argmax(gm.predict_proba(scores_nonoise), axis=1)

In [None]:
# Copy of the original data with the assigned cluster label added as 'cls'.
df_org2 = X_df.copy()
df_org2['cls'] = np.argmax(gm.predict_proba(scores_nonoise), axis=1)
#df_org2

In [None]:
# Number of rows assigned to each cluster.
cnt = df_org2.groupby('cls').size()
cnt

In [None]:
# For every variable, cross-tabulate cluster label against the variable's
# value, normalised per cluster; margins=True appends an overall 'All' row.
ll = []
for ee in X_df.columns.values:
    tmp = pd.crosstab(df_org2['cls'], X_df[ee], margins=True, normalize='index')
    # Keep the second value column, i.e. the share of rows with the larger
    # value (1 for 0/1 data) in each cluster.
    ll.append(tmp.values[:,1])

# Rows = variables, columns = clusters plus 'All' (labels taken from the last crosstab).
cross_tab = pd.DataFrame(np.c_[ll], index=X_df.columns, columns=tmp.index)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0.
# A fixed-point format string gives the same one-decimal display and works on
# both old and new pandas versions.
cross_tab.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1).format('{:.1f}')

In [None]:
# Pair plot of the row scores, coloured by the cluster label stored in df_org2.
sns.set_context('paper')
df = pd.DataFrame(scores_nonoise)
df['cls'] = ['cls' + str(label) for label in df_org2.cls.values]
sns.pairplot(df, hue='cls', markers='o', diag_kind='hist', height=2.5)
save_img()

# X_df is the data shown below with its rows shuffled
---

In [None]:
# Recover the unshuffled row order: strip the leading 'r' from each index
# label and sort the rows by the remaining integer.
idx = np.argsort([int(re.sub('^r', '', ee)) for ee in X_df.index.values])
# Show the reordered matrix as an image, transposed in the same way as the first plot.
plt.figure(figsize=(15, 15))
plt.imshow(X_df.iloc[idx].values.T)
save_img()

In [None]:
# Heatmap of the data with rows restored to their sorted order, on a fixed
# [-1, 1] diverging colour scale (per-cell annotations are left switched off).
f, ax = plt.subplots(figsize=(4, 15))
ax = sns.heatmap(
    X_df.iloc[idx].values,
    ax=ax,
    cmap='coolwarm', vmin=-1, vmax=1,
)
save_img()