In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_samples,silhouette_score, davies_bouldin_score
from sklearn.preprocessing import power_transform, MaxAbsScaler, StandardScaler, MinMaxScaler, RobustScaler
from scipy.stats import boxcox, skew
from sklearn.decomposition import PCA, KernelPCA

In [3]:
df = pd.read_csv('../../algoRev1/mockData/profilesNoBio.csv')
X = df.to_numpy()
df.head()

Unnamed: 0.1,Unnamed: 0,Year,Major_Dept,Ambition,Goals,Music,Sports,Friday_Nights,School_balance,Movie_Genres,Social_Style,Transfer
0,0,4,8,3,1,4,1,2,1,7,3,2
1,1,3,3,1,2,8,1,1,2,4,2,1
2,2,1,7,2,2,2,10,1,3,11,2,2
3,3,4,5,1,1,8,3,2,2,6,3,2
4,4,2,4,2,1,3,9,2,2,7,1,2


In [4]:
def allTheNormalizers(X):
    X_norm = power_transform(X, method='yeo-johnson', standardize=True, copy=True)
    X_mas = MaxAbsScaler().fit_transform(X)
    X_minmax = MinMaxScaler().fit_transform(X)
    X_stand = StandardScaler().fit_transform(X)
    X_robus = RobustScaler().fit_transform(X)

    allOfThem = {
        'X_norm': {'s':X_norm},
        'X_mas': {'s':X_mas},
        'X_minmax': {'s':X_minmax},
        'X_stand': {'s':X_stand},
        'X_robus': {'s':X_robus}
        }
    return allOfThem

def pcaAll(allOfThem):
    pca = PCA(n_components = 0.99)

    for X in allOfThem.keys():
        pcad = pca.fit_transform(allOfThem[X]['s'])
        allOfThem[X]['pca']= pcad
    return allOfThem

def hacForNumClusters(X,n):
    for cluster_num in range(2,n):
        hac = AgglomerativeClustering(n_clusters = cluster_num)
        labels = hac.fit_predict(X)
        sil = silhouette_score(X, labels)
        db = davies_bouldin_score(X, labels)
        print('clus {}: {}, {}'.format(cluster_num, sil, db))
    print('\n')
    
def hacForAll(allOfThem):
    for X in allOfThem.keys():
        print(X)
        hacForNumClusters(allOfThem[X]['pca'],30)

In [6]:
allOfThem = allTheNormalizers(X)
allOfThem2 = pcaAll(allOfThem)
hacForAll(allOfThem2)

X_norm
clus 2: 0.04113851193827677, 4.330571823400186
clus 3: 0.0336920056423997, 4.314711253981897
clus 4: 0.03239452797617588, 3.6979536805954774
clus 5: 0.02700785615120672, 3.505154116929526
clus 6: 0.02880904838849914, 3.426285887903672
clus 7: 0.031091480871961202, 3.1435770977363546
clus 8: 0.03125962779960672, 3.0366259078684643
clus 9: 0.03233399214053166, 2.896579705131559
clus 10: 0.03395305377405382, 2.769826609527695
clus 11: 0.03042155315344896, 2.705931705691
clus 12: 0.03144985612241269, 2.6098356278124406
clus 13: 0.03293613952671352, 2.558361100589797
clus 14: 0.03615997987026932, 2.5267623825391277
clus 15: 0.03640849493197869, 2.5015103503560088
clus 16: 0.03750926299486978, 2.4340252430093168
clus 17: 0.040287211914447764, 2.4453690220794946
clus 18: 0.037057480750657096, 2.383302641898928
clus 19: 0.035973356218702596, 2.3462939845728727
clus 20: 0.03832078918141002, 2.2960737786585392
clus 21: 0.04026222039069634, 2.2809128188027135
clus 22: 0.03785275660584055, 