In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples,silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox, skew
from sklearn.decomposition import PCA, KernelPCA

In [25]:
def hacForNumClusters(X,n):
    for cluster_num in range(2,n):
        hac(X,cluster_num)
        
def hac(X,cluster_num):
    hac = AgglomerativeClustering(n_clusters = cluster_num)
    labels = hac.fit_predict(X)
    sil = silhouette_score(X, labels)
    db = davies_bouldin_score(X, labels)
    print('clus {}: {}, {}'.format(cluster_num, sil, db))
    return labels
        
def cluster(filepath, num_clus, mult=False):
    df = pd.read_csv(filepath)
    df = df.drop(['Bios'], axis=1)
    
    X = df.to_numpy()
    #minmax scaling
    X_minmax = MinMaxScaler().fit_transform(X)
    
    #dimensionality reduction
    pca = PCA(n_components = 0.99)
    X = pca.fit_transform(X_minmax)
    
    if mult:
        hacForNumClusters(X,num_clus)
    else:
        labels = hac(X,num_clus)
        return labels

In [26]:
print('dataset 1')
labels1 = cluster('./TEST1.csv',30,True)

print('dataset 2')
labels2 = cluster('./TEST2.csv',30,True)

dataset 1
clus 2: 0.1311097924548254, 2.523718796851532
clus 3: 0.11350183171234078, 2.2757643143609925
clus 4: 0.14534076344714095, 2.3041647128305023
clus 5: 0.12320377464402427, 2.121339691574551
clus 6: 0.1275179232263156, 2.0207430460933256
clus 7: 0.14704446279755468, 2.0778900347390574
clus 8: 0.16314571503828834, 2.0841849447444365
clus 9: 0.1504842018850041, 2.1128459750676942
clus 10: 0.14372754804549118, 2.0748823733230566
clus 11: 0.12967490803579237, 2.077398102070468
clus 12: 0.11832964096403796, 2.1589801052345945
clus 13: 0.1173143982540348, 2.2308298722974507
clus 14: 0.11125248853191569, 2.2778935785983645
clus 15: 0.10580318411999981, 2.277235846905591
clus 16: 0.10658404563054125, 2.2952460158294894
clus 17: 0.10665230251282798, 2.2283954064427185
clus 18: 0.10697184244834382, 2.1831869212699657
clus 19: 0.10488701755371797, 2.2158874283456726
clus 20: 0.10442376431722247, 2.226255295150084
clus 21: 0.10440408669174732, 2.1774232022437423
clus 22: 0.1053272806406871

In [32]:
print('dataset 1')
labels1 = cluster('./TEST1.csv',8)

print('dataset 2')
labels2 = cluster('./TEST2.csv',8)

dataset 1
clus 8: 0.16314571503828834, 2.0841849447444365
dataset 2
clus 8: 0.1544244944427472, 2.1163278440328614


In [53]:
import os

#os.mkdir(os.getcwd() + '/' + someWord)
def please(filepath, labels, n, someWord):
    df_ori = pd.read_csv(filepath)
    df_ori['cluster #'] = labels
    #os.mkdir(os.getcwd() + '/' + someWord)
    
    
    for cluster in range(n):
        yes = df_ori[df_ori['cluster #'] == cluster]
        filename = '{}/{}.csv'.format(someWord, str(cluster))
        yes.to_csv(filename)
        
    

In [54]:
please('./TEST1.csv',labels1,8,'woohoo1')

please('./TEST2.csv',labels2,8,'woohoo2')

clustering the clusters

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def nlp_clus(filename, n, desc=False):
    df = pd.read_csv(filename)
    # Instantiating the Vectorizer, experimenting with both
    vectorizer = CountVectorizer()
    #vectorizer = TfidfVectorizer()

    # Fitting the vectorizer to the Bios
    x = vectorizer.fit_transform(df['Bios'])

    # Creating a new DF that contains the vectorized words
    df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

    # Concating the words DF with the original DF
    new_df = pd.concat([df, df_wrds], axis=1)

    # Dropping the Bios because it is no longer needed in place of vectorization
    new_df.drop('Bios', axis=1, inplace=True)

    # Instantiating PCA
    pca = PCA()

    # Fitting and Transforming the DF
    df_pca = pca.fit_transform(new_df)

    # Reducing the dataset to the number of features determined before
    pca = PCA(n_components=0.99)

    # Fitting and transforming the dataset to the stated number of features and creating a new DF
    df_pca = pca.fit_transform(new_df)

    # Seeing the variance ratio that still remains after the dataset has been reduced
    print(pca.explained_variance_ratio_.cumsum()[-1])
    
    labels = cluster(filename,n,False)
    return labels

In [81]:
def nlp_cluster_for_cluster_spec(dir_spec):
    dirname = '{}/{}'.format(os.getcwd(),dir_spec)
    for filename in os.listdir(dirname):
        labels = nlp_clus('{}/{}'.format(dirname, filename), 30, True)

In [83]:
nlp_cluster_for_cluster_spec('./woohoo1')

0.9984505754157542
clus 2: 0.09281559688201381, 2.648893675067007
clus 3: 0.09370652810575969, 2.278291998173438
clus 4: 0.09101351537390083, 2.0695269875470514
clus 5: 0.10545131959603032, 1.8744987677574483
clus 6: 0.10903373482777731, 1.6883502928282772
clus 7: 0.12510213330397915, 1.6182800098023566
clus 8: 0.12397210235821858, 1.5269026827052337
clus 9: 0.12545079115807076, 1.47782035222056
clus 10: 0.13288180286111717, 1.4111439477308152
clus 11: 0.1346431371650235, 1.439164905359859
clus 12: 0.1453484444361042, 1.393878611039578
clus 13: 0.15573235784162306, 1.3475912422785956
clus 14: 0.15796511180207543, 1.274121638704496
clus 15: 0.1619404112575749, 1.2388273893326451
clus 16: 0.1670909949198595, 1.20947030782854
clus 17: 0.1726345774801811, 1.175631765058812
clus 18: 0.18195178382718852, 1.1385213864972539
clus 19: 0.17172818856043742, 1.0918516410460555
clus 20: 0.16776577438881243, 1.0404600794923426
clus 21: 0.15872599255240785, 1.0156284685471753
clus 22: 0.1597096796702

clus 28: 0.1576120025411879, 0.6248992103455581
clus 29: 0.1473452828841026, 0.6118456876540717
0.9981442008155497
clus 2: 0.1331121076840002, 1.9534263365076359
clus 3: 0.12658051260088826, 2.0375664402877494
clus 4: 0.13170085036247725, 1.8398891809192701
clus 5: 0.13737831498438366, 1.6913706628214111
clus 6: 0.15259203375895006, 1.5682499578551372
clus 7: 0.1681001443873215, 1.4498921031566105
clus 8: 0.17419856798784353, 1.4759001177687279
clus 9: 0.1834649911300235, 1.3706545925556413
clus 10: 0.19876450164764617, 1.3060676297655627
clus 11: 0.20074229565414525, 1.218636084886699
clus 12: 0.20527147483518488, 1.1926419645207047
clus 13: 0.21589550222830883, 1.1394508344964385
clus 14: 0.2136601523654786, 1.0952281991579258
clus 15: 0.2141745483414077, 1.0844416497798062
clus 16: 0.20994476808280835, 1.0283779351971862
clus 17: 0.20956508650447392, 0.9854024413943508
clus 18: 0.20946242583281782, 0.91139043876364
clus 19: 0.20554386473114397, 0.8704449648200869
clus 20: 0.20625129

In [104]:
def nlp_cluster_for_cluster(dir_spec, n):
    dirname = '{}/{}'.format(os.getcwd(),dir_spec)
    for filename in os.listdir(dirname):
        path = '{}/{}'.format(dirname, filename)
        labels = nlp_clus(path, n, False)
        print(labels)
        please(dirname + '/' + filename,labels,n,'woohoo2_{}'.format(filename))

In [112]:
nlp_cluster_for_cluster('./woohoo2', 18)

0.9985212170402835
clus 18: 0.1673587612356257, 0.9616231609903498
[14  2 11  2  2 11  8  8 10  0  2  3 14 10 10 12  8  2 10  6  3 12  0  3
  0  4  9  0 15  5  9 13 13  1  4  5 17  5 16  1  4  5  6  1  7  1]
0.9981113033590082
clus 18: 0.2245960660126046, 0.8268770058085149
[12 16 13 15 10 12  1 13  1  3 10  6  5  3  1  6  9  6  7  2  4  0  8  4
  2  9  5  0  7  8  7 11 11 17  2  0  5  4 14]
0.9982582121633528
clus 18: 0.14821589719817846, 1.138809437248698
[ 3 17 15 11 11 17  1 12  6  1  1  6 15  7  5 11  3  4  0  7  8 15  3  7
 12  1  0  2  0  1 10  0  3  5  5  0  9  8  8 16  2  4  9  9  4 14  4 16
 13 10  2 14]
0.9979040017646877
clus 18: 0.1596679334129919, 0.9283787523823778
[12 11 10  0  0  8  0 16  1  5  5  8  0 14  1 10  1 11 13  7 17 16  2  6
  6  3  3 13 15 15  8  7  4  6  9  2  4  3  2  9]
0.9980104220922025
clus 18: 0.17562821788946123, 1.0713914464426784
[11 12  1 17  1 11 17  4  7 10 12 13 13 13  4  0  4 10  1 11 15  2  4  4
  0  3  7  1  0 14 14  2 10 13 15  2  8  9  2  

In [108]:
nlp_cluster_for_cluster_spec('./woohoo2')

0.9985212170402835
clus 2: 0.13858049565563965, 2.1936161295189107
clus 3: 0.15087070774277922, 1.7586828648931403
clus 4: 0.12243946881349771, 1.809400965643634
clus 5: 0.13574382622533232, 1.6780404374681506
clus 6: 0.14012023904481338, 1.6453631564153346
clus 7: 0.1388077289930433, 1.625861177234928
clus 8: 0.14288131176259566, 1.5027911054809222
clus 9: 0.15262206673145318, 1.3943813217099086
clus 10: 0.1561262732316628, 1.360371701042234
clus 11: 0.16144817299148725, 1.273715017050711
clus 12: 0.17537163778756346, 1.2199640128116724
clus 13: 0.179115372525265, 1.20065026898033
clus 14: 0.18356075279875603, 1.1631831463479088
clus 15: 0.18701445882157214, 1.1325076339903102
clus 16: 0.1799155379245125, 1.0806424377451105
clus 17: 0.16842986934565896, 1.0338585820413417
clus 18: 0.1673587612356257, 0.9616231609903498
clus 19: 0.16354520289374772, 0.9194114469248651
clus 20: 0.1616684607428307, 0.8950087351938262
clus 21: 0.15846002473384774, 0.9048582626881111
clus 22: 0.15580141843

clus 15: 0.17605843728041998, 1.0746654254141328
clus 16: 0.17700735643443177, 1.0456554175208326
clus 17: 0.17600874666330432, 1.0237864451609777
clus 18: 0.17628125780901366, 0.9745863828167081
clus 19: 0.17432860973134218, 0.9743116434036269
clus 20: 0.16745464021267198, 0.9439466647925272
clus 21: 0.16482447780233508, 0.9356984139580968
clus 22: 0.16832221932493918, 0.9299299987194839
clus 23: 0.16178254222920552, 0.9024901071319817
clus 24: 0.15567417972649655, 0.851871841650972
clus 25: 0.14985203913024853, 0.8245177063913846
clus 26: 0.14430402682074997, 0.7699048252956077
clus 27: 0.13729143500172708, 0.7318953345808223
clus 28: 0.12825950214822662, 0.6969154588020572
clus 29: 0.11923356580669509, 0.6722571517883678
0.9983748341350666
clus 2: 0.11408908515241749, 2.288556134649239
clus 3: 0.12022012284017967, 2.0478873445836023
clus 4: 0.09495813892753023, 1.9779710252233675
clus 5: 0.10630740382944338, 1.7995815390601984
clus 6: 0.11859619236098916, 1.7655465671911454
clus 7: 