In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples,silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox, skew
from sklearn.decomposition import PCA, KernelPCA

In [28]:
def hacForNumClusters(X,n):
    for cluster_num in range(2,n):
        hac(X,cluster_num)
        
def hac(X,cluster_num):
    hac = AgglomerativeClustering(n_clusters = cluster_num)
    labels = hac.fit_predict(X)
    sil = silhouette_score(X, labels)
    db = davies_bouldin_score(X, labels)
    print('clus {}: {}, {}'.format(cluster_num, sil, db))
    return labels
        
def cluster(filepath, num_clus, mult=False):
    df = pd.read_csv(filepath)
    df = df.drop(['Bios'], axis=1)
    
    X = df.to_numpy()
    #minmax scaling
    X_minmax = MinMaxScaler().fit_transform(X)
    
    #dimensionality reduction
    pca = PCA(n_components = 0.99)
    X = pca.fit_transform(X_minmax)
    
    if mult:
        hacForNumClusters(X,num_clus)
    else:
        labels = hac(X,num_clus)
        return labels

In [29]:
print('dataset 1')
labels1 = cluster('./org.csv',30,True)

print('dataset 2')
labels2 = cluster('./org.csv',30,True)

dataset 1
clus 2: 0.13110979245482507, 2.523718796851537
clus 3: 0.11350183171234067, 2.27576431436099
clus 4: 0.14534076344714092, 2.3041647128304996
clus 5: 0.12320377464402425, 2.1213396915745486
clus 6: 0.12751792322631578, 2.020743046093323
clus 7: 0.14704446279755481, 2.077890034739056
clus 8: 0.1631457150382885, 2.084184944744435
clus 9: 0.15048420188500422, 2.1128459750676942
clus 10: 0.14372754804549123, 2.0748823733230566
clus 11: 0.1296749080357924, 2.0773981020704677
clus 12: 0.118329640964038, 2.1589801052345945
clus 13: 0.11731439825403481, 2.2308298722974502
clus 14: 0.11125248853191574, 2.2778935785983645
clus 15: 0.10580318411999988, 2.277235846905591
clus 16: 0.10658404563054132, 2.2952460158294894
clus 17: 0.10665230251282805, 2.2283954064427185
clus 18: 0.10697184244834389, 2.183186921269965
clus 19: 0.10488701755371804, 2.215887428345672
clus 20: 0.10442376431722256, 2.226255295150083
clus 21: 0.10440408669174742, 2.177423202243742
clus 22: 0.10532728064068718, 2.1

In [30]:
print('dataset 1')
labels1 = cluster('./org.csv',8)

print('dataset 2')
labels2 = cluster('./org.csv',8)

dataset 1
clus 8: 0.1631457150382885, 2.084184944744435
dataset 2
clus 8: 0.1631457150382885, 2.084184944744435


In [31]:
import os

#os.mkdir(os.getcwd() + '/' + someWord)
def please(filepath, labels, n, someWord):
    df_ori = pd.read_csv(filepath)
    df_ori['cluster #'] = labels
    #os.mkdir(os.getcwd() + '/' + someWord)
    
    
    for cluster in range(n):
        yes = df_ori[df_ori['cluster #'] == cluster]
        filename = '{}/{}.csv'.format(someWord, str(cluster))
        yes.to_csv(filename)
        
    

In [32]:
please('./org.csv',labels1,8,'woohoo1')

please('./org.csv',labels2,8,'woohoo2')

FileNotFoundError: [Errno 2] No such file or directory: 'woohoo2/0.csv'

clustering the clusters

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def nlp_clus(filename, n, desc=False):
    df = pd.read_csv(filename)
    # Instantiating the Vectorizer, experimenting with both
    vectorizer = CountVectorizer()
    #vectorizer = TfidfVectorizer()

    # Fitting the vectorizer to the Bios
    x = vectorizer.fit_transform(df['Bios'])

    # Creating a new DF that contains the vectorized words
    df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

    # Concating the words DF with the original DF
    new_df = pd.concat([df, df_wrds], axis=1)

    # Dropping the Bios because it is no longer needed in place of vectorization
    new_df.drop('Bios', axis=1, inplace=True)

    # Instantiating PCA
    pca = PCA()

    # Fitting and Transforming the DF
    df_pca = pca.fit_transform(new_df)

    # Reducing the dataset to the number of features determined before
    pca = PCA(n_components=0.99)

    # Fitting and transforming the dataset to the stated number of features and creating a new DF
    df_pca = pca.fit_transform(new_df)

    # Seeing the variance ratio that still remains after the dataset has been reduced
    print(pca.explained_variance_ratio_.cumsum()[-1])
    
    labels = cluster(filename,n,False)
    print(labels)
    return labels

In [10]:
def nlp_cluster_for_cluster_spec(dir_spec):
    dirname = '{}/{}'.format(os.getcwd(),dir_spec)
    for filename in os.listdir(dirname):
        labels = nlp_clus('{}/{}'.format(dirname, filename), 30, True)
        print(labels)
        please(dirname + '/' + filename,labels,n,'woohoo2_{}'.format(filename))


In [10]:
nlp_cluster_for_cluster_spec('./woohoo1')

0.9981747004738313
clus 30: 0.180151105914793, 0.672999141326298
0.9982651981056483
clus 30: 0.15232391786957933, 0.725581353397667
0.9981442008155497
clus 30: 0.1741889429681261, 0.5362327988439036
0.9983595432185679
clus 30: 0.13839222744535712, 0.5959971856299495
0.9982374596075535
clus 30: 0.12044924405918284, 0.6215699548175786
0.9983426331134531
clus 30: 0.1276324486307089, 0.6674295280822489
0.9984505754157542
clus 30: 0.1534592323538643, 0.7965288675278822
0.9981950697614622
clus 30: 0.08128846747879233, 0.5230269387787726


In [11]:
def nlp_cluster_for_cluster(dir_spec, n):
    dirname = '{}/{}'.format(os.getcwd(),dir_spec)
    for filename in os.listdir(dirname):
        path = '{}/{}'.format(dirname, filename)
        labels = nlp_clus(path, n, False)
        print(labels)
        please(dirname + '/' + filename,labels,n,'woohoo2_{}'.format(filename))

In [12]:
nlp_cluster_for_cluster('./woohoo2', 18)

0.9980104220922025
clus 18: 0.17562821788946123, 1.0713914464426784
[11 12  1 17  1 11 17  4  7 10 12 13 13 13  4  0  4 10  1 11 15  2  4  4
  0  3  7  1  0 14 14  2 10 13 15  2  8  9  2  8  3  9  5  5  9 16  6  0
  6  3  5]


FileNotFoundError: [Errno 2] No such file or directory: 'woohoo2_0.csv/0.csv'

In [108]:
nlp_cluster_for_cluster_spec('./woohoo2')

0.9985212170402835
clus 2: 0.13858049565563965, 2.1936161295189107
clus 3: 0.15087070774277922, 1.7586828648931403
clus 4: 0.12243946881349771, 1.809400965643634
clus 5: 0.13574382622533232, 1.6780404374681506
clus 6: 0.14012023904481338, 1.6453631564153346
clus 7: 0.1388077289930433, 1.625861177234928
clus 8: 0.14288131176259566, 1.5027911054809222
clus 9: 0.15262206673145318, 1.3943813217099086
clus 10: 0.1561262732316628, 1.360371701042234
clus 11: 0.16144817299148725, 1.273715017050711
clus 12: 0.17537163778756346, 1.2199640128116724
clus 13: 0.179115372525265, 1.20065026898033
clus 14: 0.18356075279875603, 1.1631831463479088
clus 15: 0.18701445882157214, 1.1325076339903102
clus 16: 0.1799155379245125, 1.0806424377451105
clus 17: 0.16842986934565896, 1.0338585820413417
clus 18: 0.1673587612356257, 0.9616231609903498
clus 19: 0.16354520289374772, 0.9194114469248651
clus 20: 0.1616684607428307, 0.8950087351938262
clus 21: 0.15846002473384774, 0.9048582626881111
clus 22: 0.15580141843

clus 15: 0.17605843728041998, 1.0746654254141328
clus 16: 0.17700735643443177, 1.0456554175208326
clus 17: 0.17600874666330432, 1.0237864451609777
clus 18: 0.17628125780901366, 0.9745863828167081
clus 19: 0.17432860973134218, 0.9743116434036269
clus 20: 0.16745464021267198, 0.9439466647925272
clus 21: 0.16482447780233508, 0.9356984139580968
clus 22: 0.16832221932493918, 0.9299299987194839
clus 23: 0.16178254222920552, 0.9024901071319817
clus 24: 0.15567417972649655, 0.851871841650972
clus 25: 0.14985203913024853, 0.8245177063913846
clus 26: 0.14430402682074997, 0.7699048252956077
clus 27: 0.13729143500172708, 0.7318953345808223
clus 28: 0.12825950214822662, 0.6969154588020572
clus 29: 0.11923356580669509, 0.6722571517883678
0.9983748341350666
clus 2: 0.11408908515241749, 2.288556134649239
clus 3: 0.12022012284017967, 2.0478873445836023
clus 4: 0.09495813892753023, 1.9779710252233675
clus 5: 0.10630740382944338, 1.7995815390601984
clus 6: 0.11859619236098916, 1.7655465671911454
clus 7: 