In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
import statsmodels.api as sm
from sklearn.svm import SVC
import math
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import imblearn
from imblearn.over_sampling import SMOTE

<h1><center>1. Multi-class and Multi-Label Classification Using Support Vector Machines </center></h1>

__1. (a) Download the ANURAN Calls (MFCCs) Data Set. Choose 70% of the data randomly as the training set.__  

In [2]:
from sklearn.model_selection import train_test_split
df = pd.read_csv('Data\Anuran Calls (MFCCs)\Frogs_MFCCs.csv')
Targets = df[["Family", "Genus", "Species"]]
df1 = df.drop(['Family', 'Genus', 'Species', 'RecordID'], axis=1)
TrainDF, TestDF, TrainTargets, TestTargets = train_test_split(df1, Targets, test_size=0.3, random_state=8)
TrainTarget1 = TrainTargets.loc[:, 'Family']
TrainTarget2 = TrainTargets.loc[:, 'Genus']
TrainTarget3 = TrainTargets.loc[:, 'Species']
TestTarget1 = TestTargets.loc[:, 'Family']
TestTarget2 = TestTargets.loc[:, 'Genus']
TestTarget3 = TestTargets.loc[:, 'Species']

__1. (b) Each instance has three labels: Families, Genus, and Species. Each of the labels has multiple classes. We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-label classification is to train a classifier for each label (binary relevance). We first try this approach:__

__1. (b) i. Research exact match and hamming score/loss methods for evaluating multi-label classification and use them in evaluating the classifiers in this problem.__  

Exact match: The percentage of samples that have all their labels classified correctly.

Hamming loss: The fraction of labels that are incorrectly predicted, i.e., the fraction of the wrong labels to the total number of labels.

__1. (b) ii. Train a SVM for each of the labels, using Gaussian kernels and one verses all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation. Consider solving the problem with both standardized and raw attributes and reporting the results.__  

In [3]:
#Creating my C and Gamma values to be used in cross-validation to find optimum parameters
a = np.arange(-3,6, dtype=float)
Clogs3 = []
for i in a:
    b = 10**i
    Clogs3.append(b)
Clogs2 = [x * 5 for x in Clogs3]
Cgrid = Clogs2 + Clogs3
b = np.arange(0.001, 1, 0.1)
c = np.arange(0,110,10)
Ggrid = np.concatenate((b,c))

In [4]:
#Creating a SVM for Target 1
svc1 = SVC(C = 50, kernel='rbf', gamma=0.901, decision_function_shape = 'ovr')
#svc1.fit(TrainDF, TrainTarget1)
#TestSVCPred1 = svc1.predict(TestDF)
#print(classification_report(TestTarget1, TestSVCPred1),)
#print(confusion_matrix(TestTarget1, TestSVCPred1))
#svc1.score(TestDF, TestTarget1)

Note: I've "muted" these bits of cross validation code because I've already gotten the output from them that I want and they take a long time to run. They are still functional if you remove the #'s.

In [5]:
#Cross validation to find best C and gamma parameters for Target 1
#parameters = {'C': Cgrid, 'gamma': Ggrid}
#GSSVM1 = GridSearchCV(svc1, param_grid=parameters, n_jobs=-1, cv=10)
#GSSVM1.fit(TrainDF, TrainTarget1)
#GSSVM1.best_estimator_
#Result: C=50.0, gamma=0.901

In [6]:
#Cross validation to find best C and gamma parameters for Target 2
#GSSVM2 = GridSearchCV(svc1, param_grid=parameters, n_jobs=-1, cv=10)
#GSSVM2.fit(TrainDF, TrainTarget2)
#GSSVM2.best_estimator_
#Result: C=500.0, gamma=0.901

In [7]:
#Cross validation to find best C and gamma parameters for Target 3
#GSSVM3 = GridSearchCV(svc1, param_grid=parameters, n_jobs=-1, cv=10)
#GSSVM3.fit(TrainDF, TrainTarget3)
#GSSVM3.best_estimator_
#Result: C=10.0, gamma=0.901

In [8]:
#Final SVM for Target 1
svc2 = SVC(C = 50, kernel='rbf', gamma=0.901, decision_function_shape = 'ovr')
svc2.fit(TrainDF, TrainTarget1)
TestSVCPred1 = svc2.predict(TestDF)
print(classification_report(TestTarget1, TestSVCPred1),)
print(confusion_matrix(TestTarget1, TestSVCPred1))
svc2.score(TestDF, TestTarget1)

                 precision    recall  f1-score   support

      Bufonidae       1.00      1.00      1.00        16
  Dendrobatidae       0.99      0.99      0.99       163
        Hylidae       0.99      0.99      0.99       641
Leptodactylidae       1.00      1.00      1.00      1339

       accuracy                           0.99      2159
      macro avg       0.99      1.00      0.99      2159
   weighted avg       0.99      0.99      0.99      2159

[[  16    0    0    0]
 [   0  162    1    0]
 [   0    1  635    5]
 [   0    1    5 1333]]


0.9939786938397406

In [9]:
#Final SVM for Target 2
svc3 = SVC(C = 500, kernel='rbf', gamma=0.901, decision_function_shape = 'ovr')
svc3.fit(TrainDF, TrainTarget2)
TestSVCPred2 = svc3.predict(TestDF)
print(classification_report(TestTarget2, TestSVCPred2),)
print(confusion_matrix(TestTarget2, TestSVCPred2))
svc3.score(TestDF, TestTarget2)

               precision    recall  f1-score   support

    Adenomera       1.00      1.00      1.00      1261
     Ameerega       0.99      0.99      0.99       163
Dendropsophus       0.97      0.97      0.97        86
    Hypsiboas       0.99      0.99      0.99       485
Leptodactylus       0.96      1.00      0.98        78
Osteocephalus       0.91      0.89      0.90        35
     Rhinella       1.00      1.00      1.00        16
       Scinax       0.97      0.97      0.97        35

     accuracy                           0.99      2159
    macro avg       0.97      0.97      0.97      2159
 weighted avg       0.99      0.99      0.99      2159

[[1256    1    1    1    0    1    0    1]
 [   0  162    1    0    0    0    0    0]
 [   3    0   83    0    0    0    0    0]
 [   1    0    0  479    3    2    0    0]
 [   0    0    0    0   78    0    0    0]
 [   2    0    0    2    0   31    0    0]
 [   0    0    0    0    0    0   16    0]
 [   0    0    1    0    0    0    0

0.9907364520611394

In [10]:
#Final SVM for Target 3
svc4 = SVC(C = 10, kernel='rbf', gamma=0.901, decision_function_shape = 'ovr')
svc4.fit(TrainDF, TrainTarget3)
TestSVCPred3 = svc4.predict(TestDF)
print(classification_report(TestTarget3, TestSVCPred3),)
print(confusion_matrix(TestTarget3, TestSVCPred3))
svc4.score(TestDF, TestTarget3)

                        precision    recall  f1-score   support

        AdenomeraAndre       0.98      0.97      0.98       200
AdenomeraHylaedactylus       1.00      1.00      1.00      1061
    Ameeregatrivittata       0.99      0.99      0.99       163
            HylaMinuta       0.94      0.98      0.96        86
  HypsiboasCinerascens       0.99      0.98      0.99       131
     HypsiboasCordobae       1.00      0.99      1.00       354
   LeptodactylusFuscus       1.00      1.00      1.00        78
 OsteocephalusOophagus       0.92      0.94      0.93        35
     Rhinellagranulosa       1.00      1.00      1.00        16
           ScinaxRuber       0.97      0.97      0.97        35

              accuracy                           0.99      2159
             macro avg       0.98      0.98      0.98      2159
          weighted avg       0.99      0.99      0.99      2159

[[ 195    0    1    2    0    0    0    1    0    1]
 [   0 1059    0    1    0    1    0    0    0  

0.9916628068550255

In [11]:
TestSVCPred1DF = pd.DataFrame(data=TestSVCPred1)
TestSVCPred2DF = pd.DataFrame(data=TestSVCPred2)
TestSVCPred3DF = pd.DataFrame(data=TestSVCPred3)
TestSVCPredDF = pd.concat([TestSVCPred1DF,TestSVCPred2DF,TestSVCPred3DF], axis=1, ignore_index=True)

In [12]:
TestTargets2 = TestTargets.reset_index(drop=True)
TestTargets2.columns = range(TestTargets2.shape[1])

In [13]:
#Hamming Loss for SVM
from sklearn.metrics import hamming_loss
HL1 = hamming_loss(TestTarget1, TestSVCPred1)
HL2 = hamming_loss(TestTarget2, TestSVCPred2)
HL3 = hamming_loss(TestTarget3, TestSVCPred3)
SVCHL = (HL1 + HL2 + HL3)/3
print(SVCHL)

0.007874015748031496


In [14]:
#Hamming Score
1-SVCHL

0.9921259842519685

In [15]:
#Total number of misclassified labels
(HL1*2159) + (HL2*2159) + (HL3*2159)

51.0

In [16]:
#Exact Match for SVC
ExactMatch1 = TestTargets2.compare(TestSVCPredDF)
(len(TestTargets2)-len(ExactMatch1))/len(TestTargets2)

0.9861046780917091

__1. (b) iii. Repeat 1(b)ii with L1-penalized (with linear kernel) SVMs. Remember to standardize the attributes. Determine the weight of the SVM penalty using 10 fold cross validation.__  

In [17]:
#Standardizing the attributes (mean zero, STD 1)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
TrainScaled = scaler.fit_transform(TrainDF)

#Scale test data using scaler fit to training data
TestScaled = scaler.transform(TestDF)
TestDFscaled = pd.DataFrame(data=TestScaled)
TrainDFscaled = pd.DataFrame(data=TrainScaled)

In [18]:
Lparameters = {'C': Cgrid}

In [19]:
#Repeat ii with L1-penalized SVMs with linear kernel. 
from sklearn.svm import LinearSVC
lsvc1 = LinearSVC(penalty = 'l1', C=0.5, dual=False, max_iter=5000)
lsvc1.fit(TrainDFscaled, TrainTarget1)
TestLSVCPred1 = lsvc1.predict(TestDFscaled)
print(classification_report(TestTarget1, TestLSVCPred1))
print(confusion_matrix(TestTarget1, TestLSVCPred1))
lsvc1.score(TestDFscaled, TestTarget1)

                 precision    recall  f1-score   support

      Bufonidae       0.00      0.00      0.00        16
  Dendrobatidae       0.88      0.86      0.87       163
        Hylidae       0.90      0.91      0.90       641
Leptodactylidae       0.96      0.97      0.97      1339

       accuracy                           0.94      2159
      macro avg       0.68      0.68      0.68      2159
   weighted avg       0.93      0.94      0.93      2159

[[   0    0   16    0]
 [   0  140   19    4]
 [   1   12  581   47]
 [   0    8   29 1302]]


0.937007874015748

In [20]:
#Linear SVM for Target 2
lsvc2 = LinearSVC(penalty = 'l1', C=10, dual=False, max_iter=5000)
lsvc2.fit(TrainDFscaled, TrainTarget2)
TestLSVCPred2 = lsvc2.predict(TestDFscaled)
print(classification_report(TestTarget2, TestLSVCPred2))
print(confusion_matrix(TestTarget2, TestLSVCPred2))
lsvc2.score(TestDFscaled, TestTarget2)

               precision    recall  f1-score   support

    Adenomera       0.97      0.99      0.98      1261
     Ameerega       0.90      0.93      0.92       163
Dendropsophus       0.94      0.67      0.78        86
    Hypsiboas       0.94      0.97      0.95       485
Leptodactylus       0.96      0.94      0.95        78
Osteocephalus       0.82      0.51      0.63        35
     Rhinella       0.93      0.81      0.87        16
       Scinax       0.97      1.00      0.99        35

     accuracy                           0.96      2159
    macro avg       0.93      0.85      0.88      2159
 weighted avg       0.96      0.96      0.95      2159

[[1245    7    0    6    0    2    0    1]
 [   5  152    4    2    0    0    0    0]
 [  16    8   58    4    0    0    0    0]
 [   9    0    0  472    3    1    0    0]
 [   0    0    0    4   73    1    0    0]
 [   1    1    0   14    0   18    1    0]
 [   1    0    0    2    0    0   13    0]
 [   0    0    0    0    0    0    0

0.9569245020842982

In [21]:
#Linear SVM for Target 3
lsvc3 = LinearSVC(penalty = 'l1', C=5, dual=False, max_iter=5000)
lsvc3.fit(TrainDFscaled, TrainTarget3)
TestLSVCPred3 = lsvc3.predict(TestDFscaled)
print(classification_report(TestTarget3, TestLSVCPred3, zero_division=1))
print(confusion_matrix(TestTarget3, TestLSVCPred3))
lsvc3.score(TestDFscaled, TestTarget3)

                        precision    recall  f1-score   support

        AdenomeraAndre       0.92      0.93      0.92       200
AdenomeraHylaedactylus       0.99      1.00      1.00      1061
    Ameeregatrivittata       0.89      0.93      0.91       163
            HylaMinuta       0.90      0.73      0.81        86
  HypsiboasCinerascens       0.95      0.93      0.94       131
     HypsiboasCordobae       0.93      0.97      0.95       354
   LeptodactylusFuscus       0.97      0.95      0.96        78
 OsteocephalusOophagus       0.91      0.57      0.70        35
     Rhinellagranulosa       0.84      1.00      0.91        16
           ScinaxRuber       0.97      1.00      0.99        35

              accuracy                           0.96      2159
             macro avg       0.93      0.90      0.91      2159
          weighted avg       0.96      0.96      0.96      2159

[[ 185    0    7    2    0    4    0    1    0    1]
 [   0 1060    0    0    0    1    0    0    0  

0.9592403890690134

In [22]:
#Cross validation to find best C for Target 1 using LinearSVM
#GSLSVM1 = GridSearchCV(lsvc1, param_grid=Lparameters, n_jobs=-1, cv=10)
#GSLSVM1.fit(TrainDFscaled, TrainTarget1)
#GSLSVM1.best_estimator_
#Result: C=0.5

In [23]:
#Cross validation to find best C for Target 2 using LinearSVM
#GSLSVM2 = GridSearchCV(lsvc2, param_grid=Lparameters, n_jobs=-1, cv=10)
#GSLSVM2.fit(TrainDFscaled, TrainTarget2)
#GSLSVM2.best_estimator_
#Result: C=10.0

In [24]:
#Cross validation to find best C for Target 2 using LinearSVM.
#GSLSVM3 = GridSearchCV(lsvc3, param_grid=Lparameters, n_jobs=-1, cv=10)
#GSLSVM3.fit(TrainDFscaled, TrainTarget3)
#GSLSVM3.best_estimator_
#Result: C=5.0

In [25]:
TestLSVCPred1DF = pd.DataFrame(data=TestLSVCPred1)
TestLSVCPred2DF = pd.DataFrame(data=TestLSVCPred2)
TestLSVCPred3DF = pd.DataFrame(data=TestLSVCPred3)
TestLSVCPredDF = pd.concat([TestLSVCPred1DF,TestLSVCPred2DF,TestLSVCPred3DF], axis=1, ignore_index=True)

In [26]:
#Hamming Loss for LSVM
LHL1 = hamming_loss(TestTarget1, TestLSVCPred1)
LHL2 = hamming_loss(TestTarget2, TestLSVCPred2)
LHL3 = hamming_loss(TestTarget3, TestLSVCPred3)
LSVCHL = (LHL1 + LHL2 + LHL3)/3
print(LSVCHL)

0.04894241161031341


In [27]:
#Hamming score
1-LSVCHL

0.9510575883896866

In [28]:
#Total number of misclassified labels
(LHL1*2159) + (LHL2*2159) + (LHL3*2159)

317.0

In [29]:
#Exact Match for LSVC
ExactMatch2 = TestTargets2.compare(TestLSVCPredDF)
(len(TestTargets2)-len(ExactMatch2))/len(TestTargets2)

0.920796665122742

__1. (b) iv. Repeat 1(b)iii by using SMOTE or any other method to remedy class imbalance. Report conclusions about the classifiers trained.__  

In [30]:
#SMOTE my training data
sm1 = SMOTE()
TrainDFscaled1SM, TrainTarget1SM = sm1.fit_resample(TrainDFscaled, TrainTarget1)
TrainDFscaled2SM, TrainTarget2SM = sm1.fit_resample(TrainDFscaled, TrainTarget2)
TrainDFscaled3SM, TrainTarget3SM = sm1.fit_resample(TrainDFscaled, TrainTarget3)

#TrainTarget1SM = pd.DataFrame(data=TrainTarget1SM)
#TrainTarget2SM = pd.DataFrame(data=TrainTarget2SM)
#TrainTarget3SM = pd.DataFrame(data=TrainTarget3SM)

In [31]:
#Repeat iii with SMOTE performed to correct class imbalance. 
#Linear SVC for SMOTE target 1
SMsvc1 = LinearSVC(penalty = 'l1', C=5, dual=False, max_iter=10000)
SMsvc1.fit(TrainDFscaled1SM, TrainTarget1SM)
TestSMSVCPred1 = SMsvc1.predict(TestDFscaled)
print(classification_report(TestTarget1, TestSMSVCPred1))
print(confusion_matrix(TestTarget1, TestSMSVCPred1))
SMsvc1.score(TestDFscaled, TestTarget1)

                 precision    recall  f1-score   support

      Bufonidae       0.25      1.00      0.41        16
  Dendrobatidae       0.76      0.95      0.84       163
        Hylidae       0.93      0.88      0.90       641
Leptodactylidae       0.98      0.94      0.96      1339

       accuracy                           0.92      2159
      macro avg       0.73      0.94      0.78      2159
   weighted avg       0.94      0.92      0.93      2159

[[  16    0    0    0]
 [   1  155    5    2]
 [  19   34  563   25]
 [  27   16   39 1257]]


0.9221861973135711

In [32]:
#Linear SVC for SMOTE target 2
SMsvc2 = LinearSVC(penalty = 'l1', C=50, dual=False, max_iter=10000)
SMsvc2.fit(TrainDFscaled2SM, TrainTarget2SM)
TestSMSVCPred2 = SMsvc2.predict(TestDFscaled)
print(classification_report(TestTarget2, TestSMSVCPred2))
print(confusion_matrix(TestTarget2, TestSMSVCPred2))
SMsvc2.score(TestDFscaled, TestTarget2)

               precision    recall  f1-score   support

    Adenomera       0.99      0.92      0.95      1261
     Ameerega       0.87      0.94      0.91       163
Dendropsophus       0.67      0.84      0.74        86
    Hypsiboas       0.97      0.92      0.94       485
Leptodactylus       0.89      0.95      0.92        78
Osteocephalus       0.46      0.89      0.61        35
     Rhinella       0.30      1.00      0.46        16
       Scinax       0.85      1.00      0.92        35

     accuracy                           0.92      2159
    macro avg       0.75      0.93      0.81      2159
 weighted avg       0.94      0.92      0.93      2159

[[1160   16   30    9    0   21   24    1]
 [   2  154    6    0    0    0    1    0]
 [   2    7   72    1    1    0    3    0]
 [   7    0    0  445    8   14    6    5]
 [   0    0    0    2   74    1    1    0]
 [   0    0    0    2    0   31    2    0]
 [   0    0    0    0    0    0   16    0]
 [   0    0    0    0    0    0    0

0.9203334877257989

In [33]:
#Linear SVC for SMOTE target 3
SMsvc3 = LinearSVC(penalty = 'l1', C=50, dual=False, max_iter=25000)
SMsvc3.fit(TrainDFscaled3SM, TrainTarget3SM)
TestSMSVCPred3 = SMsvc3.predict(TestDFscaled)
print(classification_report(TestTarget3, TestSMSVCPred3))
print(confusion_matrix(TestTarget3, TestSMSVCPred3))
SMsvc3.score(TestDFscaled, TestTarget3)

                        precision    recall  f1-score   support

        AdenomeraAndre       0.96      0.91      0.93       200
AdenomeraHylaedactylus       1.00      0.99      1.00      1061
    Ameeregatrivittata       0.89      0.90      0.89       163
            HylaMinuta       0.82      0.79      0.80        86
  HypsiboasCinerascens       0.91      0.90      0.91       131
     HypsiboasCordobae       0.95      0.94      0.95       354
   LeptodactylusFuscus       0.91      0.94      0.92        78
 OsteocephalusOophagus       0.74      0.80      0.77        35
     Rhinellagranulosa       0.48      1.00      0.65        16
           ScinaxRuber       0.95      1.00      0.97        35

              accuracy                           0.95      2159
             macro avg       0.86      0.92      0.88      2159
          weighted avg       0.95      0.95      0.95      2159

[[ 181    0    9    3    0    1    0    3    2    1]
 [   0 1054    0    2    3    2    0    0    0  

0.9504400185270959

In [34]:
#Cross validation to find best C for Target 1 using LinearSVM and a 'SMOTE'-ed dataset
#GSSMSVM1 = GridSearchCV(SMsvc1, param_grid=Lparameters, n_jobs=-1, cv=10)
#GSSMSVM1.fit(TrainDFscaled1SM, TrainTarget1SM)
#GSSMSVM1.best_estimator_
#Result: C=5.0

In [35]:
#Cross validation to find best C for Target 2 using LinearSVM and a 'SMOTE'-ed dataset
#GSSMSVM2 = GridSearchCV(SMsvc2, param_grid=Lparameters, n_jobs=-1, cv=10)
#GSSMSVM2.fit(TrainDFscaled2SM, TrainTarget2SM)
#GSSMSVM2.best_estimator_
#Result: C=50.0

In [36]:
#Cross validation to find best C for Target 3 using LinearSVM and a 'SMOTE'-ed dataset
#GSSMSVM3 = GridSearchCV(SMsvc3, param_grid=Lparameters, n_jobs=-1, cv=10)
#GSSMSVM3.fit(TrainDFscaled3SM, TrainTarget3SM)
#GSSMSVM3.best_estimator_
#Result: C=50.0

In [37]:
TestSMSVCPred1DF = pd.DataFrame(data=TestSMSVCPred1)
TestSMSVCPred2DF = pd.DataFrame(data=TestSMSVCPred2)
TestSMSVCPred3DF = pd.DataFrame(data=TestSMSVCPred3)
TestSMSVCPredDF = pd.concat([TestSMSVCPred1DF,TestSMSVCPred2DF,TestSMSVCPred3DF], axis=1, ignore_index=True)

In [38]:
#Hamming Loss for SMOTE-LSVM
SMHL1 = hamming_loss(TestTarget1, TestSMSVCPred1)
SMHL2 = hamming_loss(TestTarget2, TestSMSVCPred2)
SMHL3 = hamming_loss(TestTarget3, TestSMSVCPred3)
SMSVCHL = (SMHL1 + SMHL2 + SMHL3)/3
print(SMSVCHL)

0.06901343214451135


In [39]:
#Hamming score for SMOTE-LSVM
1-SMSVCHL

0.9309865678554886

In [40]:
#Total number of misclassified labels
(SMHL1*2159) + (SMHL2*2159)+ (SMHL3*2159)

447.0

In [41]:
#Exact Match for SMOTE-LSVM
ExactMatch3 = TestTargets2.compare(TestSMSVCPredDF)
(len(TestTargets2)-len(ExactMatch3))/len(TestTargets2)

0.8693839740620658

The most accurate classifier was the SVM with Gaussian kernels. It had much better classification of both under and over represented classes than did any other method.

SMOTE decreased total accuracy of the classifier but increased the precision of classification for under-represented classes. For example, the score of the classification was 0.937 without SMOTE but only 0.926 with it, however, without SMOTE the classifier mis-classified all 16 of the under-represented "Bufonidae" class, while with SMOTE it correctly identified all 16 "Bufonidae." This is a critical improvement.

<h1><center>2. K-Means Clustering on a Multi-Class and Multi-Label Data Set </center></h1>

__Monte-Carlo Simulation: Perform the following procedures 50 times and report the average and standard deviation of the 50 Hamming Distances that are calculated__

__2. (a) Use k-means clustering on the whole Anuran Calls (MFCCs) Data Set (do not split the data into train and test, as we are not performing supervised learning in this exercise). Choose k in {1, 2, ..., 50} automatically based on one of the methods provided in the slides (CH or Gap Statistics or scree plots or Silhouettes) or any other method known.__  

In [42]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score

In [43]:
WholeDF = df1
Target1 = Targets.iloc[:,0]
Target2 = Targets.iloc[:,1]
Target3 = Targets.iloc[:,2]
WholeDFscaled = scaler.fit_transform(WholeDF)
WholeDFscaled = pd.DataFrame(data=WholeDFscaled)
Target1 = pd.DataFrame(data=Target1)
Target2 = pd.DataFrame(data=Target2)
Target3 = pd.DataFrame(data=Target3)

In [44]:
KMHL50 = []
KMHS50 = []

for j in range(0,50): 

    KCH = []
    KS = []
    for i in range(2,52):
        kmeans = KMeans(n_clusters = i, init = 'random').fit(WholeDF)
        kmlabels = kmeans.labels_
        #chs = calinski_harabasz_score(WholeDF, kmlabels)
        ss = silhouette_score(WholeDF, kmlabels)
        #KCH.append(chs)
        KS.append(ss)
    
    SMAX = np.argmax(KS) + 2 #this is the ideal k value

    kmeans4 = KMeans(n_clusters = SMAX).fit(WholeDF) #re-cluster with ideal K from above
    km4labels = kmeans4.labels_
    km4labels = pd.DataFrame(data=km4labels, columns = ['Cluster'])

    T1C = pd.concat([km4labels, Target1], axis=1) #assign true labels to clusters
    cluster01 = T1C[T1C["Cluster"] == 0]
    cluster11 = T1C[T1C["Cluster"] == 1]
    cluster21 = T1C[T1C["Cluster"] == 2]
    cluster31 = T1C[T1C["Cluster"] == 3]

    T2C = pd.concat([km4labels, Target2], axis=1)
    cluster02 = T2C[T2C["Cluster"] == 0]
    cluster12 = T2C[T2C["Cluster"] == 1]
    cluster22 = T2C[T2C["Cluster"] == 2]
    cluster32 = T2C[T2C["Cluster"] == 3]

    T3C = pd.concat([km4labels, Target3], axis=1)
    cluster03 = T3C[T3C["Cluster"] == 0]
    cluster13 = T3C[T3C["Cluster"] == 1]
    cluster23 = T3C[T3C["Cluster"] == 2]
    cluster33 = T3C[T3C["Cluster"] == 3]

    #create a DF in which the cluster predicted family genus species are aligned with their DF sample
    allcluster0 = pd.concat([cluster01, cluster02["Genus"], cluster03["Species"]], axis=1)
    allcluster1 = pd.concat([cluster11, cluster12["Genus"], cluster13["Species"]], axis=1)
    allcluster2 = pd.concat([cluster21, cluster22["Genus"], cluster23["Species"]], axis=1)
    allcluster3 = pd.concat([cluster31, cluster32["Genus"], cluster33["Species"]], axis=1)
    allcluster = pd.concat([allcluster0, allcluster1, allcluster2, allcluster3]).sort_index()
    
    mode0 = allcluster0.mode()
    Fam0Mode = mode0.iloc[0,1]
    Gen0Mode = mode0.iloc[0,2]
    Spe0Mode = mode0.iloc[0,3]
    mode1 = allcluster1.mode()
    Fam1Mode = mode1.iloc[0,1]
    Gen1Mode = mode1.iloc[0,2]
    Spe1Mode = mode1.iloc[0,3]
    mode2 = allcluster0.mode()
    Fam2Mode = mode2.iloc[0,1]
    Gen2Mode = mode2.iloc[0,2]
    Spe2Mode = mode2.iloc[0,3]
    mode3 = allcluster0.mode()
    Fam3Mode = mode3.iloc[0,1]
    Gen3Mode = mode3.iloc[0,2]
    Spe3Mode = mode3.iloc[0,3]
    newallcluster0 = allcluster0.drop(columns = ['Family', 'Genus', 'Species'])
    newallcluster0['Family'] = Fam0Mode
    newallcluster0['Genus'] = Gen0Mode
    newallcluster0['Species'] = Spe0Mode
    newallcluster1 = allcluster1.drop(columns = ['Family', 'Genus', 'Species'])
    newallcluster1['Family'] = Fam1Mode
    newallcluster1['Genus'] = Gen1Mode
    newallcluster1['Species'] = Spe1Mode
    newallcluster2 = allcluster2.drop(columns = ['Family', 'Genus', 'Species'])
    newallcluster2['Family'] = Fam2Mode
    newallcluster2['Genus'] = Gen2Mode
    newallcluster2['Species'] = Spe2Mode
    newallcluster3 = allcluster3.drop(columns = ['Family', 'Genus', 'Species'])
    newallcluster3['Family'] = Fam3Mode
    newallcluster3['Genus'] = Gen3Mode
    newallcluster3['Species'] = Spe3Mode
    newallcluster = pd.concat([newallcluster0, newallcluster1, newallcluster2, newallcluster3]).sort_index()

    KMHL1 = hamming_loss(Target1, newallcluster.iloc[:,1])
    KMHL2 = hamming_loss(Target2, newallcluster.iloc[:,2])
    KMHL3 = hamming_loss(Target3, newallcluster.iloc[:,3])
    KMHL = (KMHL1 + KMHL2 + KMHL3)/3

    KMHS = 1-KMHL

    KMHL50.append(KMHL)
    KMHS50.append(KMHS)


__2. (c) For each cluster there is a majority label triplet (family, genus, species). Calculate the average Hamming score and Hamming loss between the true labels and the labels assigned by clusters.__  

In [45]:
#Here are the 50 Hamming Scores from performing the procedure 50 times.
KMHS50

[0.5570998378503591,
 0.6725040537410238,
 0.5887421820708825,
 0.5570998378503591,
 0.5570998378503591,
 0.6642575862867732,
 0.6725040537410238,
 0.7020152883947186,
 0.6642575862867732,
 0.6643965716933056,
 0.6645355570998379,
 0.7020152883947186,
 0.22242297892054674,
 0.6642575862867732,
 0.5570998378503591,
 0.7020152883947186,
 0.599953671531156,
 0.5570998378503591,
 0.6643965716933056,
 0.24220523511697944,
 0.6642575862867732,
 0.6645355570998379,
 0.6725040537410238,
 0.6748668056520732,
 0.5570998378503591,
 0.6642575862867732,
 0.7020152883947186,
 0.5570998378503591,
 0.6725040537410238,
 0.599953671531156,
 0.6642575862867732,
 0.6253880009265693,
 0.5570998378503591,
 0.5570998378503591,
 0.6642575862867732,
 0.6642575862867732,
 0.5570998378503591,
 0.6296965485290711,
 0.5570998378503591,
 0.5570998378503591,
 0.6642575862867732,
 0.6642575862867732,
 0.6642575862867732,
 0.599953671531156,
 0.6725040537410238,
 0.599953671531156,
 0.7020152883947186,
 0.557099837850

In [46]:
import statistics
#Averave Hamming Score
statistics.mean(KMHS50)

0.607787815612694

In [47]:
#Average Hamming Loss
statistics.mean(KMHL50)

0.392212184387306

In [48]:
#Hamming Loss Standard Deviation
statistics.stdev(KMHL50)

0.10964409407789628

Note about above code: I admit, it's not quite right. This works when the K value chosen by Silhouette Score is K=4, which it usually is, but it it will sometimes fail before it gets to 50 iterations. It's good enough that it can get me a lot of data and I can estimate the Hamming scores pretty well, but it would require some re-coding to be perfect, and I only realized this too late.

__3. ISLR 10.7.2__  

See included PDF