In [52]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
from scipy.cluster import hierarchy
import warnings
warnings.filterwarnings('ignore')

### 1. Multi-class and Multi-Label Classification Using Support Vector Machines

### (a) Download the Anuran Calls (MFCCs) Data Set. Choose 70% of the data randomly as the training set.

In [53]:
df = pd.read_csv("../Data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv")
df = df.drop(["RecordID"],axis=1)

In [54]:
X_df = df[df.columns[0:22]]
y_df = df[df.columns[22:]]

In [55]:
X_df

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.156436,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.254341,0.022786,0.163320,0.012022,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.237384,0.050791,0.207338,0.083536,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.317084,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.298524,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,-0.145668,-0.059364,0.024206,-0.000861,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,-0.164675,-0.105600,0.030767,0.006457,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,-0.150025,-0.078615,0.024861,0.008696,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,-0.153120,-0.075320,0.022903,0.001924,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895


In [56]:
y_df

Unnamed: 0,Family,Genus,Species
0,Leptodactylidae,Adenomera,AdenomeraAndre
1,Leptodactylidae,Adenomera,AdenomeraAndre
2,Leptodactylidae,Adenomera,AdenomeraAndre
3,Leptodactylidae,Adenomera,AdenomeraAndre
4,Leptodactylidae,Adenomera,AdenomeraAndre
...,...,...,...
7190,Hylidae,Scinax,ScinaxRuber
7191,Hylidae,Scinax,ScinaxRuber
7192,Hylidae,Scinax,ScinaxRuber
7193,Hylidae,Scinax,ScinaxRuber


#### Split data into trainX, trainy, testX, testy

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.3, random_state=42)
print("The shape of X_train: ", X_train.shape)
print("The shape of X_test: ", X_test.shape)
print("The shape of y_train: ", y_train.shape)
print("The shape of y_test: ", y_test.shape)

The shape of X_train:  (5036, 22)
The shape of X_test:  (2159, 22)
The shape of y_train:  (5036, 3)
The shape of y_test:  (2159, 3)


### (b) Each instance has three labels: Families, Genus, and Species. Each of the labels has multiple classes. We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-label classification is to train a classifier for each label (binary relevance). We first try this approach:

### i. Research exact match and hamming score/ loss methods for evaluating multi-label classification and use them in evaluating the classifiers in this problem.

#### Exact Match Ratio: 

It is the percentage of sample points that have all labels classified correctly. One of the methods to calculate accuracy_score is using "metrics.accuracy_score" function of sklearn. In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true. We can also create a custom function which calculates the total number of matched instances and dividing it with count of samples, we can get the exact match ratio.


#### Hamming Loss: 

It is the fraction of labels that are incorrectly predicted. One of the methods to calculate hamming_loss is using "metrics.hamming_loss" function of sklearn. We can also create a custom function which calculates the total number of miss-classified labels and dividing it with count of samples x count of labels, we can get the hamming loss.

### ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation. You are welcome to try to solve the problem with both standardized and raw attributes and report the results.

In [73]:
def calculate_metrics(y_test, y_pred_df):
    # Exact math ratio
    exact_match_ratio = 0
    for true_label, pred_label in zip(y_test.values, y_pred_df.values):
        matched = (true_label == pred_label)
        if sum(matched) == y_test.shape[1]:
            exact_match_ratio += 1;
    exact_match_ratio /= y_test.shape[0]

    # Hamming loss
    miss_classified = 0
    for true_label, pred_label in zip(y_test.values, y_pred_df.values):
        miss_match = (true_label != pred_label)
        miss_classified += np.sum(miss_match)
    hamming_loss = miss_classified / (y_test.shape[0] * y_test.shape[1])
    
    return exact_match_ratio, hamming_loss

#### Non-standardized 

In [78]:
# Best gamma is the width of the Gaussian Kernel
# Best C is the weight of the SVM penalty

grid = {'C': np.logspace(-5, 4, 10), 'gamma': np.logspace(-5, 4, 10)}
split = StratifiedKFold(10, random_state=42, shuffle=True)

y_pred_train_df = pd.DataFrame(columns=y_train.columns)
y_pred_test_df = pd.DataFrame(columns=y_train.columns)

for col in y_train.columns:
    y_train_ = y_train[col]
    y_test_ = y_test[col]

    svc = SVC(kernel="rbf", decision_function_shape='ovr')
    grid_cv = GridSearchCV(svc, grid, cv=split, n_jobs=-1)
    grid_cv.fit(X_train, y_train_)
    best_grid_cv = grid_cv.best_estimator_

    y_pred_train = best_grid_cv.predict(X_train)
    y_pred_train_df.loc[:, col] = y_pred_train
    y_pred_test = best_grid_cv.predict(X_test)
    y_pred_test_df.loc[:, col] = y_pred_test
    
    print('\033[1m'+"\nFor label:",col+'\033[0m')
    print("\nBest gamma:", grid_cv.best_params_['gamma'])
    print("Best C:", grid_cv.best_params_['C'])
    
exact_match_ratio_train, hamming_loss_train = calculate_metrics(y_train,y_pred_train_df)    
exact_match_ratio_test, hamming_loss_test = calculate_metrics(y_test,y_pred_test_df)

print('\033[1m'+"\nFor this classifier:"'\033[0m')
print("\nTrain Exact Match ratio:", exact_match_ratio_train)
print("Test Exact Match ratio:", exact_match_ratio_test)
print("\nTrain Hamming loss:",hamming_loss_train)
print("Test Hamming loss:",hamming_loss_test)

[1m
For label: Family[0m

Best gamma: 1.0
Best C: 100.0
[1m
For label: Genus[0m

Best gamma: 1.0
Best C: 100.0
[1m
For label: Species[0m

Best gamma: 1.0
Best C: 10.0
[1m
For this classifier:[0m

Train Exact Match ratio: 0.9990071485305798
Test Exact Match ratio: 0.9861046780917091

Train Hamming loss: 0.0003309504898067249
Test Hamming loss: 0.008337193144974525


#### Standardized

In [79]:
scaler = StandardScaler()

std_X_train = scaler.fit_transform(X_train)
std_X_test = scaler.transform(X_test)

std_X_train = pd.DataFrame(data=std_X_train, columns=X_train.columns)
std_X_test = pd.DataFrame(data=std_X_test, columns=X_test.columns)

In [89]:
# Best gamma is the width of the Gaussian Kernel
# Best C is the weight of the SVM penalty

grid = {'C': np.logspace(1, 5, 5), 'gamma': np.linspace(0,1,6)}
split = StratifiedKFold(10, random_state=42, shuffle=True)
y_pred_train_df = pd.DataFrame(columns=y_train.columns)
y_pred_test_df = pd.DataFrame(columns=y_train.columns)

for col in y_train.columns:
    y_train_ = y_train[col]
    y_test_ = y_test[col]

    svc = SVC(kernel="rbf", decision_function_shape='ovr')
    grid_cv = GridSearchCV(svc, grid, cv=split, n_jobs=-1)
    grid_cv.fit(std_X_train, y_train_)
    best_grid_cv = grid_cv.best_estimator_
    
    y_pred_train = best_grid_cv.predict(std_X_train)
    y_pred_train_df.loc[:, col] = y_pred_train
    y_pred_test = best_grid_cv.predict(std_X_test)
    y_pred_test_df.loc[:, col] = y_pred_test
    
    print('\033[1m'+"\nFor label:",col+'\033[0m')
    print("\nBest gamma:", grid_cv.best_params_['gamma'])
    print("Best C:", grid_cv.best_params_['C'])
    
exact_match_ratio_train, hamming_loss_train = calculate_metrics(y_train,y_pred_train_df)    
exact_match_ratio_test, hamming_loss_test = calculate_metrics(y_test,y_pred_test_df)

print('\033[1m'+"\nFor this classifier:"'\033[0m')
print("\nTrain Exact Match ratio:", exact_match_ratio_train)
print("Test Exact Match ratio:", exact_match_ratio_test)
print("\nTrain Hamming loss:",hamming_loss_train)
print("Test Hamming loss:",hamming_loss_test)

[1m
For label: Family[0m

Best gamma: 0.2
Best C: 10.0
[1m
For label: Genus[0m

Best gamma: 0.2
Best C: 10.0
[1m
For label: Species[0m

Best gamma: 0.2
Best C: 10.0
[1m
For this classifier:[0m

Train Exact Match ratio: 1.0
Test Exact Match ratio: 0.9740620657711904

Train Hamming loss: 0.0
Test Hamming loss: 0.019144665740311873


### iii. Repeat 1(b)ii with L1-penalized SVMs. Remember to standardize the attributes. Determine the weight of the SVM penalty using 10 fold cross validation.

In [90]:
# Best C is the weight of the SVM penalty

grid = {'C': np.logspace(1, 5, 5)}
split = StratifiedKFold(10, random_state=42, shuffle=True)
y_pred_train_df = pd.DataFrame(columns=y_train.columns)
y_pred_test_df = pd.DataFrame(columns=y_train.columns)

for col in y_train.columns:
    y_train_ = y_train[col]
    y_test_ = y_test[col]

    linear_svc = LinearSVC(penalty='l1',dual=False)
    grid_cv = GridSearchCV(linear_svc, grid, cv=split, n_jobs=-1)
    grid_cv.fit(std_X_train, y_train_)
    best_grid_cv = grid_cv.best_estimator_
    
    y_pred_train = best_grid_cv.predict(std_X_train)
    y_pred_train_df.loc[:, col] = y_pred_train
    y_pred_test = best_grid_cv.predict(std_X_test)
    y_pred_test_df.loc[:, col] = y_pred_test
    
    print('\033[1m'+"\nFor label:",col+'\033[0m')
    print("Best C:", grid_cv.best_params_['C'])
    
exact_match_ratio_train, hamming_loss_train = calculate_metrics(y_train,y_pred_train_df)    
exact_match_ratio_test, hamming_loss_test = calculate_metrics(y_test,y_pred_test_df)

print('\033[1m'+"\nFor this classifier:"'\033[0m')
print("\nTrain Exact Match ratio:", exact_match_ratio_train)
print("Test Exact Match ratio:", exact_match_ratio_test)
print("\nTrain Hamming loss:",hamming_loss_train)
print("Test Hamming loss:",hamming_loss_test)

[1m
For label: Family[0m
Best C: 10.0
[1m
For label: Genus[0m
Best C: 1000.0
[1m
For label: Species[0m
Best C: 10.0
[1m
For this classifier:[0m

Train Exact Match ratio: 0.9253375694996029
Test Exact Match ratio: 0.9124594719777674

Train Hamming loss: 0.04514164680963728
Test Hamming loss: 0.05697081982399259


### iv. Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

In [98]:
# Best C is the weight of the SVM penalty

grid = {'C': np.logspace(1, 5, 5)}
split = StratifiedKFold(10, random_state=42, shuffle=True)
best_classifier = {}
y_pred_train_df = pd.DataFrame(columns=y_train.columns)
y_pred_test_df = pd.DataFrame(columns=y_train.columns)

for col in y_train.columns:
    y_train_ = y_train[col]
    y_test_ = y_test[col]
    
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(std_X_train, y_train_)
    X_test_smote, y_test_smote = smote.fit_resample(std_X_test, y_test_)

    linear_svc = LinearSVC(penalty='l1',dual=False)
    grid_cv = GridSearchCV(linear_svc, grid, cv=split, n_jobs=-1)
    grid_cv.fit(X_train_smote, y_train_smote)
    best_grid_cv = grid_cv.best_estimator_
    
    best_classifier[col] = best_grid_cv
    
    print('\033[1m'+"\nFor label:",col+'\033[0m')
    print("Best C:", grid_cv.best_params_['C'])


for col in y_train.columns:
    y_pred_train = best_classifier[col].predict(std_X_train)
    y_pred_train_df.loc[:, col] = y_pred_train
    y_pred_test = best_classifier[col].predict(std_X_test)
    y_pred_test_df.loc[:, col] = y_pred_test
    
exact_match_ratio_train, hamming_loss_train = calculate_metrics(y_train,y_pred_train_df)    
exact_match_ratio_test, hamming_loss_test = calculate_metrics(y_test,y_pred_test_df)

print('\033[1m'+"\nFor this classifier:"'\033[0m')
print("\nTrain Exact Match ratio:", exact_match_ratio_train)
print("Test Exact Match ratio:", exact_match_ratio_test)
print("\nTrain Hamming loss:",hamming_loss_train)
print("Test Hamming loss:",hamming_loss_test)

[1m
For label: Family[0m
Best C: 10.0
[1m
For label: Genus[0m
Best C: 100.0
[1m
For label: Species[0m
Best C: 100.0
[1m
For this classifier:[0m

Train Exact Match ratio: 0.8703335980937251
Test Exact Match ratio: 0.8527095877721167

Train Hamming loss: 0.06354249404289118
Test Hamming loss: 0.07704184035819052


#### Conclusion:

From the above results of exact match ratio and hamming loss of all the classifiers, it is observed that SVM with Gaussian kernal classifier without standardization performs better than all the other classifiers for our dataset. It has the highest test exact match ratio and lowest test hamming loss.

### 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

### Monte-Carlo Simulation: Perform the following procedures 50 times, and report the average and standard deviation of the 50 Hamming Distances that you calculate.

### 2. a, b, c all parts are answered by the code in the cells below.  

In [27]:
hamming_dist = []
hamming_loss = []
for i in range(1,51):
    scores = []
    
#     (a) Use k-means clustering on the whole Anuran Calls (MFCCs) Data Set 
#         (do not split the data into train and test, as we are not performing supervised learning in this exercise). 
#         Choose k belongs to {1,2,.....,50} automatically based on one of the methods provided in the slides (CH or 
#         Gap Statistics or scree plots or Silhouettes) or any other method you know.

    for k in range(2,51):
        kmeans = KMeans(n_clusters=k, random_state=i, n_jobs=-1)
        cluster_labels = kmeans.fit_predict(X_df)
        scores.append(silhouette_score(X_df,cluster_labels))
    best_k = scores.index(max(scores)) + 2

    print('\033[1m'+"\nIteration:",i,'\033[0m')
    print("\nBest K value:",best_k)
    
#     build clusterer on best k
    
#     (b) In each cluster, determine which family is the majority by reading the true labels. Repeat for genus 
#         and species

    kmeans_ = KMeans(n_clusters=best_k,random_state=i,n_jobs=-1)
    cluster_labels = kmeans_.fit_predict(X_df)
    
    most_freq_label = {}
    y_df_temp = y_df.copy()
    y_df_temp["predicted_label"] = cluster_labels
    
    miss_classified = 0
    for class_index in range(best_k):
        correct_classified = y_df_temp[y_df_temp["predicted_label"] == class_index]
        temp = dict()
        for column in y_df_temp.columns[:-1]:
            temp[column] = Counter(correct_classified[column]).most_common(1)[0][0]
            miss_classified += sum(correct_classified[column] != temp[column])
        most_freq_label[class_index] = temp
    freq_df = pd.DataFrame(most_freq_label).T
    
    print("\nDetermining the majority family, genus and species:\n")
    print(freq_df)
    
    hamming_dist_instance = miss_classified / y_df.shape[0]
    hamming_loss_instance = miss_classified / (y_df.shape[0] * y_df.shape[1])
    
    hamming_dist.append(hamming_dist_instance)
    hamming_loss.append(hamming_loss_instance)
    
    print("\nHamming distance:", hamming_dist_instance)
    print("\nHamming loss:", hamming_loss_instance)

[1m
Iteration: 1 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus                 Species
0          Hylidae  Hypsiboas    HypsiboasCinerascens
1    Dendrobatidae   Ameerega      Ameeregatrivittata
2  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
3          Hylidae  Hypsiboas       HypsiboasCordobae

Hamming distance: 0.66726893676164

Hamming loss: 0.2224229789205467
[1m
Iteration: 2 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus                 Species
0          Hylidae  Hypsiboas       HypsiboasCordobae
1  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
2          Hylidae  Hypsiboas       HypsiboasCordobae
3  Leptodactylidae  Adenomera          AdenomeraAndre

Hamming distance: 0.7357887421820709

Hamming loss: 0.24526291406069028
[1m
Iteration: 3 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus      

[1m
Iteration: 20 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus                 Species
0          Hylidae  Hypsiboas       HypsiboasCordobae
1  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
2          Hylidae  Hypsiboas    HypsiboasCinerascens
3  Leptodactylidae  Adenomera          AdenomeraAndre

Hamming distance: 0.7021542738012508

Hamming loss: 0.23405142460041695
[1m
Iteration: 21 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus                 Species
0  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
1          Hylidae  Hypsiboas       HypsiboasCordobae
2          Hylidae  Hypsiboas    HypsiboasCinerascens
3    Dendrobatidae   Ameerega      Ameeregatrivittata

Hamming distance: 0.66726893676164

Hamming loss: 0.2224229789205467
[1m
Iteration: 22 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus   

[1m
Iteration: 39 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus                 Species
0          Hylidae  Hypsiboas    HypsiboasCinerascens
1          Hylidae  Hypsiboas       HypsiboasCordobae
2  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
3    Dendrobatidae   Ameerega      Ameeregatrivittata

Hamming distance: 0.66726893676164

Hamming loss: 0.2224229789205467
[1m
Iteration: 40 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus                 Species
0  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
1          Hylidae  Hypsiboas       HypsiboasCordobae
2          Hylidae  Hypsiboas    HypsiboasCinerascens
3    Dendrobatidae   Ameerega      Ameeregatrivittata

Hamming distance: 0.66726893676164

Hamming loss: 0.2224229789205467
[1m
Iteration: 41 [0m

Best K value: 4

Determining the majority family, genus and species:

            Family      Genus      

In [28]:
# (c) Now for each cluster you have a majority label triplet (family, genus, species). Calculate the average 
# Hamming distance, Hamming score, and Hamming loss between the true labels and the labels assigned by clusters.


print('\033[1m'+"\nAverage and Standard deviation:"'\033[0m')
    
print("\nThe average Hamming Distance:",np.mean(hamming_dist))
print("The standard deviation of Hamming Distance:",np.std(hamming_dist))

print("\nThe average Hamming Loss:",np.mean(hamming_loss))
print("The standard deviation of Hamming Loss:",np.std(hamming_loss))

print("\nThe average Hamming Score:",1-np.mean(hamming_loss))
print("The standard deviation of Hamming Score:",1-np.std(hamming_loss))

[1m
Average and Standard deviation:[0m

The average Hamming Distance: 0.6712077831827656
The standard deviation of Hamming Distance: 0.031013504080750307

The average Hamming Loss: 0.22373592772758863
The standard deviation of Hamming Loss: 0.010337834693583431

The average Hamming Score: 0.7762640722724113
The standard deviation of Hamming Score: 0.9896621653064166


**Please ignore the above calculated standard deviation of Hamming Score value. The correct value is as follows:**

The standard deviation of Hamming Score: 0.010337834693583431

### References

1. https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn
2. https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
3. https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
4. https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
5. https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html
6. https://machinelearningmastery.com/distance-measures-for-machine-learning/
7. https://mmuratarat.github.io/2020-01-25/multilabel_classification_metrics