## 1. Multi-class and Multi-Label Classification Using Support Vector Machines
## (a) Download the Anural Calls(MFCCs) Data Set. Choose 70% of the data randomly as the training set.

In [226]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [227]:
filepath = 'https://raw.githubusercontent.com/seongohr/ML/master/Frogs_MFCCs.csv'
df = pd.read_csv(filepath, sep=',')

In [228]:
print("row, col : ", df.shape)

row, col :  (7195, 26)


In [229]:
from sklearn.model_selection import train_test_split

x = df.loc[:,:'MFCCs_22']
y = df.loc[:,'Family':'Species']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1, stratify=y)

In [230]:
print("x_train(row, col) : ", x_train.shape)
print("y_train(row, col) : ", y_train.shape)
print("x_test(row, col) : ", x_test.shape)
print("y_test(row, col) : ", y_test.shape)

x_train(row, col) :  (5036, 22)
y_train(row, col) :  (5036, 3)
x_test(row, col) :  (2159, 22)
y_test(row, col) :  (2159, 3)


In [6]:
#y_train

## (b) We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-class classification is to train a classifier for each label. We first try this approach:
## i. Research exact match and hamming score/loss methods for evaluating multi-label classification and use them in evaluating the classifiers in this problem.

### Answer : 
### 1) Exact Match : This calculates subset accuracy meaning the predicted set of labels should exactly match with the true set of labels.
### 2) Hamming Loss : The fraction of the wrong labels to the total number of labels.

In [150]:
#Exact Match
def getExactMatch(pred_val, true_val, table, test):
    pred_val = np.array(pred_val)
    pred_val = np.transpose(pred_val)
    result = (pred_val == np.array(true_val))
    total_res = np.where(result == 'False')
    error = len(total_res[0])
    exactMatch = 1 - (error/len(pred_val))
    if test:
        return min(table['test exact match'])
    return min(table['train exact match'])

In [242]:
#Hamming Loss
from sklearn.metrics import hamming_loss

def getHammingLoss_(pred_val, true_val, table, test):
    total_hamming = 0
    pred_val = np.array(pred_val)
    pred_val = np.transpose(pred_val)
    pred_val = pd.DataFrame(pred_val)
    for i in range(len(true_val)):
        total_hamming += hamming_loss(true_val.iloc[i], pred_val.iloc[i])
    total_hamming = total_hamming/true_val.shape[0]
    return total_hamming

## ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation. You are welcome to try to solve the problem with both standardized and raw attributes and report the results.

In [7]:
# standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#print('before standardize : ', x_train)
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

#print('standardize : ', x_train)

In [8]:
labels = y_train.columns.values.tolist()
labels

['Family', 'Genus', 'Species']

In [80]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000], 'gamma': np.arange(0.1, 2, 0.1), 'kernel': ['rbf']} 
param_svm = []

svc = SVC()
grid = GridSearchCV(svc,param_grid,cv=10)

for col in labels:
  grid.fit(x_train,y_train[col])
  best_params = grid.best_params_
  param_svm.append(best_params)
  print(best_params)


{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [81]:
param_svm

[{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'},
 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'},
 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}]

In [238]:
# saved the result
param_svm_result_saved = [{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'},
                          {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'},
                          {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}]

In [239]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

score_table = []
test_predictions_svc = []
train_predictions_svc = []

for i in range(len(labels)):
    svc = SVC(C=param_svm_result_saved[i]['C'], gamma=param_svm_result_saved[i]['gamma'])
    svc.fit(x_train,y_train[labels[i]])

    train_pred = svc.predict(x_train)
    train_predictions_svc.append(train_pred)

    test_pred = svc.predict(x_test)
    test_predictions_svc.append(test_pred)


    train_exact_match = accuracy_score(y_train[labels[i]], train_pred)
    train_hamming_loss = hamming_loss(y_train[labels[i]], train_pred)

    test_exact_match = accuracy_score(y_test[labels[i]], test_pred)
    test_hamming_loss = hamming_loss(y_test[labels[i]], test_pred)  

    score_table.append([labels[i], train_exact_match, train_hamming_loss,
                     test_exact_match, test_hamming_loss])



In [240]:
score_table = pd.DataFrame(score_table, columns = ['label','train exact match',
                                                   'train hamming loss',
                                                   'test exact match',
                                                   'test hamming loss'])
score_table

Unnamed: 0,label,train exact match,train hamming loss,test exact match,test hamming loss
0,Family,0.974583,0.025417,0.974062,0.025938
1,Genus,0.97915,0.02085,0.971746,0.028254
2,Species,0.983717,0.016283,0.972673,0.027327


In [246]:
total_train_exact_match = getExactMatch(train_predictions_svc, y_train, score_table, 0)
total_train_hamming_loss = getHammingLoss_(train_predictions_svc, y_train, score_table, 0)
print('overall train exact match : ', total_train_exact_match)
print('overall train hamming loss : ', total_train_hamming_loss)

total_test_exact_match = getExactMatch(test_predictions_svc, y_test, score_table, 1)
total_test_hamming_loss = getHammingLoss_(test_predictions_svc, y_test, score_table, 1)
print('overall test exact match : ', total_test_exact_match)
print('overall test hamming loss : ', total_test_hamming_loss)

  


overall train exact match :  0.9745830023828436
overall train hamming loss :  0.02084988085782367
overall test exact match :  0.9717461787864752
overall test hamming loss :  0.027173073953991038


## iii. Repeat 1(b)ii with L1-penalized SVMs. Remember to standardize the attributes. Determine the weight of the SVM penalty using 10 fold cross validation.

In [96]:
import warnings
from sklearn.svm import LinearSVC

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]} 
param_svm_L1 = []

l1_svm = LinearSVC(loss='l2', penalty='l1', dual=False, tol=0.001)
#warnings.filterwarnings("ignore", category=FutureWarning)

grid = GridSearchCV(l1_svm,param_grid,cv=10)
#warnings.filterwarnings("ignore", category=FutureWarning)

for col in labels:
    grid.fit(x_train,y_train[col])
    #  warnings.filterwarnings("ignore", category=FutureWarning)
    best_params = grid.best_params_
    param_svm_L1.append(best_params)
    print(best_params)





{'C': 10}




{'C': 10}




{'C': 10}




In [100]:
print(param_svm_L1)

[{'C': 10}, {'C': 10}, {'C': 10}]


In [128]:
param_svm_L1_saved = [{'C': 10}, {'C': 10}, {'C': 10}]

In [131]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.svm import LinearSVC

l1_score_table = []
test_predictions_l1_svm = []
train_predictions_l1_svm = []

for i in range(len(labels)):
    l1_svm = LinearSVC(C=param_svm_L1_saved[i]['C'], loss='l2', penalty='l1',
                     dual=False, tol=0.001)
    l1_svm.fit(x_train,y_train[labels[i]])
    train_pred = l1_svm.predict(x_train)
    train_predictions_l1_svm.append(train_pred)

    test_pred = l1_svm.predict(x_test)
    test_predictions_l1_svm.append(test_pred)

    train_exact_match = accuracy_score(y_train[labels[i]], train_pred)
    train_hamming_loss = hamming_loss(y_train[labels[i]], train_pred)
    test_exact_match = accuracy_score(y_test[labels[i]], test_pred)
    test_hamming_loss = hamming_loss(y_test[labels[i]], test_pred)  

    l1_score_table.append([labels[i], train_exact_match, train_hamming_loss,
                     test_exact_match, test_hamming_loss])



In [132]:
l1_score_table = pd.DataFrame(l1_score_table, columns = ['label','train exact match',
                                                   'train hamming loss',
                                                   'test exact match',
                                                   'test hamming loss'])
l1_score_table

Unnamed: 0,label,train exact match,train hamming loss,test exact match,test hamming loss
0,Family,0.940429,0.059571,0.93145,0.06855
1,Genus,0.958102,0.041898,0.948124,0.051876
2,Species,0.96664,0.03336,0.957388,0.042612


In [247]:
total_train_exact_match = getExactMatch(train_predictions_l1_svm, y_train, l1_score_table, 0)
total_train_hamming_loss = getHammingLoss_(train_predictions_l1_svm, y_train, l1_score_table, 0)
print('overall train exact match : ', total_train_exact_match)
print('overall train hamming loss : ', total_train_hamming_loss)

total_test_exact_match = getExactMatch(test_predictions_l1_svm, y_test, l1_score_table, 1)
total_test_hamming_loss = getHammingLoss_(test_predictions_l1_svm, y_test, l1_score_table, 1)
print('overall test exact match : ', total_test_exact_match)
print('overall test hamming loss : ', total_test_hamming_loss)

  


overall train exact match :  0.9404289118347895
overall train hamming loss :  0.04494307651575334
overall test exact match :  0.9314497452524316
overall test hamming loss :  0.05434614790798204


## iv. Repeat 1(b)ii with using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

### Answer : Here, SVM with Gaussian kernel had a higher accuracy than L-1 penalized SVM. And class imbalance didn't affect the result.

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000], 'gamma': np.arange(0.1, 2, 0.1), 'kernel': ['rbf']} 
param_svm_smote = []

svc_smote = SVC(class_weight='balanced')
grid = GridSearchCV(svc_smote,param_grid,cv=10)

for col in labels:
  grid.fit(x_train,y_train[col])
  best_params = grid.best_params_
  param_svm_smote.append(best_params)
  print(best_params)



{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}




{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}




{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}


In [0]:
#param_svm_smote

In [10]:
param_svm_smote_s = param_svm_smote
param_svm_smote_s

[{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'},
 {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'},
 {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}]

In [12]:
#saved the result
param_svm_smote_saved = [{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'},
                         {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'},
                         {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}]

In [133]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.svm import SVC

smote_score_table = []
test_predictions_svc_smote = []
train_predictions_svc_smote = []

for i in range(len(labels)):
    svc_smote = SVC(C=param_svm_smote_saved[i]['C'],
            gamma=param_svm_smote_saved[i]['gamma'],
           class_weight='balanced')
    svc_smote.fit(x_train,y_train[labels[i]])
    train_pred = svc_smote.predict(x_train)
    train_predictions_svc_smote.append(train_pred)
    
    test_pred = svc_smote.predict(x_test)
    test_predictions_svc_smote.append(test_pred)

    train_exact_match = accuracy_score(y_train[labels[i]], train_pred)
    train_hamming_loss = hamming_loss(y_train[labels[i]], train_pred)
    test_exact_match = accuracy_score(y_test[labels[i]], test_pred)
    test_hamming_loss = hamming_loss(y_test[labels[i]], test_pred)  

    smote_score_table.append([labels[i], train_exact_match, train_hamming_loss,
                     test_exact_match, test_hamming_loss])

In [134]:
smote_score_table = pd.DataFrame(smote_score_table, columns = ['label','train exact match',
                                                   'train hamming loss',
                                                   'test exact match',
                                                   'test hamming loss'])
smote_score_table

Unnamed: 0,label,train exact match,train hamming loss,test exact match,test hamming loss
0,Family,1.0,0.0,0.989347,0.010653
1,Genus,1.0,0.0,0.984715,0.015285
2,Species,1.0,0.0,0.98101,0.01899


In [248]:
total_train_exact_match = getExactMatch(train_predictions_svc_smote, y_train, smote_score_table, 0)
total_train_hamming_loss = getHammingLoss_(train_predictions_svc_smote, y_train, smote_score_table, 0)
print('overall train exact match : ', total_train_exact_match)
print('overall train hamming loss : ', total_train_hamming_loss)

total_test_exact_match = getExactMatch(test_predictions_svc_smote, y_test, smote_score_table, 1)
total_test_hamming_loss = getHammingLoss_(test_predictions_svc_smote, y_test, smote_score_table, 1)
print('overall test exact match : ', total_test_exact_match)
print('overall test hamming loss : ', total_test_hamming_loss)

  


overall train exact match :  1.0
overall train hamming loss :  0.0
overall test exact match :  0.9810097267253358
overall test hamming loss :  0.014976069167824615


## 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set
### Monte-Carlo Simulation: Perform the following procedures 50 times, and report the average and standard deviation of the 50 Hamming distances that you calculate.

## (a) Use k-means clustering on the whole Anuran Calls (MFCCs)Data Set (do not split the data into train and test, as we are not performing supervised learning in this exercise.) Choose k => {1,2,...,50} automatically based on one of the methods provided in the slides (CH or Gap Statistics or scree plots or Silhouettes) or any other method you know.

In [70]:
from sklearn.preprocessing import StandardScaler

x = df.loc[:,:'MFCCs_22']
y = df.loc[:,'Family':'Species']
print(x.describe())


scaler = StandardScaler()
#print('before standardize : ', x)
x = scaler.fit_transform(x)
x = pd.DataFrame(x)

#mms = MinMaxScaler()
#mms.fit(x)
#x = mms.transform(x)
#x = pd.DataFrame(x)

print(x.describe())


          MFCCs_ 1     MFCCs_ 2     MFCCs_ 3     MFCCs_ 4     MFCCs_ 5  \
count  7195.000000  7195.000000  7195.000000  7195.000000  7195.000000   
mean      0.989885     0.323584     0.311224     0.445997     0.127046   
std       0.069016     0.218653     0.263527     0.160328     0.162722   
min      -0.251179    -0.673025    -0.436028    -0.472676    -0.636012   
25%       1.000000     0.165945     0.138445     0.336737     0.051717   
50%       1.000000     0.302184     0.274626     0.481463     0.161361   
75%       1.000000     0.466566     0.430695     0.559861     0.222592   
max       1.000000     1.000000     1.000000     1.000000     0.752246   

          MFCCs_ 6     MFCCs_ 7     MFCCs_ 8     MFCCs_ 9     MFCCs_10  ...  \
count  7195.000000  7195.000000  7195.000000  7195.000000  7195.000000  ...   
mean      0.097939    -0.001397    -0.000370     0.128213     0.055998  ...   
std       0.120412     0.171404     0.116302     0.179008     0.127099  ...   
min      -0.41041

In [86]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

range_n_clusters = list(range(2, 51))
Sum_of_squared_distances = []
#best_K = [0,-2]
best_Ks = []
predict_label_sets = []

for i in range(50):
    best_K = [0,-2]
    predict_label = []
    for n_clusters in range_n_clusters:
        # Initialize the clusterer with n_clusters value
        clusterer = KMeans(n_clusters=n_clusters)
        cluster_labels = clusterer.fit_predict(x)
        silhouette_avg = silhouette_score(x, cluster_labels)
        
        if best_K[1] < silhouette_avg:
            best_K[0] = n_clusters
            best_K[1] = silhouette_avg
            predict_label = cluster_labels
        #print('[K, silhouette score] : ', best_K)
        # Compute the silhouette scores for each sample
        #sample_silhouette_values = silhouette_samples(x, cluster_labels)
        #Sum_of_squared_distances.append(clusterer.inertia_)
    predict_label_sets.append(predict_label)
    best_Ks.append(best_K)
    
#plt.plot(range_n_clusters, Sum_of_squared_distances, 'bx-')
#plt.xlabel('k')
#plt.ylabel('Sum_of_squared_distances')
#plt.title('Scree plot')
#plt.show()


#print(best_Ks)

In [94]:
len(predict_label_sets)

50

In [97]:
import pickle
with open('mypickle.pickle', 'wb') as f:
    pickle.dump(best_Ks, f)
    
with open('predictSets.pickle', 'wb') as f:
    pickle.dump(predict_label_sets, f)

In [105]:
with open('mypickle.pickle', 'rb') as f:
    best_Ks_saved = pickle.load(f)

with open('predictSets.pickle', 'rb') as f:
    predict_sets_saved = pickle.load(f)
#print(predict_sets[0])

In [221]:
for i in range(50):
    print(i+1, best_Ks[i])

1 [3, 0.35645091270652085]
2 [3, 0.35645091270652085]
3 [3, 0.35645091270652085]
4 [3, 0.35645091270652085]
5 [3, 0.35645091270652085]
6 [3, 0.35645091270652085]
7 [4, 0.3593932701327264]
8 [4, 0.35310938795454255]
9 [3, 0.35645091270652085]
10 [3, 0.35645091270652085]
11 [4, 0.3593932701327264]
12 [3, 0.35645091270652085]
13 [3, 0.35645091270652085]
14 [4, 0.3593932701327264]
15 [3, 0.35645091270652085]
16 [4, 0.3593932701327264]
17 [4, 0.3593932701327264]
18 [3, 0.35645091270652085]
19 [3, 0.35645091270652085]
20 [3, 0.35645091270652085]
21 [3, 0.35645091270652085]
22 [4, 0.3593932701327264]
23 [3, 0.35645091270652085]
24 [3, 0.35645091270652085]
25 [3, 0.35645091270652085]
26 [3, 0.35645091270652085]
27 [3, 0.35645091270652085]
28 [3, 0.35645091270652085]
29 [3, 0.35645091270652085]
30 [3, 0.35645091270652085]
31 [4, 0.3593932701327264]
32 [3, 0.35645091270652085]
33 [3, 0.35645091270652085]
34 [3, 0.35645091270652085]
35 [3, 0.35645091270652085]
36 [3, 0.35645091270652085]
37 [4, 0

In [212]:
#print(predict_sets_saved[0])
#temp = y.copy()
#temp['cluster'] = predict_sets_saved[0]
#temp

In [17]:
#print(Sum_of_squared_distances)
#for i in range (0,50):
#  slope = abs(Sum_of_squared_distances[i-1] - Sum_of_squared_distances[i])
#  print (i, slope)

## (b) In each cluster, determine which family is the majority by reading the true labels. Repeat for genus and species.

In [209]:
def getMajorityTable(bestK, table):

    labels_classes = [['Bufonidae', 'Dendrobatidae', 'Hylidae', 'Leptodactylidae'],
                     ['Adenomera', 'Ameerega', 'Dendropsophus', 'Hypsiboas',
                      'Leptodactylus', 'Osteocephalus', 'Rhinella', 'Scinax'],
                     ['AdenomeraAndre', 'AdenomeraHylaedactylus',
                      'Ameeregatrivittata', 'HylaMinuta','HypsiboasCinerascens',
                      'HypsiboasCordobae', 'LeptodactylusFuscus', 
                      'OsteocephalusOophagus','Rhinellagranulosa', 'ScinaxRuber']]

    #cluster_table = []
    majority_cluster_table = []
    maxNumOfRows = 0
    labels = ['Family', 'Genus', 'Species']
    #print('label classes:', len(labels_classes))
    
    #i : family, genus, species
    for i in range(len(labels_classes)):
        #print('i:', i)
        # bestK = num of clusters
        for n in range(bestK):
            #print('n:',n)
            maxNumOfRows = 0
            # j : classes in each label
            for j in range(len(labels_classes[i])):
                #print('j:', j)
                series = table.apply(lambda x: True if (x[labels[i]] == labels_classes[i][j] and 
                                                  x['cluster'] == n) else False , axis=1)
                # Count number of True in series
                numOfRows = len(series[series == True].index)
                if maxNumOfRows < numOfRows:
                    maxNumOfRows = numOfRows
                    maxClass = labels_classes[i][j]
            majority_cluster_table.append([labels[i], n, maxClass, maxNumOfRows])

        #cluster_table = pd.DataFrame(cluster_table, columns = ['label', 'cluster',
        #                                                       'class', 'count'])
        
    majority_cluster_table = pd.DataFrame(majority_cluster_table,
                                              columns = ['label', 'cluster',
                                                         'max class', 'maxcount'])
    #print(cluster_table)
    return majority_cluster_table

In [210]:
majority_table = []
for i in range(50):
    temp = y.copy()
    temp['cluster'] = predict_sets_saved[i]
    #print(temp)
    oneMajority = getMajorityTable(best_Ks_saved[i][0], temp)
    majority_table.append(oneMajority)

In [211]:
for i in range(50):
    print(majority_table[i],'\n')

     label  cluster               max class  maxcount
0   Family        0         Leptodactylidae      3463
1   Family        1                 Hylidae       594
2   Family        2                 Hylidae      1460
3    Genus        0               Adenomera      3463
4    Genus        1               Hypsiboas       533
5    Genus        2               Hypsiboas      1043
6  Species        0  AdenomeraHylaedactylus      3463
7  Species        1    HypsiboasCinerascens       454
8  Species        2       HypsiboasCordobae      1025 

     label  cluster               max class  maxcount
0   Family        0                 Hylidae      1460
1   Family        1         Leptodactylidae      3463
2   Family        2                 Hylidae       594
3    Genus        0               Hypsiboas      1043
4    Genus        1               Adenomera      3463
5    Genus        2               Hypsiboas       533
6  Species        0       HypsiboasCordobae      1025
7  Species        1  Adeno

In [214]:
import pickle
with open('majority.pickle', 'wb') as f:
    pickle.dump(majority_table, f)

In [215]:
with open('majority.pickle', 'rb') as f:
    majority_saved = pickle.load(f)

In [217]:
#for i in range(50):
    #print(majority_saved[i],'\n')