In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.multioutput import ClassifierChain

In [3]:
cwd = os.getcwd()
print(cwd)
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in '%s': %s" % (cwd, files))

os.chdir(cwd)
data_df = pd.read_csv('Frogs_MFCCs.csv')
#print(data_df)
X = data_df.loc[:, :'MFCCs_22']
y = data_df.loc[:, 'Family':'Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

warnings.filterwarnings("ignore", category=DeprecationWarning) 
class_labels = ["Family","Genus","Species"]

/home/jovyan/binder
Files in '/home/jovyan/binder': ['environment.yml', 'Index.ipynb', 'apt.txt', 'HW 6 - Somasekhar Suryadevara - 3461071540.ipynb', 'Frogs_MFCCs.csv', '.ipynb_checkpoints']
(5036, 22)
(2159, 22)
(5036, 3)
(2159, 3)


In [3]:
best_params = []
for cla in class_labels: 
    C_range = np.logspace(1, 3, 10)
    gamma_range = np.logspace(-1, 2, 10)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(kernel = 'rbf', decision_function_shape = 'ovr'), param_grid=param_grid, cv=cv)
    grid.fit(X_train, y_train.loc[:,cla])
    best_params.append(grid.best_params_)
    #print(grid.cv_results_)
    print('fit complete')
    print("The best parameters are %s with an average cross-validated score of %0.2f for classifying the label %s" % (grid.best_params_, grid.best_score_, cla))

multi_label = []
#best_params[i]["gamma"]
for i in range(3):
    gaussian_svm = SVC(C = best_params[i]["C"], kernel = 'rbf', decision_function_shape = 'ovr', gamma = best_params[i]["gamma"])
    gaussian_svm.fit(X_train, y_train.loc[:,class_labels[i]])
    y_pred = gaussian_svm.predict(X_test)
    multi_label.append(y_pred.tolist())
    print('Test accuracy while classifying %s is %0.2f' % (class_labels[i], accuracy_score(y_test.loc[:,class_labels[i]], y_pred)))

multi_label = np.asarray(multi_label)
multi_label_test = multi_label.reshape(3,-1)
multi_label_test = np.transpose(multi_label_test)
#multi_label_test = pd.DataFrame(multi_label)
#print(multi_label_test)
y_test_n = y_test.values

tot = 0
for i in range(multi_label_test.shape[0]):
    if(multi_label_test[i][0] == y_test_n[i][0] and multi_label_test[i][1] == y_test_n[i][1] and multi_label_test[i][2] == y_test_n[i][2]):
        tot = tot + 1

exact_match = float(tot)/float(multi_label_test.shape[0])
print('Exact match = %0.5f' % (exact_match))

#multi_label_test = pd.DataFrame(multi_label_test)
print('Hamming loss = %0.5f' % (np.sum(np.not_equal(multi_label_test, y_test_n))/float(y_test_n.size)))

fit complete
The best parameters are {'C': 10.0, 'gamma': 3.1622776601683795} with an average cross-validated score of 0.99 for classifying the label Family
fit complete
The best parameters are {'C': 10.0, 'gamma': 3.1622776601683795} with an average cross-validated score of 0.99 for classifying the label Genus
fit complete
The best parameters are {'C': 10.0, 'gamma': 3.1622776601683795} with an average cross-validated score of 0.99 for classifying the label Species
Test accuracy while classifying Family is 1.00
Test accuracy while classifying Genus is 0.99
Test accuracy while classifying Species is 0.99
Exact match = 0.98749
Hamming loss = 0.00880


In [4]:
best_params_L1 = []
for cla in class_labels: 
    C_range_L1 = np.logspace(-1, 2, 10)
    param_grid_L1 = dict(C=C_range_L1)
    cv_L1 = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    grid_L1 = GridSearchCV(LinearSVC(penalty = 'l1', loss = 'l2', multi_class = 'ovr', dual = False), param_grid=param_grid_L1, cv=cv_L1)
    grid_L1.fit(X_train, y_train.loc[:,cla])
    #print(grid_L1.cv_results_)
    best_params_L1.append(grid_L1.best_params_)
    print('fit complete')
    print("The best parameters are %s with an average cross-validated score of %0.2f for classifying the label %s using L1 SVM" % (grid_L1.best_params_, grid_L1.best_score_, cla))

multi_label_L1 = []
#best_params_L1[i]["C"]
for i in range(3):
    L1_svm = LinearSVC(penalty = 'l1', loss = 'l2', C = best_params_L1[i]["C"], multi_class = 'ovr', dual = False)
    L1_svm.fit(X_train, y_train.loc[:,class_labels[i]])
    y_pred_L1 = L1_svm.predict(X_test)
    multi_label_L1.append(y_pred_L1.tolist())
    print('Test accuracy while classifying %s using L1 regulairized SVM is %0.2f' % (class_labels[i], accuracy_score(y_test.loc[:,class_labels[i]], y_pred_L1)))

multi_label_L1 = np.asarray(multi_label_L1)
multi_label_test_L1 = multi_label_L1.reshape(3,-1)
multi_label_test_L1 = np.transpose(multi_label_test_L1)
#multi_label_test = pd.DataFrame(multi_label)
#print(multi_label_test)
y_test_n = y_test.values

tot = 0
for i in range(multi_label_test_L1.shape[0]):
    if(multi_label_test_L1[i][0] == y_test_n[i][0] and multi_label_test_L1[i][1] == y_test_n[i][1] and multi_label_test_L1[i][2] == y_test_n[i][2]):
        tot = tot + 1

exact_match_L1 = float(tot)/float(multi_label_test_L1.shape[0])
print('Exact match for L1 svm = %0.5f' % (exact_match_L1))

#multi_label_test = pd.DataFrame(multi_label_test)
print('Hamming loss for L1 svm = %0.5f' % (np.sum(np.not_equal(multi_label_test_L1, y_test_n))/float(y_test_n.size)))

fit complete
The best parameters are {'C': 3.1622776601683795} with an average cross-validated score of 0.94 for classifying the label Family using L1 SVM
fit complete
The best parameters are {'C': 100.0} with an average cross-validated score of 0.95 for classifying the label Genus using L1 SVM
fit complete
The best parameters are {'C': 17.78279410038923} with an average cross-validated score of 0.96 for classifying the label Species using L1 SVM
Test accuracy while classifying Family using L1 regulairized SVM is 0.93
Test accuracy while classifying Genus using L1 regulairized SVM is 0.94
Test accuracy while classifying Species using L1 regulairized SVM is 0.96
Exact match for L1 svm = 0.91061
Hamming loss for L1 svm = 0.05697


In [7]:
y1 = []
for val in y_train.loc[:,"Family"]:
    if(val == "Bufonidae"):
        y1.append(0)
    elif(val == "Dendrobatidae"):
        y1.append(1)
    elif(val == "Hylidae"):
        y1.append(2)
    else:
        y1.append(3)

y2 = []
for val in y_train.loc[:,"Genus"]:
    if(val == "Adenomera"):
        y2.append(0)
    elif(val == "Ameerega"):
        y2.append(1)
    elif(val == "Dendropsophus"):
        y2.append(2)
    elif(val == "Hypsiboas"):
        y2.append(3)
    elif(val == "Leptodactylus"):
        y2.append(4)
    elif(val == "Osteocephalus"):
        y2.append(5)
    elif(val == "Rhinella"):
        y2.append(6)
    else:
        y2.append(7)

y3 = []
for val in y_train.loc[:,"Species"]:
    if(val == "AdenomeraAndre"):
        y3.append(0)
    elif(val == "AdenomeraHylaedactylus"):
        y3.append(1)
    elif(val == "Ameeregatrivittata"):
        y3.append(2)
    elif(val == "HylaMinuta"):
        y3.append(3)
    elif(val == "HypsiboasCinerascens"):
        y3.append(4)
    elif(val == "HypsiboasCordobae"):
        y3.append(5)
    elif(val == "LeptodactylusFuscus"):
        y3.append(6)
    elif(val == "OsteocephalusOophagus"):
        y3.append(7)
    elif(val == "Rhinellagranulosa"):
        y3.append(8)
    else:
        y3.append(9)

Y_train = []
Y_train.append(y1)
Y_train.append(y2)
Y_train.append(y3)
Y_train = np.asarray(Y_train)
Y_train = Y_train.reshape(3,-1)
Y_train = np.transpose(Y_train)

y1 = []
for val in y_test.loc[:,"Family"]:
    if(val == "Bufonidae"):
        y1.append(0)
    elif(val == "Dendrobatidae"):
        y1.append(1)
    elif(val == "Hylidae"):
        y1.append(2)
    else:
        y1.append(3)

y2 = []
for val in y_test.loc[:,"Genus"]:
    if(val == "Adenomera"):
        y2.append(0)
    elif(val == "Ameerega"):
        y2.append(1)
    elif(val == "Dendropsophus"):
        y2.append(2)
    elif(val == "Hypsiboas"):
        y2.append(3)
    elif(val == "Leptodactylus"):
        y2.append(4)
    elif(val == "Osteocephalus"):
        y2.append(5)
    elif(val == "Rhinella"):
        y2.append(6)
    else:
        y2.append(7)

y3 = []
for val in y_test.loc[:,"Species"]:
    if(val == "AdenomeraAndre"):
        y3.append(0)
    elif(val == "AdenomeraHylaedactylus"):
        y3.append(1)
    elif(val == "Ameeregatrivittata"):
        y3.append(2)
    elif(val == "HylaMinuta"):
        y3.append(3)
    elif(val == "HypsiboasCinerascens"):
        y3.append(4)
    elif(val == "HypsiboasCordobae"):
        y3.append(5)
    elif(val == "LeptodactylusFuscus"):
        y3.append(6)
    elif(val == "OsteocephalusOophagus"):
        y3.append(7)
    elif(val == "Rhinellagranulosa"):
        y3.append(8)
    else:
        y3.append(9)

Y_test = []
Y_test.append(y1)
Y_test.append(y2)
Y_test.append(y3)
Y_test = np.asarray(Y_test)
Y_test = Y_test.reshape(3,-1)
Y_test = np.transpose(Y_test)
print(Y_test)


chains = [ClassifierChain(gaussian_svm, order='random', random_state=i) for i in range(3)]
for chain in chains:
    chain.fit(X_train, Y_train)

Y_pred_chains = np.array([chain.predict(X_test) for chain in chains])

chain_acc = []
for i in range(3):
    tot_chain = 0
    for j in range(Y_test.shape[0]):
        if((Y_pred_chains[i][j][0] == Y_test[j][0]) and (Y_pred_chains[i][j][1] == Y_test[j][1]) and  (Y_pred_chains[i][j][2] == Y_test[j][2])):
            tot_chain = tot_chain + 1
    acc = tot_chain/float(Y_test.shape[0])
    chain_acc.append(acc)
#chain_hamming_loss = [accuracy_score(Y_test.tolist(), Y_pred_chain.tolist()) for Y_pred_chain in Y_pred_chains]
Y_pred_ensemble = Y_pred_chains.mean(axis=0)
ensemble_acc = 0
tot_en = 0
for k in range(Y_test.shape[0]):
    if((Y_pred_ensemble[k][0] == Y_test[k][0]) and (Y_pred_ensemble[k][1] == Y_test[k][1]) and  (Y_pred_ensemble[k][2] == Y_test[k][2])):
        tot_en = tot_en + 1
ensemble_acc = tot_en/float(Y_test.shape[0])

print('Classifier Chain accuracy through stages')
print(chain_acc)
print('Ensemble classifier chain accuracy')
print(ensemble_acc)
print('Hamming loss for gaussian svm %0.5f' % (np.sum(np.not_equal(Y_pred_ensemble, Y_test))/float(3*Y_test.shape[0])))

chains_L1 = [ClassifierChain(LinearSVC(penalty = 'l1', loss = 'l2', C = 10, multi_class = 'ovr', dual = False), order='random', random_state=i) for i in range(4)]
for chain in chains_L1:
    chain.fit(X_train, Y_train)

Y_pred_chains_L1 = np.array([chain.predict(X_test) for chain in chains_L1])

chain_acc_L1 = []
for i in range(3):
    tot_chain = 0
    for j in range(Y_test.shape[0]):
        if((Y_pred_chains_L1[i][j][0] == Y_test[j][0]) and (Y_pred_chains_L1[i][j][1] == Y_test[j][1]) and  (Y_pred_chains_L1[i][j][2] == Y_test[j][2])):
            tot_chain = tot_chain + 1
    acc = tot_chain/float(Y_test.shape[0])
    chain_acc_L1.append(acc)

Y_pred_ensemble_L1 = Y_pred_chains_L1.mean(axis=0)
ensemble_acc_L1 = 0
tot_en = 0
for k in range(Y_test.shape[0]):
    if((Y_pred_ensemble_L1[k][0] == Y_test[k][0]) and (Y_pred_ensemble_L1[k][1] == Y_test[k][1]) and  (Y_pred_ensemble_L1[k][2] == Y_test[k][2])):
        tot_en = tot_en + 1
ensemble_acc_L1 = tot_en/float(Y_test.shape[0])

print('Classifier Chain accuracy through stages for L1 SVM')
print(chain_acc_L1)
print('Ensemble classifier chain accuracy with L1 SVM base classifier')
print(ensemble_acc_L1)
print('Hamming loss for linear svm %0.4f' % (np.sum(np.not_equal(Y_pred_ensemble_L1, Y_test))/float(3*Y_test.shape[0])))

[[3 0 1]
 [2 3 4]
 [2 3 5]
 ...
 [3 0 1]
 [3 0 1]
 [3 0 1]]
Classifier Chain accuracy through stages
[0.9884205650764243, 0.9888837424733673, 0.9884205650764243]
Ensemble classifier chain accuracy
0.9884205650764243
Hamming loss for gaussian svm 0.00957
Classifier Chain accuracy through stages for L1 SVM
[0.9383974062065771, 0.9180176007410839, 0.937471051412691]
Ensemble classifier chain accuracy with L1 SVM base classifier
0.9073645206113942
Hamming loss for linear svm 0.0828
