#                                           Anuran Species Recognition                   
###                                                                                                                                                       Shreyas Dinesh Patil

(a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs). Choose 70% of the data randomly as the training set.

(b) Each instance has three labels: Families, Genus, and Species. Each of the labels has multiple classes. We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-class classification is to train a classifier for each label. We first try this approach:

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.metrics import hamming_loss, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import math
from imblearn.over_sampling import SMOTE

In [5]:
df = pd.read_csv('Frogs_MFCCs.csv')
X = df.drop(labels=['RecordID', 'Family', 'Genus', 'Species'] , axis=1)

y = pd.DataFrame(df, columns = ['Family', 'Genus', 'Species'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#Train labels
y1_trn = y_train['Family']
y2_trn = y_train['Genus']
y3_trn = y_train['Species']
#Test labels
y1_tst = y_test['Family']
y2_tst = y_test['Genus']
y3_tst = y_test['Species']

#Encoding labels

y1_trn = y1_trn.astype('category')
y2_trn = y2_trn.astype('category')
y3_trn = y3_trn.astype('category')
y1_trnc = y1_trn.cat.codes
y2_trnc = y2_trn.cat.codes
y3_trnc = y3_trn.cat.codes

y1_tst = y1_tst.astype('category')
y2_tst = y2_tst.astype('category')
y3_tst = y3_tst.astype('category')
y1_tstc = y1_tst.cat.codes
y2_tstc = y2_tst.cat.codes
y3_tstc = y3_tst.cat.codes
y_tstc = pd.concat([y1_tstc, y2_tstc, y3_tstc], axis=1)

(ii) Training a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determining the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation.

In [9]:
#For all labels

best_score1 = 0
best_score2 = 0
best_score3 = 0
accuracy_score1 = list()
accuracy_score2 = list()
accuracy_score3 = list()

for log_c in range(-3,4):
    sigma = .1
    while sigma<=2:  
        g = 1/(2*(sigma**2))
        c = 10**log_c
        clfr = svm.SVC(C=c, decision_function_shape='ovr', gamma=g)
        accuracy_score1.append(cross_val_score(clfr, X_train, y1_trnc, cv=10, scoring='accuracy').mean())
        accuracy_score2.append(cross_val_score(clfr, X_train, y2_trnc, cv=10, scoring='accuracy').mean())
        accuracy_score3.append(cross_val_score(clfr, X_train, y3_trnc, cv=10, scoring='accuracy').mean())
        accuracy1 = max(accuracy_score1)
        accuracy2 = max(accuracy_score2)
        accuracy3 = max(accuracy_score3)
        sigma = sigma + .1
        if accuracy1 > best_score1:
            best_score1 = accuracy1
            best_c1 = c
            best_gamma1 = g
        if accuracy2 > best_score2:
            best_score2 = accuracy2
            best_c2 = c
            best_gamma2 = g
        if accuracy3 > best_score3:
            best_score3 = accuracy3
            best_c3 = c
            best_gamma3 = g
        
print('The weight of SVM penalty for 1st classifier is:', best_c1)
print('The width of the Gaussian kernel for 1st classifier is:', best_gamma1)

print('The weight of SVM penalty for 2nd classifier is:', best_c2)
print('The width of the Gaussian kernel for 2nd classifier is:', best_gamma2)

print('The weight of SVM penalty for 3rd classifier is:', best_c3)
print('The width of the Gaussian kernel for 3rd classifier is:', best_gamma3)

The weight of SVM penalty for 1st classifier is: 100
The width of the Gaussian kernel for 1st classifier is: 2.0
The weight of SVM penalty for 2nd classifier is: 10
The width of the Gaussian kernel for 2nd classifier is: 1.3888888888888888
The weight of SVM penalty for 3rd classifier is: 10
The width of the Gaussian kernel for 3rd classifier is: 1.3888888888888888


In [10]:
#Training for label1
clfr = svm.SVC(C=best_c1, decision_function_shape='ovr', gamma=best_gamma1)
clfr.fit(X_train, y1_trnc)

#Prediction for label1
y1pred = clfr.predict(X_test)

#Training for label2
clfr = svm.SVC(C=best_c2, decision_function_shape='ovr', gamma=best_gamma2)
clfr.fit(X_train, y2_trnc)

#Prediction for label2
y2pred = clfr.predict(X_test)

#Training for label3
clfr = svm.SVC(C=best_c3, decision_function_shape='ovr', gamma=best_gamma3)
clfr.fit(X_train, y3_trnc)

#Prediction for label3
y3pred = clfr.predict(X_test)

#Dataframe of predicted labels
y1pred = pd.DataFrame(y1pred, columns= [0])
y2pred = pd.DataFrame(y2pred, columns= [1])
y3pred = pd.DataFrame(y3pred, columns= [2])

ypred = pd.concat([y1pred, y2pred, y3pred], axis=1)

In [11]:
#Function for Hamming_scores and exact_match

def hamming_score(y_tstc, ypred):
    loss = 0
    for i in range(0, len(y_tstc)):
        losscore = 0
        for j in range(0, 3):
            if y_tstc.iloc[i, j] != ypred.iloc[i, j]:
               losscore = losscore + 1 
            losscore = losscore/3
            loss = loss + losscore
    loss = loss/(len(y_tstc))
    return(loss)

def exact_match(y_tstc, ypred):
    exactmatch = 0
    for i in range(0, len(y_tstc)):
        if ypred.iloc[i, 0]==y_tstc.iloc[i, 0] and ypred.iloc[i, 1]==y_tstc.iloc[i, 1] and ypred.iloc[i, 2]==y_tstc.iloc[i, 2]:
            exactmatch = exactmatch + 1
    exactmatch = exactmatch/(len(y_tstc))
    return exactmatch

In [12]:
#Evaluating the classifier

#Hamming loss
hmloss = hamming_score(y_tstc, ypred)
print('The Hamming loss for svm with gaussian kernel is', hmloss)

#Exact match
exactscore = exact_match(y_tstc, ypred)
print('The exact match for svm with gaussian kernel is', exactscore)

The Hamming loss for svm with gaussian kernel is 0.009949736675072474
The exact match for svm with gaussian kernel is 0.9874942102825383


b(iii) Repeat 1(b)ii with L1-penalized SVMs. 

In [13]:
#For all labels

best_score1 = 0
best_score2 = 0
best_score3 = 0
accuracy_score1 = list()
accuracy_score2 = list()
accuracy_score3 = list()

for log_c in range(-3,4):
    c = 10**log_c
    linclfr = svm.LinearSVC(C=c, penalty='l1', dual=False, multi_class='ovr')
    accuracy_score1.append(cross_val_score(linclfr, X_train, y1_trnc, cv=10, scoring='accuracy').mean())
    accuracy_score2.append(cross_val_score(linclfr, X_train, y2_trnc, cv=10, scoring='accuracy').mean())
    accuracy_score3.append(cross_val_score(linclfr, X_train, y3_trnc, cv=10, scoring='accuracy').mean())
    accuracy1 = max(accuracy_score1)
    accuracy2 = max(accuracy_score2)
    accuracy3 = max(accuracy_score3)
    if accuracy1>best_score1:
       best_score1 = accuracy1
       best_c1 = c
    if accuracy2>best_score2:
       best_score2 = accuracy2
       best_c2 = c
    if accuracy3>best_score3:
       best_score3 = accuracy3
       best_c3 = c
        

print('The weight of L1-penalized SVM for 1st classifier is:', best_c1)
print('The weight of L1-penalized SVM for 2nd classifier is:', best_c2)
print('The weight of L1-penalized SVM for 3rd classifier is:', best_c3)

The weight of L1-penalized SVM for 1st classifier is: 100
The weight of L1-penalized SVM for 2nd classifier is: 10
The weight of L1-penalized SVM for 3rd classifier is: 10


In [23]:
#Training for label1
linclfr1 = svm.LinearSVC(C=best_c1, penalty='l1', dual=False, multi_class='ovr')
linclfr1.fit(X_train, y1_trnc)

#Prediction for label1
y1pred = linclfr1.predict(X_test)

#Training for label2
linclfr2 = svm.LinearSVC(C=best_c2, penalty='l1', dual=False, multi_class='ovr')
linclfr2.fit(X_train, y2_trnc)

#Prediction for label2
y2pred = linclfr2.predict(X_test)

#Training for label3
linclfr3 = svm.LinearSVC(C=best_c3, penalty='l1', dual=False, multi_class='ovr')
linclfr3.fit(X_train, y3_trnc)

#Prediction for label3
y3pred = linclfr3.predict(X_test)

#Dataframe of predicted labels
y1pred = pd.DataFrame(y1pred, columns= [0])
y2pred = pd.DataFrame(y2pred, columns= [1])
y3pred = pd.DataFrame(y3pred, columns= [2])

ypred = pd.concat([y1pred, y2pred, y3pred], axis=1)

In [24]:
#Evaluating the classifier

#Hamming loss
hmloss = hamming_score(y_tstc, ypred)
print('The Hamming loss for Linear SVC is', hmloss)

#Exact match
exactscore = exact_match(y_tstc, ypred)
print('The exact match for Linear SVC is', exactscore)

The Hamming loss for Linear SVC is 0.0653423224057778
The exact match for Linear SVC is 0.9189439555349699


b(iv) Repeat 1(b)iii by using SMOTE to counter class imbalance.

In [16]:
#SMOTE for counteracting class imbalance
sm = SMOTE(kind='svm')
X_trnew1, y_trnew1 = sm.fit_sample(X_train, y1_trnc)
X_trnew2, y_trnew2 = sm.fit_sample(X_train, y2_trnc)
X_trnew3, y_trnew3 = sm.fit_sample(X_train, y3_trnc)


#For all labels

best_score1 = 0
best_score2 = 0
best_score3 = 0
accuracy_score1 = list()
accuracy_score2 = list()
accuracy_score3 = list()

for log_c in range(-3,4):
    c = 10**log_c
    linclfr = svm.LinearSVC(C=c, penalty='l1', dual=False, multi_class='ovr')
    accuracy_score1.append(cross_val_score(linclfr, X_trnew1, y_trnew1, cv=10, scoring='accuracy').mean())
    accuracy_score2.append(cross_val_score(linclfr, X_trnew2, y_trnew2, cv=10, scoring='accuracy').mean())
    accuracy_score3.append(cross_val_score(linclfr, X_trnew3, y_trnew3, cv=10, scoring='accuracy').mean())
    accuracy1 = max(accuracy_score1)
    accuracy2 = max(accuracy_score2)
    accuracy3 = max(accuracy_score3)
    if accuracy1>best_score1:
       best_score1 = accuracy1
       best_newc1 = c
    if accuracy2>best_score2:
       best_score2 = accuracy2
       best_newc2 = c
    if accuracy3>best_score3:
       best_score3 = accuracy3
       best_newc3 = c
        
print('The weight of L1-penalized SVM for 1st classifier is:', best_newc1)
print('The weight of L1-penalized SVM for 2nd classifier is:', best_newc2)
print('The weight of L1-penalized SVM for 3rd classifier is:', best_newc3)

The weight of L1-penalized SVM for 1st classifier is: 10
The weight of L1-penalized SVM for 2nd classifier is: 1000
The weight of L1-penalized SVM for 3rd classifier is: 100


In [25]:
#Training for label1
linclfr1 = svm.LinearSVC(C=best_newc1, penalty='l1', dual=False, multi_class='ovr')
linclfr1.fit(X_trnew1, y_trnew1)
#Prediction for label1
y1pred = linclfr1.predict(X_test)

#Training for label2
linclfr2 = svm.LinearSVC(C=best_newc2, penalty='l1', dual=False, multi_class='ovr')
linclfr2.fit(X_trnew2, y_trnew2)
#Prediction for label2
y2pred = linclfr2.predict(X_test)

#Training for label3
linclfr3 = svm.LinearSVC(C=best_newc3, penalty='l1', dual=False, multi_class='ovr')
linclfr3.fit(X_trnew3, y_trnew3)
#Prediction for label3
y3pred = linclfr3.predict(X_test)

#Dataframe of predicted labels
y1pred = pd.DataFrame(y1pred, columns= [0])
y2pred = pd.DataFrame(y2pred, columns= [1])
y3pred = pd.DataFrame(y3pred, columns= [2])

ypred = pd.concat([y1pred, y2pred, y3pred], axis=1)

In [26]:
#Evaluating the classifier
#Hamming loss
hmloss = hamming_score(y_tstc, ypred)
print('The Hamming loss for Linear SVC is', hmloss)

#Exact match
exactscore = exact_match(y_tstc, ypred)
print('The exact match for Linear SVC is', exactscore)

The Hamming loss for Linear SVC is 0.11258641689396813
The exact match for Linear SVC is 0.8360352014821677


Conclusions of classifiers:

For SMV SVC:
The weight of SVM penalty for 1st classifier is 100
The weight of SVM penalty for 2nd classifier is 10
The weight of SVM penalty for 3rd classifier is 10

Gamma = 1/(2*(sigma^2)) where sigma is the width of the Gaussian kernel
Gamma of the Gaussian kernel for 1st classifier is 2   
Gamma of the Gaussian kernel for 2nd classifier is 1.388
Gamma of the Gaussian kernel for 3rd classifier is 1.388

For L1-penalized SVM:
The weight of L1-penalized SVM for 1st classifier is 100
The weight of L1-penalized SVM for 2nd classifier is 10
The weight of L1-penalized SVM for 3rd classifier is 10

For L1-penalized SVM with SMOTE:
The weight of L1-penalized SVM for 1st classifier is 10
The weight of L1-penalized SVM for 2nd classifier is 1000
The weight of L1-penalized SVM for 3rd classifier is 100

The exact match score for SVM SVC is better than L1-penalized SVM. After applying SMOTE technique the Exact match score of L1-penalized SVM gets reduced as expected. 
The Hamming Loss of SVM SVC is less than L1_penalized SVM and L1-penalized SVM with SMOTE technique has more Hamming Loss compared to L1-penalized SVM without SMOTE. 

b(v) Solving the multiclass multilabel classification problem using classifer chain method.

In [36]:
#Extra Practice 

# Classifier chain method on SVM SVC

#For all labels
best_score1 = 0
best_score2 = 0
best_score3 = 0
for log_c in range(-3,4):
    sigma = .1
    while sigma<=2:  
        g = 1/(2*(sigma**2))
        c = 10**log_c
        clfr = svm.SVC(C=c, decision_function_shape='ovr', gamma=g)
        accuracy1 = cross_val_score(clfr, X_train, y1_trnc, cv=10, scoring='accuracy').mean()
        accuracy2 = cross_val_score(clfr, X_traincc1, y2_trnc, cv=10, scoring='accuracy').mean()
        accuracy3 = cross_val_score(clfr, X_traincc2, y3_trnc, cv=10, scoring='accuracy').mean()
        sigma = sigma + .1
        if accuracy1 > best_score1:
            best_score1 = accuracy1
            best_c1 = c
            best_gamma1 = g
        if accuracy2 > best_score2:
            best_score2 = accuracy2
            best_c2 = c
            best_gamma2 = g
        if accuracy3 > best_score3:
            best_score3 = accuracy3
            best_c3 = c
            best_gamma3 = g
        
print('The weight of SVM penalty for 1st classifier:', best_c1)
print('The width of the Gaussian kernel for 1st classifier is:', best_gamma1)

print('The weight of SVM penalty for 2nd classifier is:', best_c2)
print('The width of the Gaussian kernel for 2nd classifier is:', best_gamma2)

print('The weight of SVM penalty for 3rd classifier is:', best_c3)
print('The width of the Gaussian kernel for 3rd classifier is:', best_gamma3)

The weight of SVM penalty for 1st classifier: 100
The width of the Gaussian kernel for 1st classifier is: 2.0
The weight of SVM penalty for 2nd classifier is: 10
The width of the Gaussian kernel for 2nd classifier is: 0.41322314049586784
The weight of SVM penalty for 3rd classifier is: 10
The width of the Gaussian kernel for 3rd classifier is: 0.3472222222222222


In [84]:
X_testcc = np.array(X_test)
#Training for label1
clfr = svm.SVC(C=best_c1, decision_function_shape='ovr', gamma=best_gamma1)
clfr.fit(X_train, y1_trnc)

#Prediction for label1
y1pred = clfr.predict(X_testcc)
y1pred = np.array([y1pred])
y1pred = np.transpose(y1pred)
#print(y1pred)

X_testcc1 = np.hstack((X_testcc, y1pred))
#Training for label2
clfr = svm.SVC(C=best_c2, decision_function_shape='ovr', gamma=best_gamma2)
clfr.fit(X_traincc1, y2_trnc)

#Prediction for label2
y2pred = clfr.predict(X_testcc1)
y2pred = np.array([y2pred])
y2pred = np.transpose(y2pred)

X_testcc2 = np.hstack((X_testcc1, y2pred))

#Training for label3
clfr = svm.SVC(C=best_c3, decision_function_shape='ovr', gamma=best_gamma3)
clfr.fit(X_traincc2, y3_trnc)

#Prediction for label3
y3pred = clfr.predict(X_testcc2)

#Dataframe of predicted labels
y1pred = pd.DataFrame(y1pred, columns= [0])
y2pred = pd.DataFrame(y2pred, columns= [1])
y3pred = pd.DataFrame(y3pred, columns= [2])

ypred = pd.concat([y1pred, y2pred, y3pred], axis=1)

In [85]:
#Evaluating the classifier

#Hamming loss
hmloss = hamming_score(y_tstc, ypred)
print('The Hamming loss for svm with gaussian kernel is', hmloss)

#Exact match
exactscore = exact_match(y_tstc, ypred)
print('The exact match for svm with gaussian kernel is', exactscore)

The Hamming loss for svm with gaussian kernel is 0.009177774346834092
The exact match for svm with gaussian kernel is 0.9916628068550255
