In [20]:
# import libraries
import pandas as pd
import numpy as np
import scipy
from collections import OrderedDict
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

### We will work on Anuran Calls (MFCCs) Data Set which can be found here: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs).

In [21]:
dataset = pd.read_csv('../input/Frogs_MFCCs.csv')
dataset = dataset.drop(['RecordID'], axis = 1)
cols_to_transform = ['Family','Genus', 'Species']
Labels = dataset[cols_to_transform].astype('category')
Labels = Labels[cols_to_transform].apply(lambda x: x.cat.codes)
Features = dataset.iloc[:,0:22]

#split data into training set and test set with training set being 70% of the data
x_train, x_test, y_train, y_test = train_test_split(Features, Labels, train_size = 0.7, test_size = 0.3)
y_train1 = y_train['Family']
y_train2 = y_train['Genus']
y_train3 = y_train['Species']

In [22]:
dataset.groupby('Family',axis=0).count()

Unnamed: 0_level_0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,MFCCs_11,MFCCs_12,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Genus,Species
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Bufonidae,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68
Dendrobatidae,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542
Hylidae,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165,2165
Leptodactylidae,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420,4420


In [23]:
#initialize hyper parameters
C_range = np.logspace(-3, 6, 10) 
gamma_range = np.arange(0.1, 2.1, 0.1) 
param_grid = dict(gamma=gamma_range, C=C_range)

#perform cross validation on hyper parameters for each label using gaussian kernel svm 
gaussian_grid1 = RandomizedSearchCV(SVC(), param_distributions =param_grid, cv=10)
gaussian_grid2 = RandomizedSearchCV(SVC(), param_distributions =param_grid, cv=10)
gaussian_grid3 = RandomizedSearchCV(SVC(), param_distributions =param_grid, cv=10)
gaussian_grid1.fit(x_train, y_train1)
gaussian_grid2.fit(x_train, y_train2)
gaussian_grid3.fit(x_train, y_train3)

print("The best parameters for 1st classifier with gaussian kernel svm are %s with accuracy score of %0.2f"
      % (gaussian_grid1.best_params_, gaussian_grid1.best_score_))

print("The best parameters for 2nd classifier with gaussian kernel svm are %s with accuracy score of %0.2f"
      % (gaussian_grid2.best_params_, gaussian_grid2.best_score_))

print("The best parameters for 3rd classifier with gaussian kernel svm are %s with accuracy score of %0.2f"
      % (gaussian_grid3.best_params_, gaussian_grid3.best_score_))

The best parameters for 1st classifier with gaussian kernel svm are {'gamma': 1.3000000000000003, 'C': 1000.0} with accuracy score of 0.99
The best parameters for 2nd classifier with gaussian kernel svm are {'gamma': 1.8000000000000003, 'C': 1000000.0} with accuracy score of 0.99
The best parameters for 3rd classifier with gaussian kernel svm are {'gamma': 1.6, 'C': 10000.0} with accuracy score of 0.99


In [24]:
#using best parameter values found above perform perdiction on each of the 3 labels
clf = SVC(C=gaussian_grid1.best_params_['C'], gamma=gaussian_grid1.best_params_['gamma'])
clf.fit(x_train, y_train1)
y_pred1 = clf.predict(x_test)

clf = SVC(C=gaussian_grid2.best_params_['C'], gamma=gaussian_grid2.best_params_['gamma'])
clf.fit(x_train, y_train2)
y_pred2 = clf.predict(x_test)

clf = SVC(C=gaussian_grid3.best_params_['C'], gamma=gaussian_grid3.best_params_['gamma'])
clf.fit(x_train, y_train3)
y_pred3 = clf.predict(x_test)

data = OrderedDict([('Family', y_pred1), ('Genus', y_pred2), ('Species',
                                                              y_pred3)])
gaussian_y_pred = pd.DataFrame.from_dict(data)

#### We will use Exact match and hamming score/ loss methods for evaluating multi-label classification

In [25]:
def hamming_loss(ytest, ypred):
    """
    function to calculate hamming score for given test and predicted values
    """
    hamm_loss = 0
    for i in range(0, len(ytest)):
        losscore = 0
        for j in range(0, 3):
            if ytest.iloc[i, j] != ypred.iloc[i, j]:
                losscore += 1
            losscore = losscore / 3
            hamm_loss += losscore
    hamm_loss /=  (len(ytest))
    return (hamm_loss)

def exact_match(ytest, ypred):
    """
    function to calculate exact match for given test and predicted values
    """
    exactmatch = 0
    for i in range(len(ypred)):
        if (ytest.iloc[i, 0] - ypred.iloc[i, 0] == 0
                and ytest.iloc[i, 1] - ypred.iloc[i, 1] == 0
                and ytest.iloc[i, 2] - ypred.iloc[i, 2] == 0):
            exactmatch += 1
    exactmatch /=(len(ypred))
    return (exactmatch)

In [26]:
#Evaluating the classifier with gaussian kernel
gaussiansvm_hamm_loss = hamming_loss(y_test, gaussian_y_pred)
print('The Hamming loss for gaussian kernel svm: ', gaussiansvm_hamm_loss)

gaussiansvm_exact_score = exact_match(y_test, gaussian_y_pred)
print('The exact match for gaussian kernel svm: ', gaussiansvm_exact_score)

The Hamming loss for gaussian kernel svm:  0.011064793371416798
The exact match for gaussian kernel svm:  0.9870310328855951


#### Let's try above with L1-penalized SVMs

In [27]:
#initialize hyper parameters
C_range = np.logspace(-3, 6, 10) 
param_grid = dict(C=C_range)

#perform cross validation on hyper parameters for each label using L1 penalty svm 
clf = LinearSVC(penalty='l1', dual=False)
linear_grid1 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)
linear_grid2 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)
linear_grid3 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)
linear_grid1.fit(x_train, y_train1)
linear_grid2.fit(x_train, y_train2)
linear_grid3.fit(x_train, y_train3)

print("The best parameters for 1st classifier with L1 penalty svm are %s with accuracy score of %0.2f"
      % (linear_grid1.best_params_, linear_grid1.best_score_))

print("The best parameters for 2nd classifier with L1 penalty svm are %s with accuracy score of %0.2f"
      % (linear_grid2.best_params_, linear_grid2.best_score_))

print("The best parameters for 3rd classifier with L1 penalty svm are %s with accuracy score of %0.2f"
      % (linear_grid3.best_params_, linear_grid3.best_score_))

The best parameters for 1st classifier with L1 penalty svm are {'C': 10.0} with accuracy score of 0.93
The best parameters for 2nd classifier with L1 penalty svm are {'C': 1000000.0} with accuracy score of 0.95
The best parameters for 3rd classifier with L1 penalty svm are {'C': 100.0} with accuracy score of 0.96


In [29]:
#using best parameter values found above perform perdiction on each of the 3 labels
linear_clf = LinearSVC(
    penalty='l1',
    dual=False,
    C=linear_grid3.best_params_['C'],
)
linear_clf.fit(x_train, y_train1)
y_pred1 = linear_clf.predict(x_test)

linear_clf = LinearSVC(penalty='l1', dual=False, C=linear_grid2.best_params_['C'])
linear_clf.fit(x_train, y_train2)
y_pred2 = linear_clf.predict(x_test)

linear_clf = LinearSVC(penalty='l1', dual=False, C=linear_grid3.best_params_['C'])
linear_clf.fit(x_train, y_train3)
y_pred3 = linear_clf.predict(x_test)

data = OrderedDict([('Family', y_pred1), ('Genus', y_pred2), ('Species',
                                                              y_pred3)])
linear_y_pred = pd.DataFrame.from_dict(data)


#Hamming loss
linearsvm_hamm_loss = hamming_loss(y_test, linear_y_pred)
print('The Hamming loss for svm with L1 penalty: ', linearsvm_hamm_loss)

#Exact match
linearsvm_exact_score = exact_match(y_test, linear_y_pred)
print('The exact match for svm with L1 penalty: ', linearsvm_exact_score)

The Hamming loss for svm with L1 penalty:  0.06450174120391818
The exact match for svm with L1 penalty:  0.920796665122742


#### As seen from the dataset, there seems to be class imbalance. Let's try to remedy that using SMOTE

In [30]:
#perform SMOTE
svm_smote = SMOTE()
x_sampled_train1, y_sampled_train1 = svm_smote.fit_sample(x_train, y_train1)
x_sampled_train2, y_sampled_train2 = svm_smote.fit_sample(x_train, y_train2)
x_sampled_train3, y_sampled_train3 = svm_smote.fit_sample(x_train, y_train3)

#initialize hyper parameters
C_range = np.logspace(-3, 6, 10) 
param_grid = dict(C=C_range)

#perform cross validation on hyper parameters for each label using L1 penalty svm 
clf = LinearSVC(penalty='l1', dual=False)
smote_grid1 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)
smote_grid2 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)
smote_grid3 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)
smote_grid1.fit(x_sampled_train1, y_sampled_train1)
smote_grid2.fit(x_sampled_train2, y_sampled_train2)
smote_grid3.fit(x_sampled_train3, y_sampled_train3)

print("The best parameters for 1st classifier with SMOTE L1 penalty svm are %s with accuracy score of %0.2f"
      % (smote_grid1.best_params_, smote_grid1.best_score_))

print("The best parameters for 2nd classifier with SMOTE penalty svm are %s with accuracy score of %0.2f"
      % (smote_grid2.best_params_, smote_grid2.best_score_))

print("The best parameters for 3rd classifier with SMOTE penalty svm are %s with accuracy score of %0.2f"
      % (smote_grid3.best_params_, smote_grid3.best_score_))

The best parameters for 1st classifier with SM OTE L1 penalty svm are {'C': 100.0} with accuracy score of 0.95
The best parameters for 2nd classifier with SMOTE penalty svm are {'C': 10.0} with accuracy score of 0.96
The best parameters for 3rd classifier with SMOTE penalty svm are {'C': 1000.0} with accuracy score of 0.96


In [31]:
#using best parameter values found above perform perdiction on each of the 3 labels

smote_clf = LinearSVC(penalty='l1', dual=False, C=smote_grid1.best_params_['C'])
smote_clf.fit(x_sampled_train1, y_sampled_train1)
y_pred1 = smote_clf.predict(x_test)

smote_clf = LinearSVC(penalty='l1', dual=False, C=smote_grid2.best_params_['C'])
smote_clf.fit(x_sampled_train2, y_sampled_train2)
y_pred2 = smote_clf.predict(x_test)

smote_clf = LinearSVC(penalty='l1', dual=False, C=smote_grid3.best_params_['C'])
smote_clf.fit(x_sampled_train3, y_sampled_train3)
y_pred3 = smote_clf.predict(x_test)

data = OrderedDict([('Family', y_pred1), ('Genus', y_pred2), ('Species',
                                                              y_pred3)])
smote_y_pred = pd.DataFrame.from_dict(data)

#Hamming loss
smote_hamm_loss = hamming_loss(y_test, smote_y_pred)
print('The Hamming loss for svm with smote: ', smote_hamm_loss)

#Exact match
smote_exact_score = exact_match(y_test, smote_y_pred)
print('The exact match for svm with smote: ',smote_exact_score)

The Hamming loss for svm with smote:  0.0882095620400399
The exact match for svm with smote:  0.8666049096804076


We see that, SVM with gaussian kernel has better exact match or low hamming loss compared to L1 penalized SVM. However on applying SMOTE the exact match reduces or hamming loss increases for L1 penalized SVM

#### Let's now try to solve this multi-label classification using Classifier Chain method (For more info - https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/)

In [32]:
x_train_cchain1 = x_train
x_train_cchain2 = pd.concat([x_train, y_train1], axis = 1)
x_train_cchain3 = pd.concat([x_train, y_train1,y_train2], axis = 1)

#initialize hyper parameters
C_range = np.logspace(-3, 6, 10) 
param_grid = dict(C=C_range)

#perform cross validation on hyper parameters for each label using L1 penalty svm 
clf = LinearSVC(penalty='l1', dual=False)
classifer_grid1 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)
classifer_grid2 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)
classifer_grid3 = RandomizedSearchCV(clf, param_distributions =param_grid, cv=10)

classifer_grid1.fit(x_train_cchain1, y_train1)
classifer_grid2.fit(x_train_cchain2, y_train2)
classifer_grid3.fit(x_train_cchain3, y_train3)

print(
    "The best parameter for 1st classifier using classifier chain with L1 Penalty SVM is C: %s with accuracy score of %0.2f"
    % (classifer_grid1.best_params_, classifer_grid1.best_score_))

print(
    "The best parameter for 2nd classifier using classifier chain  with L1 Penalty SVM is C: %s with accuracy score of %0.2f"
    % (classifer_grid2.best_params_, classifer_grid2.best_score_))

print(
    "The best parameter for 3rd classifier using classifier chain  with L1 Penalty SVM is C: %s with accuracy score of %0.2f"
    % (classifer_grid3.best_params_, classifer_grid3.best_score_))

The best parameter for 1st classifier using classifier chain with L1 Penalty SVM is C: {'C': 10.0} with accuracy score of 0.94
The best parameter for 2nd classifier using classifier chain  with L1 Penalty SVM is C: {'C': 10.0} with accuracy score of 0.98
The best parameter for 3rd classifier using classifier chain  with L1 Penalty SVM is C: {'C': 1000.0} with accuracy score of 0.99


In [33]:
y_test1 = y_test['Family']
y_test2 = y_test['Genus']
y_test3 = y_test['Species']

#creating feature set for classifier chain
x_test_cchain1 = x_test
x_test_cchain2 = pd.concat([x_test, y_test1], axis = 1, ignore_index=False)
x_test_cchain3 = pd.concat([x_test, y_test1,y_test2], axis = 1, ignore_index=False)

cchain_clf = LinearSVC(
    penalty='l1',
    dual=False,
    C=classifer_grid1.best_params_['C'],
)
cchain_clf.fit(x_train_cchain1, y_train1)
y_pred1 = cchain_clf.predict(x_test_cchain1)

cchain_clf = LinearSVC(penalty='l1', dual=False, C=classifer_grid2.best_params_['C'])
cchain_clf.fit(x_train_cchain2, y_train2)
y_pred2 = cchain_clf.predict(x_test_cchain2)

cchain_clf = LinearSVC(penalty='l1', dual=False, C=classifer_grid3.best_params_['C'])
cchain_clf.fit(x_train_cchain3, y_train3)
y_pred3 = cchain_clf.predict(x_test_cchain3)

data = OrderedDict([('Family', y_pred1), ('Genus', y_pred2), ('Species',
                                                              y_pred3)])
cchain_y_pred = pd.DataFrame.from_dict(data)


#Hamming loss
cchain_hamm_loss = hamming_loss(y_test, cchain_y_pred)
print('The Hamming loss for classifier chain with L1 penalty SVM: ', cchain_hamm_loss)

#Exact match
cchain_exact_score = exact_match(y_test, cchain_y_pred)
print('The exact match for classifier chain with L1 penalty SVM: ', cchain_exact_score)

The Hamming loss for classifier chain with L1 penalty SVM:  0.040656682620554864
The exact match for classifier chain with L1 penalty SVM:  0.9286706808707735
