# Multi-class and Multi-Label Classification Using Support Vector Machines

### Sitao Min

### (a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29#. Choose 70% of the data randomly as the training set.

In [6]:
data2 = pd.read_csv('./Frogs_MFCCs.csv')
data2.shape

(7195, 26)

In [7]:
train_data = resample(data2, n_samples = int(data2.shape[0]*0.7) ,replace = False, random_state = 42)
test_data =  data2.drop(train_data.index.values, axis = 0)

print(train_data.shape)
print(test_data.shape)

(5036, 26)
(2159, 26)


In [8]:
train_data.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
3340,1.0,0.147506,0.014843,0.433466,0.169581,0.053135,-0.15202,-0.102427,0.153061,0.128628,...,0.285756,0.166246,-0.061206,-0.191878,0.003221,0.255371,Leptodactylidae,Adenomera,AdenomeraHylaedactylus,21
5108,1.0,0.674174,0.76752,0.392177,0.035669,0.181548,-0.013677,0.187262,0.144679,-0.424524,...,-0.157943,-0.048913,-0.016129,0.1292,0.011245,-0.275909,Hylidae,Hypsiboas,HypsiboasCinerascens,36
6524,1.0,-0.007777,0.363276,0.413224,0.196296,0.222389,0.097128,-0.155608,-0.035013,0.133689,...,0.125169,0.044882,-0.013309,-0.026086,-0.088825,-0.018968,Hylidae,Hypsiboas,HypsiboasCordobae,43
3649,1.0,0.429359,0.297881,0.60912,0.263991,-0.030414,-0.160778,0.076217,0.285909,0.052029,...,0.23033,0.069015,-0.101196,-0.152351,0.039065,0.233823,Leptodactylidae,Adenomera,AdenomeraHylaedactylus,22
1617,1.0,0.190599,0.062234,0.617262,0.208825,0.027108,-0.214441,0.009654,0.299053,0.039013,...,0.321598,0.128742,-0.152977,-0.223482,0.016131,0.250754,Leptodactylidae,Adenomera,AdenomeraHylaedactylus,15


In [15]:
X_train = train_data.iloc[:,0:-4]
X_test = test_data.iloc[:,0:-4]
y_train = train_data.iloc[:,-4:-1]
y_test = test_data.iloc[:,-4:-1]

### (b) Train a classifier for each label.


### i. Exact match and hamming score/ loss methods for evaluating multilabel classification

**Hamming loss**: the fraction of the wrong labels to the total number of labels,the optimal value is zero.

**Exact match (also called Subset accuracy)**: is the most strict metric, indicating the percentage of samples that have all their labels classified correctly.

**Usage**: we can use `sklearn.metrics.hamming_loss` and `sklearn.metrics.accuracy_score`

### ii. Train a SVM for each of the labels

In [16]:
def find_param_bound(X_train,y_train,gamma_bound1,gamma_bound2,C_bound1,C_bound2,threshold = 0.9, verbose = False):
    
    if(C_bound1 > C_bound2):
        C_bound = C_bound2+1
    else:
        C_bound = C_bound2-1
    
    gamma_bound = gamma_bound2
    
    for gamma in np.linspace(gamma_bound1,gamma_bound2,5,endpoint = True):
        for C_quant in np.linspace(C_bound1,C_bound2,abs(C_bound1-C_bound2)+1,endpoint = True):
            C = 10**C_quant
            svm = SVC(kernel="rbf",random_state = 0, C = C, gamma = gamma)
            svm.fit(X_train,y_train)
            y_pred = svm.predict(X_train)
            accuracy = accuracy_score(y_train,y_pred)
            
            if(accuracy > threshold):
                if(verbose == True):
                    print('find C')
                C_bound = C_quant
                gamma_bound = gamma
                return gamma_bound,C_bound
        
    return gamma_bound,C_bound

In [26]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

def handling_svm(X_train, y_train,X_test, C_low, C_upper, gamma_low, gamma_upper):
     
    # Create my estimator and prepare the parameter grid dictionary
    params_dict = {"C": np.logspace(C_low, C_upper, 5), "gamma": np.linspace(gamma_low,gamma_upper, 5)}
    svm = SVC(kernel="rbf",random_state = 0)

    # Fit the grid search
    search = GridSearchCV(estimator=svm, param_grid=params_dict, cv = 10,refit = True,scoring = 'accuracy')
    search.fit(X_train, y_train)
    svm = search.best_estimator_

    svm.fit(X_train,y_train)
    print('best score:', search.best_score_)
    print('best params:',search.best_params_ )
    y_pred = svm.predict(X_test)
    
    return y_pred, svm

In [27]:
results = []
y_pred = pd.DataFrame(np.zeros((y_test.shape[0],y_test.shape[1])))
for i in range(0,3):
    start = time.time()
    # parameters space
    gamma_low1,C_low1 = find_param_bound(X_train,y_train.iloc[:,i],0.001,2,-3,1)
    gamma_upper1,C_low2 = find_param_bound(X_train,y_train.iloc[:,i],12,6,-3,1)
    gamma_low2,C_upper1 = find_param_bound(X_train,y_train.iloc[:,i],0.001,2,3,0)
    gamma_upper2,C_upper2 = find_param_bound(X_train,y_train.iloc[:,i],12,6,3,0)
    
    C_low = min(C_low1,C_low2)
    C_upper = max(C_upper1,C_upper2)
    gamma_low = min(gamma_low1,gamma_low2)
    gamma_upper = max(gamma_upper1,gamma_upper2)
    
    print(gamma_low,gamma_upper,C_low,C_upper)

    y_pred_temp,svm = handling_svm(X_train,y_train.iloc[:,i],X_test,C_low,C_upper,gamma_low,gamma_upper)
    results.append(svm)
    y_pred.iloc[:,i] = y_pred_temp
    print('finished round %d, time %f s'%(i, time.time()-start))

0.001 12.0 -1.0 3.0
best score: 0.9928514694201748
best params: {'C': 10.0, 'gamma': 3.00075}
finished round 0, time 160.310310 s
0.001 12.0 -1.0 3.0
best score: 0.9912629070691025
best params: {'C': 10.0, 'gamma': 3.00075}
finished round 1, time 200.057313 s
0.001 12.0 -1.0 3.0
best score: 0.9906671961874504
best params: {'C': 10.0, 'gamma': 3.00075}
finished round 2, time 203.916199 s


In [46]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score

print('result:')
#print('Exact match accuracy:',accuracy_score(y_test,y_pred))
hamming_loss = np.sum(np.not_equal(y_test.values, y_pred.values))/float(y_test.values.size)
print('hamming loss:',hamming_loss)
exact_match = np.sum(np.all(np.not_equal(y_test.values, y_pred.values),axis = 1))/float(y_test.shape[0])
print('Exact match error:',exact_match)

result:
hamming loss: 0.009572332870155936
Exact match error: 0.00555812876331635


### iii.  Train L1-penalized SVMs.

In [52]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.fit_transform(X_test)
y_train_std = y_train
y_test_std = y_test

In [65]:
def find_param_bound_l1(X_train,y_train,C_bound1,C_bound2,threshold = 0.8, verbose = False):
    
    if(C_bound1 > C_bound2):
        C_bound = C_bound2+1
    else:
        C_bound = C_bound2-1
    
    
    for C_quant in np.linspace(C_bound1,C_bound2,abs(C_bound1-C_bound2)+1,endpoint = True):
        C = 10**C_quant
        svm = LinearSVC(random_state = 0, C = C, penalty = 'l1',dual = False)
        svm.fit(X_train,y_train)
        y_pred = svm.predict(X_train)
        accuracy = accuracy_score(y_train,y_pred)
            
        if(accuracy > threshold):
            if(verbose == True):
                print('find C')
            C_bound = C_quant
            return C_bound
        
    return C_bound

In [299]:
def handling_svm_l1(X_train, y_train,X_test, C_low, C_upper):
     
    # Create my estimator and prepare the parameter grid dictionary
    params_dict = {"C": np.logspace(C_low,C_upper,10)}
    
    svm = LinearSVC(random_state = 0,penalty = 'l1',dual = False)

    # Fit the grid search
    search = GridSearchCV(estimator=svm, param_grid=params_dict, cv = 10,refit = True,scoring = 'accuracy')
    search.fit(X_train, y_train)
    svm = search.best_estimator_

    svm.fit(X_train,y_train)
    print('best score:', search.best_score_)
    print('best params:',search.best_params_ )
    y_pred = svm.predict(X_test)
    
    return y_pred, svm

In [70]:
results_l1 = []
y_pred_std = pd.DataFrame(np.zeros((y_test_std.shape[0],y_test_std.shape[1])))
for i in range(0,3):
    start = time.time()
    # parameters space
    C_low = find_param_bound_l1(X_train_std,y_train_std.iloc[:,i],-3,1)
    C_upper = find_param_bound_l1(X_train_std,y_train_std.iloc[:,i],3,0)
    
    print(C_low,C_upper)
    
    y_pred_std.iloc[:,i],svm = handling_svm_l1(X_train_std,y_train_std.iloc[:,i],X_test_std,C_low,C_upper)
    results_l1.append(svm)
    print('finished round %d, time %f s'%(i, time.time()-start))

-3.0 3.0
best score: 0.9330818109610802
best params: {'C': 10.0}
finished round 0, time 167.026087 s
-3.0 3.0
best score: 0.9507545671167593
best params: {'C': 2.154434690031882}
finished round 1, time 265.972917 s
-2.0 3.0
best score: 0.9573073868149324
best params: {'C': 21.544346900318846}
finished round 2, time 326.772975 s


In [71]:
print('result l1-lineer kernel:')
#print('Exact match accuracy:',accuracy_score(y_test,y_pred))
hamming_loss_l1 = np.sum(np.not_equal(y_test_std.values, y_pred_std.values))/float(y_test_std.values.size)
print('hamming loss:',hamming_loss_l1)
exact_match_l1 = np.sum(np.all(np.not_equal(y_test_std.values, y_pred_std.values),axis = 1))/float(y_test_std.shape[0])
print('Exact match error:',exact_match_l1)

result l1-lineer kernel:
hamming loss: 0.04878801914466574
Exact match error: 0.020842982862436313


### iv. Remedy class imbalance.

In [302]:
def find_param_bound_res(X_train,y_train,C_bound1,C_bound2,threshold = 0.8, verbose = False):
    
    if(C_bound1 > C_bound2):
        C_bound = C_bound2+1
    else:
        C_bound = C_bound2-1
    
    
    for C_quant in np.linspace(C_bound1,C_bound2,abs(C_bound1-C_bound2)+1,endpoint = True):
        C = 10**C_quant
        svm = LinearSVC(random_state = 0, C = C, penalty = 'l1',dual = False,class_weight= 'balanced')
        svm.fit(X_train,y_train)
        y_pred = svm.predict(X_train)
        accuracy = accuracy_score(y_train,y_pred)
            
        if(accuracy > threshold):
            if(verbose == True):
                print('find C')
            C_bound = C_quant
            return C_bound
        
    return C_bound

In [303]:
def handling_svm_res(X_train, y_train,X_test, C_low, C_upper):
     
    # Create my estimator and prepare the parameter grid dictionary
    params_dict = {"C": np.logspace(C_low,C_upper,5)}
    
    svm = LinearSVC(random_state = 0,penalty = 'l1',dual = False,class_weight ='balanced')

    # Fit the grid search
    search = GridSearchCV(estimator=svm, param_grid=params_dict, cv = 10,refit = True,scoring = 'accuracy')
    search.fit(X_train, y_train)
    svm = search.best_estimator_

    svm.fit(X_train,y_train)
    print('best score:', search.best_score_)
    print('best params:',search.best_params_ )
    y_pred = svm.predict(X_test)
    
    return y_pred, svm

In [None]:
results_res = []
y_pred_res = pd.DataFrame(np.zeros((y_test_std.shape[0],y_test_std.shape[1])))
for i in range(0,3):
    start = time.time()
    # parameters space
    X_res_train, y_res_train = X_train_std, y_train_std.iloc[:,i]
    X_res_test, y_res_test = X_test_std, y_test_std.iloc[:,i]
    C_low = find_param_bound_res(X_res_train,y_res_train,-3,1)
    C_upper = find_param_bound_res(X_res_train,y_res_train,3,0)
    
    print(C_low,C_upper)
    
    y_pred_res_item,svm = handling_svm_res(X_res_train,y_res_train,X_res_test,C_low,C_upper)
    results_res.append(svm)
    y_pred_res.iloc[:,i] = y_pred_res_item
    print('finished round %d, time %f s'%(i, time.time()-start))

In [None]:
print('result resample:')
#print('Exact match accuracy:',accuracy_score(y_test,y_pred))
hamming_loss_res = np.sum(np.not_equal(y_pred_res.values, y_std_res.values))/float(y_pred_res.values.size)
print('hamming loss res:',hamming_loss_res)
exact_match_smote = np.sum(np.all(np.not_equal(y_pred_res.values, y_std_res.values),axis = 1))/float(y_pred_res.shape[0])
print('Exact match error res:',exact_match_res)