# CAP6610 Machine Learning 
## Project 2 

### Group 8
- Rachit Ranjan 
- Jun Deng
- Zhitao Liu
- Pu Fang 
- Ashutosh Garf

### Imports

In [1]:
import scipy.io as scio
import matplotlib.pyplot as plt
import numpy as np
import random
from warnings import filterwarnings
from datetime import datetime
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from skbayes.rvm_ard_models import RVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.externals import joblib
filterwarnings("ignore")

### Load Data

In [2]:
# Paste .mat files in the same directory and modify the names here 

# Location + Name of .mat file containing Training Data 
train_data_loc = 'Proj2FeatVecsSet1.mat' 
# Location + Name of .mat file containing Output Labels
output_data_loc = 'Proj2TargetOutputsSet1.mat'

train_data = scio.loadmat(train_data_loc)[train_data_loc.split('.')[0]]
output_labels = scio.loadmat(output_data_loc)[output_data_loc.split('.')[0]]

print(train_data.shape, output_labels.shape)

(25000, 60) (25000, 5)


### Data Preprocessing
- Encoding Output Labels to single value 
- Adding Unknown Class Data and Label

In [3]:
def encode_output_labels(output_labels=output_labels):
    "Encodes output labels to a single value"
    out_labels = []
    for row in output_labels:
        out_labels.append(np.where(row == 1)[0][0])
    return np.array(out_labels)

def generate_unknown_class_data(entries=10000):
    "Returns Data and Output Labels for Unknown Class(Nc+1)"
    unknown_class_data = []
    for i in range(entries):
        rand = []
        for j in range(train_data.shape[1]):
            rand.append(random.random())
        unknown_class_data.append(rand)
    unknown_class_label = np.full((10000), 5, dtype=int)
    return np.array(unknown_class_data), unknown_class_label

In [4]:
# Encode Output Lables to a single value
out_labels = encode_output_labels(output_labels=output_labels)

# Generate Data for Training Unknown Class Nc+1
unknown_class_data, unknown_class_labels = generate_unknown_class_data(entries=10000)

# Merge Actual and Generated Class Nc+1 Training Data and
train_data = np.concatenate((train_data, unknown_class_data))
out_labels = np.concatenate((out_labels, unknown_class_labels))

print(train_data.shape, out_labels.shape)

(35000, 60) (35000,)


### Classification Algorithms

In [5]:
# Default train and test sets 
x_train_def, x_test_def, y_train_def, y_test_def = train_test_split(train_data, out_labels,
                                                                   test_size=0.33,
                                                                   shuffle=True)
x_train_def = x_train_def[:500]
x_test_def = x_test_def[:500]
y_train_def = y_train_def[:500]
y_test_def = y_test_def[:500]




def svm_classify(x_train=x_train_def,y_train=y_train_def,
                 x_test=x_test_def, y_test=y_test_def,
                 kernel='rbf',gamma='auto', probability=True, decision_function_shape='ovr', degree=3,
                 tol=1e-3):
    """Trains a Support Vector Machine Classifer"""
    # Create Model with passed hyperparameters
    svc = SVC(kernel=kernel,
              gamma=gamma,
              degree=degree, # Ignored unless a polynomial kernel function is used
              probability=probability,
              decision_function_shape=decision_function_shape,
              tol=tol,
              random_state=42)
    
    # Train Model 
    svc.fit(x_train,y_train)
    
    # Return Trained Model and Accuracy on Test Data
    return svc, svc.score(x_test,y_test), 



def rvm_classify(x_train=x_train_def,y_train=y_train_def,
                 x_test=x_test_def, y_test=y_test_def,
                 kernel='rbf', degree=3,n_iter=100,tol=0.001):
    """Trains a Relevance Vector Machine Classifier"""
    
    rvm = RVC(kernel=kernel,
              degree=degree,
              n_iter=n_iter,
              tol=tol)
    
    # Train Model 
    rvm.fit(x_train,y_train)
    
    # Return Trained Model and Accuracy on Test Data
    return rvm, (rvm.predict(x_test) == y_test).sum()/100.0



def gp_classify(x_train=x_train_def,y_train=y_train_def,
                x_test=x_test_def, y_test=y_test_def,
                kernel=1.0 * RBF(1.0), optimizer='fmin_l_bfgs_b',
                n_restarts_optimizer=0, max_iter_predict=100,
                warm_start=True,multi_class='one_vs_one'):
    """Trains a Gaussian Process Classifier"""
    
    gpc = GaussianProcessClassifier(kernel=kernel,
                                   optimizer=optimizer,
                                   n_restarts_optimizer=n_restarts_optimizer,
                                   max_iter_predict=max_iter_predict,
                                   warm_start=warm_start,
                                   multi_class=multi_class,
                                   n_jobs=-1)
    
    # Train Model 
    gpc.fit(x_train, y_train)

    # Return Trained Model and Accuracy on Test Data
    return gpc, gpc.score(x_test,y_test)



## TrainMyClassifier 

In [6]:
def TrainMyClassifer(XEstimate, XValidate, Parameters):
    """Prints Output Labels and Cross Validated Learned Hyper Parameters on SVM, RVM and GPC """
    (SVMY_Pred,RVMY_Pred,GPY_Pred,cvsvm.best_params_,cvrvm.best_params_,cvgpc.best_params_,
     SVMEstConfMat,RVMEstConfMat,GPEstConfMat,SVMConfMatrix,RVMConfMatrix,GPConfMatrix) = MyCrossValidate(XEstimate, XValidate, 5)
    
    print ('SVM Prdicted Labels:\n%r\nRVM Predicted Labels:\n%r\nGPC Predicted Labels:\n%r\nSVM Best Params:\n%r\nRVM Best Params:\n%r\nGPC Best Params:\n%r\n'%(SVMY_Pred,RVMY_Pred,GPY_Pred,cvsvm.best_params_,cvrvm.best_params_,cvgpc.best_params_))

# Cross Validation 
### K Fold Separation

In [7]:
#Funtion K_Fold_Seperation(Xtrain,Ytrain,Nf)
#    Use K_Fold to seperate the training set into Nf parts, take Nf-1 parts as Estimation set
#    Take the remaining 1 as Validation set
#    Inputs:
#           Xtrain: Training Dataset Feature Vectors
#           Ytrain: Training Dataset Label Vectors
#           Nf:     Number of Folds for K_Fold Cross Validation
#    Outputs:
#           x_est_def: Estimation Sets Feature Vectors, Size = 5
#           x_val_def: Validation Sets Feature Vectors, Size = 5
#           y_est_def: Estimation Sets Label Vectors, Size = 5
#           y_val_def: Validation Sets Label Vectors, Size = 5
def K_Fold_Seperation(Xtrain=x_train_def,Ytrain=y_train_def,Nf=5):
    "Separate the Training Data into Nf Folds and Generate Estimation and Validation Sets"
    x_est_def = []
    x_val_def = []
    y_est_def = []
    y_val_def = []

    kf = KFold(n_splits=Nf)
    for train,test in kf.split(Xtrain):
        x_est_def.append(Xtrain[train])
        x_val_def.append(Xtrain[test])
        y_est_def.append(Ytrain[train])
        y_val_def.append(Ytrain[test])

    x_est_def = np.array(x_est_def)
    x_val_def = np.array(x_val_def)
    y_est_def = np.array(y_est_def)
    y_val_def = np.array(y_val_def)
    return x_est_def, x_val_def, y_est_def, y_val_def


# Apply K_Fold to Training Dataset Feature & Label Vectors
# Get Estimation Sets and Validation Sets
x_est_def, x_val_def, y_est_def, y_val_def = K_Fold_Seperation()
print(x_est_def.shape, x_val_def.shape, y_est_def.shape, y_val_def.shape)
print(x_test_def.shape, y_test_def.shape)



(5, 400, 60) (5, 100, 60) (5, 400) (5, 100)
(500, 60) (500,)


### Cross Validation Functions for all Classification Algorithms

In [8]:
#Funtion svm_cv,rvm_cv, and gp_cv(x_train, y_train, algorithm_parameters, Nf)
#    Use GridSearchCV and Cross Validation to Find the Best Fitted Model of Each Algorithm
#    The Returned Model Contains EstParameters
#    Inputs:
#           Xtrain: Training Dataset Feature Vectors
#           Ytrain: Training Dataset Label Vectors
#           algorithm_parameters: Candidate Parameters for Each Algorithm
#           Nf:     Number of Folds for K_Fold Cross Validation
#    Outputs:
#           cvsvm: Best Fitted Model of SVM
#           cvrvm: Best Fitted Model of RVM
#           cvgpc: Best Fitted Model of GPR

# Define parameter candidates to get the optimal SVM model
# svm_parameters = {'kernel':('linear', 'poly', 'rbf'), 'gamma':[0.125, 0.5, 1, 4], 'decision_function_shape':['ovo','ovr'],
#                  'tol':[1e-3, 5e-3, 1e-2]}
svm_parameters = {'gamma':[0.25, 1, 4], 'tol':[0.001,0.005]}
def svm_cv(x_train, y_train,
                 svm_parameters, Nf):
    """Trains a Support Vector Machine Classifer"""
    # Create Model with passed hyperparameters
    svc = SVC()
    cvsvm = GridSearchCV(svc, svm_parameters, cv=Nf)
    
    # Train Model 
    cvsvm.fit(x_train,y_train)
    
    # Return Trained Model and Accuracy on Test Data
    return cvsvm

# Define parameter candidates to get the optimal RVM model
# rvm_parameters = {'kernel':('linear', 'poly', 'rbf'), 'degree':[2, 3], 'n_iter':[100, 200], 'tol':[0.001,0.005]}
rvm_parameters = {'kernel':('linear', 'poly', 'rbf'), 'tol':[0.001,0.005]}
def rvm_cv(x_train,y_train,
                 rvm_parameters, Nf):
    """Trains a Relevance Vector Machine Classifier"""
    
    rvm = RVC()
    cvrvm = GridSearchCV(rvm, rvm_parameters, cv=Nf)
    
    # Train Model 
    cvrvm.fit(x_train,y_train)    
    
    # Return Trained Model and Accuracy on Test Data
    return cvrvm

# Define parameter candidates to get the optimal GPR model
# gp_parameters = {'n_restarts_optimizer':[0,1], 'max_iter_predict':[50, 100], 'warm_start':('True','False'),
#                'multi_class':('one_vs_rest','one_vs_one')}
gp_parameters = {'n_restarts_optimizer':[0,2], 'max_iter_predict':[50, 100]}
def gp_cv(x_train,y_train,
                gp_parameters, Nf):
    """Trains a Gaussian Process Classifier"""
    
    gpc = GaussianProcessClassifier()
    cvgpc = GridSearchCV(gpc, gp_parameters, cv=Nf)
    
    # Train Model 
    cvgpc.fit(x_train, y_train)

    # Return Trained Model and Accuracy on Test Data
    return cvgpc

## MyCrossValidate 

In [9]:
#Funtion MyCrossValidate(XTrain, YTrain, Nf)
#    Use cross validation to get the optimal parameters and hyper-parameters
#    Use the trained model to get confusion-matrix of each validation set
#    And the confusion-matrix for the whole training set
#    Return Ytrain, EstParameters, EstConfMatrices, and ConfMatrix
#    Inputs:
#           Xtrain: Training Dataset Feature Vectors
#           Ytrain: Training Dataset Label Vectors
#           Nf:     Number of Folds for K_Fold Cross Validation
#    Outputs:
#           SVMY_Pred,RVMY_Pred,GPY_Pred: Class labels for each fold, corresponds to Ytrain
#           cvsvm.best_params_,cvrvm.best_params_,cvgpc.best_params_: Best Parameters, corresponds to EstParameters
#           SVMEstConfMat,RVMEstConfMat,GPEstConfMat: Confusion Matrices for each fold, corresponds to EstConfMatrices
#           SVMConfMatrix,RVMConfMatrix,GPConfMatrix: Overall Confusion Matrix, corresponds to ConfMatrix
def MyCrossValidate(XTrain, YTrain, Nf):
    cvsvm = svm_cv(XTrain, YTrain, svm_parameters, Nf)
    cvrvm = rvm_cv(XTrain, YTrain, rvm_parameters, Nf)
    cvgpc = gp_cv(XTrain, YTrain, gp_parameters, Nf)
    
    SVMEstConfMat = []
    SVMY_Pred = []
    RVMEstConfMat = []
    RVMY_Pred = []
    GPEstConfMat = []
    GPY_Pred = []
    
    # Best Models for SVM and RVM acquired through Cross Validation
    bestsvm = cvsvm.best_estimator_
    bestrvm = cvrvm.best_estimator_
    
    for i in range(x_val_def.shape[0]):
        # For each validation sets, predict the labels and get its confusion matrix  
        svmtemp_pred = cvsvm.predict(x_val_def[i])
        svmtemp_conf = ConfusionMatrix(y_val_def[i], svmtemp_pred)[0]
        rvmtemp_pred = cvrvm.predict(x_val_def[i])
        rvmtemp_conf = ConfusionMatrix(y_val_def[i], rvmtemp_pred)[0]
        gptemp_pred = cvgpc.predict(x_val_def[i])
        gptemp_conf = ConfusionMatrix(y_val_def[i], gptemp_pred)[0]
        
        bestsvm.fit(x_est_def[i],y_est_def[i])
        bestsvm.predict(x_val_def[i])
        print('The Shape of Support Vectors Array of SVM model for Fold %d is:' % i )
        print(bestsvm.support_vectors_.shape)
        
        bestrvm.fit(x_est_def[i],y_est_def[i])
        bestrvm.predict(x_val_def[i])
        print('The Length of Relevance Vectors Array of RVM model for Fold %d is:' % i )
        print(len(bestrvm.relevant_vectors_))

        SVMEstConfMat.append(svmtemp_conf)
        SVMY_Pred.append(svmtemp_pred)
        RVMEstConfMat.append(rvmtemp_conf)
        RVMY_Pred.append(rvmtemp_pred)
        GPEstConfMat.append(gptemp_conf)
        GPY_Pred.append(gptemp_pred)
        
    SVMEstConfMat = np.array(SVMEstConfMat)
    SVMY_Pred = np.array(SVMY_Pred)
    RVMEstConfMat = np.array(RVMEstConfMat)
    RVMY_Pred = np.array(RVMY_Pred)
    GPEstConfMat = np.array(GPEstConfMat)
    GPY_Pred = np.array(GPY_Pred)
    
    YTrue = y_val_def.flatten()
    SVM_All_pred = SVMY_Pred.flatten()    
    SVMConfMatrix = ConfusionMatrix(YTrue, SVM_All_pred)[0]
    RVM_All_pred = RVMY_Pred.flatten()
    RVMConfMatrix = ConfusionMatrix(YTrue, RVM_All_pred)[0]
    GP_All_pred = GPY_Pred.flatten()
    GPConfMatrix = ConfusionMatrix(YTrue, GP_All_pred)[0]
    
    joblib.dump(cvsvm, "SVM.pkl")
    joblib.dump(cvrvm, "RVM.pkl")
    joblib.dump(cvgpc, "GPR.pkl")
    
    return SVMY_Pred,RVMY_Pred,GPY_Pred,cvsvm.best_params_,cvrvm.best_params_,cvgpc.best_params_,\
SVMEstConfMat,RVMEstConfMat,GPEstConfMat,SVMConfMatrix,RVMConfMatrix,GPConfMatrix



#Funtion Dis_CV_Info(SVM_best_params,RVM_best_params,GPR_best_params,
#SVMEstConfMat,RVMEstConfMat,GPEstConfMat,SVMConfMatrix,RVMConfMatrix,GPConfMatrix)
#    Display the info acquired through Cross Validation
#    Display EstParameters, EstConfMatrices, and ConfMatrix
#    Inputs:
#           SVM_best_params,RVM_best_params,GPR_best_params: EstParameters of Each Algorithm
#           SVMEstConfMat,RVMEstConfMat,GPEstConfMat: EstConfMatrices of Each Algorithm
#           SVMConfMatrix,RVMConfMatrix,GPConfMatrix: ConfMatrix of Each Algorithm
#    Outputs:
#           None
def Dis_CV_Info(SVM_best_params,RVM_best_params,GPR_best_params,
SVMEstConfMat,RVMEstConfMat,GPEstConfMat,SVMConfMatrix,RVMConfMatrix,GPConfMatrix):
    print('The parameters of the best SVM model are: ')
    print(SVM_best_params)
    print('The parameters of the best RVM model are: ')
    print(RVM_best_params)
    print('The parameters of the best GPR model are: ')
    print(GPR_best_params)
    
    for i in range(x_val_def.shape[0]):
        print('The Confusion Matrix For Fold %d is: ' % i)
        print('SVM:')
        print(SVMEstConfMat[i])
        print('RVM:')
        print(RVMEstConfMat[i])
        print('GPR:')
        print(GPEstConfMat[i])

    print('The Overall Confusion Matrix is: ')
    print('SVM:')
    print(SVMConfMatrix)
    print('RVM:')
    print(RVMConfMatrix)
    print('GPR:')
    print(GPConfMatrix)
    return 1


## TestMyClassifier

In [10]:
def TestMyClassifier(x_test, algorithm = 'SVM', model = None):
    """Load a Classifer and Return Predictions"""
    if model == None:
        if algorithm == 'SVM':
            model = joblib.load('SVM.pkl')
        elif algorithm == 'RVM':
            model = joblib.load('RVM.pkl')
        else:
            model = joblib.load('GPR.pkl')
    return model.predict(x_test)

## ConfusionMatrix

In [11]:
def ConfusionMatrix(Y, ClassNames): 
    """Prints out a Confusion Matrix between true and predicted labels
         Y             Actual Labels 
         ClassNames    Predicted Labels""" 
    con_mat=confusion_matrix(y_true=Y,y_pred=ClassNames)
    overall_accuracy=accuracy_score(y_true=Y,y_pred=ClassNames)
    acc_each_class=precision_score(y_true=Y,y_pred=ClassNames,average=None)
    average_accuracy=np.mean(acc_each_class)
    
    print("Confusion Matrix\n %r" % con_mat)
    print("Average Overall Accuracy%.5f\n" % average_accuracy)
    
    return con_mat, average_accuracy

### CV Test 

In [13]:
# SVM_Y,RVM_Y,GPR_Y corresponds to Ytrain
# SVM_best_params,RVM_best_params,GPR_best_params corresponds to EstParameters
# SVMEstConfMat,RVMEstConfMat,GPEstConfMat corresponds to EstConfMatrices
# SVMConfMatrix,RVMConfMatrix,GPConfMatrix corresponds to ConfMatrix
# Calculate and display the time taken for cross validation
start = datetime.now()
SVM_Y,RVM_Y,GPR_Y,SVM_best_params,RVM_best_params,GPR_best_params,\
SVMEstConfMat,RVMEstConfMat,GPEstConfMat,SVMConfMatrix,RVMConfMatrix,GPConfMatrix\
= MyCrossValidate(x_train_def, y_train_def, 5)

Dis_CV_Info(SVM_best_params,RVM_best_params,GPR_best_params,
SVMEstConfMat,RVMEstConfMat,GPEstConfMat,SVMConfMatrix,RVMConfMatrix,GPConfMatrix)
end = datetime.now()
print('Time for Cross Validation: ', end=' ')
print (end-start)

Confusion Matrix
 array([[17,  0,  0,  0,  0,  0],
       [ 0, 14,  0,  1,  0,  0],
       [ 0,  0, 15,  1,  0,  0],
       [ 1,  0,  0,  9,  0,  0],
       [ 0,  0,  1,  0,  9,  0],
       [ 0,  0,  0,  0,  0, 32]])
Average Overall Accuracy0.95002

Confusion Matrix
 array([[15,  0,  1,  0,  1,  0],
       [ 0, 10,  0,  5,  0,  0],
       [ 5,  0,  9,  2,  0,  0],
       [ 1,  3,  0,  6,  0,  0],
       [ 1,  0,  1,  2,  6,  0],
       [ 0,  0,  0,  0,  0, 32]])
Average Overall Accuracy0.75440

Confusion Matrix
 array([[17,  0,  0,  0,  0,  0],
       [ 0, 15,  0,  0,  0,  0],
       [ 1,  0, 14,  1,  0,  0],
       [ 1,  1,  0,  8,  0,  0],
       [ 0,  0,  1,  0,  9,  0],
       [ 0,  0,  0,  0,  0, 32]])
Average Overall Accuracy0.94241

The Shape of Support Vectors Array of SVM model for Fold 0 is:
(311, 60)
The Length of Relevance Vectors Array of RVM model for Fold 0 is:
6
Confusion Matrix
 array([[12,  0,  0,  2,  0,  0],
       [ 0, 16,  0,  1,  0,  0],
       [ 0,  0,  8,  0,  

## Test test

In [17]:
tmp_labels = TestMyClassifier(x_test_def, 'GPR')
print(np.sum(tmp_labels==y_test_def))

461
