In [1]:
# Ask the notebook front end to autosave every 60 seconds.
%autosave 60
# Echo the docstring of the current (interactive) module.
print(__doc__)

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.metrics import *
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

Autosaving every 60 seconds
Automatically created module for IPython interactive environment


In [2]:
# Where the pickled dataset lives and what it is called.
DATA_DIR = "E:/Wasif/PDMotorFeatureExtraction/"
dataset_file = "tas2_severity_dataset.pkl"

# Fractions of the data assigned to the train, test and dev partitions
# (in that order; the dev partition receives whatever rows remain).
TRAIN_TEST_DEV_SPLIT = [0.75, 0.15, 0.10]
#TRAIN_TEST_SPLIT = [0.80, 0.20]

# Seed NumPy's global generator so the random partitioning is repeatable.
SEED = 1234
np.random.seed(SEED)

In [3]:
def load_dataset():
    """Load the pickled dataset and turn it into a feature matrix and targets.

    The DataFrame is read from ``DATA_DIR``/``dataset_file``. Rows missing
    either the "Left" or the "Right" value are discarded, and the target
    "rating" is the larger of the two for each remaining row.

    Returns
    -------
    (X, y) : tuple
        ``X`` is a float64 array with one row per kept sample, built from the
        per-row arrays stored in the "frequency_components" column.
        ``y`` is the 1-D array of ratings.

    Raises
    ------
    ValueError
        If no row has both "Left" and "Right" present.
    """
    raw = pd.read_pickle(os.path.join(DATA_DIR, dataset_file))

    # dropna + assign produce a fresh frame; writing a new column onto a
    # boolean-filtered slice is the chained-assignment pattern pandas warns about.
    dataset = (
        raw
        .dropna(subset=["Left", "Right"])
        .assign(rating=lambda frame: frame[["Left", "Right"]].max(axis=1))
    )

    if dataset.empty:
        raise ValueError(
            "No rows with both 'Left' and 'Right' ratings in "
            + os.path.join(DATA_DIR, dataset_file)
        )

    # Disabled experiment, kept for reference ("remove later" in the original):
    # duplicating the rating == 4.0 rows three times as a crude oversampling step.
    #temp = dataset[dataset["rating"]==4.0]
    #dataset = dataset.append([temp]*3,ignore_index=True)

    # Each cell of "frequency_components" holds one feature vector; stack them
    # into a 2-D matrix, one row per sample, stored as float64.
    X = np.stack(list(dataset["frequency_components"])).astype(np.float64)
    y = dataset["rating"].to_numpy()
    return (X, y)

In [4]:
def SMOTE_oversample(X, y, k_neighbors=3, random_state=None):
    """Oversample ``(X, y)`` with SMOTE and return the resampled pair.

    Parameters
    ----------
    X, y
        Features and labels, passed unchanged to ``SMOTE.fit_resample``.
    k_neighbors : int, default 3
        Forwarded to ``SMOTE``; the default keeps the previously hard-coded value.
    random_state : optional
        Forwarded to ``SMOTE``; the default ``None`` keeps the previous behaviour.

    Returns
    -------
    (X_resampled, y_resampled) : tuple
    """
    oversample = SMOTE(k_neighbors=k_neighbors, random_state=random_state)
    X_resampled, y_resampled = oversample.fit_resample(X, y)
    return (X_resampled, y_resampled)

In [5]:
# Load the full dataset once and report the shapes of features and targets.
X, y = load_dataset()
#(X, y) = SMOTE_oversample(X,y)
print(X.shape, y.shape, sep="\n")

#print(y)

(179, 128)
(179,)


In [6]:
def dataset_property(X, y):
    """Print how many entries of ``y`` equal each of 0.0, 1.0, 2.0, 3.0 and 4.0.

    The five counts are written on one line separated by spaces. ``X`` is
    accepted but not used. Nothing is returned.
    """
    per_class = [np.sum(y == float(label)) for label in range(5)]
    print(*per_class)

In [7]:
def train(X_train, y_train, model):
    """Fit ``model`` behind a ``StandardScaler`` and return the fitted pipeline.

    The pipeline is built with ``make_pipeline(StandardScaler(), model)`` and
    fitted on ``X_train`` / ``y_train`` as given (the SMOTE oversampling step
    is currently disabled).
    """
    #(X,y) = SMOTE_oversample(X_train,y_train)
    scaled_model = make_pipeline(StandardScaler(), model)
    scaled_model.fit(X_train, y_train)
    return scaled_model

In [8]:
def evaluate(y, pred_y):
    """Score predictions ``pred_y`` against ``y`` with six classification metrics.

    Returns a dict mapping each metric name to its score, plus a 'keys' entry
    listing the metric names in order.

    NOTE(review): every scorer is called with its default arguments; confirm
    those defaults suit the label set this is used with.
    """
    scorers = (
        ('accuracy', accuracy_score),
        ('auc-roc', roc_auc_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('balanced_accuracy', balanced_accuracy_score),
    )

    performance = {'keys': [metric_name for metric_name, _ in scorers]}
    for metric_name, scorer in scorers:
        performance[metric_name] = scorer(y, pred_y)

    return performance

In [9]:
def train_test_split(train_fraction=None):
    """Randomly partition the dataset into a train part and a test part.

    Parameters
    ----------
    train_fraction : float, optional
        Share of the rows assigned to the train part. When omitted, the first
        entry of the module-level ``TRAIN_TEST_SPLIT`` is used if that constant
        is defined; otherwise 0.80 (the value in its commented-out definition).

    Returns
    -------
    (X_train, y_train, X_index_train, X_test, y_test, X_index_test) : tuple
        The ``X_index_*`` lists identify the rows in each part. If
        ``load_dataset()`` supplies a third element it is used as the row
        identifiers; otherwise the row positions within the loaded arrays are
        used.
    """
    if train_fraction is None:
        try:
            train_fraction = TRAIN_TEST_SPLIT[0]
        except NameError:
            # The constant is commented out in the configuration cell.
            train_fraction = 0.80

    # load_dataset() currently returns (X, y); unpacking three values from it
    # raised ValueError. Accept both the two- and three-element forms.
    loaded = load_dataset()
    X, y = loaded[0], loaded[1]
    N = X.shape[0]
    X_index = loaded[2] if len(loaded) > 2 else np.arange(N)

    N_train = int(N * train_fraction)
    N_test = N - N_train

    #print(N, N_train, N_test)

    test_indices = np.random.choice(N, N_test, replace=False)
    # Ascending positions not drawn for the test part.
    train_indices = np.setdiff1d(np.arange(N), test_indices)

    (X_train, y_train) = (X[train_indices, :], y[train_indices])
    (X_test, y_test) = (X[test_indices, :], y[test_indices])

    X_index_train = [X_index[i] for i in train_indices]
    X_index_test = [X_index[i] for i in test_indices]

    return (X_train, y_train, X_index_train, X_test, y_test, X_index_test)

In [10]:
#train_test_split()

In [11]:
def train_test_dev_split():
    """Randomly partition the dataset into train, test and dev parts.

    Part sizes come from ``TRAIN_TEST_DEV_SPLIT``: its first two entries give
    the train and test fractions (rounded down), and the dev part takes the
    remaining rows. The three sizes are printed before the split is drawn.

    Returns
    -------
    (X_train, y_train, X_test, y_test, X_dev, y_dev) : tuple
    """
    features, targets = load_dataset()
    total = features.shape[0]

    n_train = int(total * TRAIN_TEST_DEV_SPLIT[0])
    n_test = int(total * TRAIN_TEST_DEV_SPLIT[1])
    n_dev = total - n_train - n_test

    print("Train Test Dev")
    print(n_train, n_test, n_dev)

    # Draw every held-out row in one go, then cut the draw into test and dev.
    held_out = np.random.choice(total, n_test + n_dev, replace=False)
    test_rows, dev_rows = held_out[:n_test], held_out[n_test:]

    # Everything that was not drawn stays in the train part, in ascending order.
    held_out_lookup = set(held_out.tolist())
    train_rows = [row for row in range(total) if row not in held_out_lookup]

    X_train, y_train = features[train_rows, :], targets[train_rows]
    X_test, y_test = features[test_rows, :], targets[test_rows]
    X_dev, y_dev = features[dev_rows, :], targets[dev_rows]

    return (X_train, y_train, X_test, y_test, X_dev, y_dev)

In [12]:
#train_test_dev_split()

In [13]:
#(X, y) = load_dataset()
#assert(X.shape[0]==y.shape[0])

# Draw the train / test / dev partitions used by the rest of the notebook.
X_train, y_train, X_test, y_test, X_dev, y_dev = train_test_dev_split()
assert len(X_train) == len(y_train)
print("Dataset Loaded")

Train Test Dev
134 26 19
Dataset Loaded


In [14]:
# Grid search over SVR kernel and C. Every configuration is fitted on the
# train partition and scored on the dev partition; the model with the lowest
# dev-set MSE is kept and pickled to disk.
C_values = [100, 50, 25, 10, 1, 0.1]
#Kernels = ['rbf', 'poly', 'sigmoid']
Kernels = ['rbf', 'poly']

# Start from +inf so the first fitted model always becomes the incumbent.
best_MSE = float("inf")
best_model = None
# Stored as (kernel, C) -- the same order used when it is updated below.
best_config = (Kernels[0], C_values[0])

for kernel in Kernels:

    dev_MSEs = []
    train_MSEs = []

    for c in C_values:

        print(kernel, c)
        model = SVR(C=c, kernel=kernel)

        # NOTE(review): the model is fitted on the features as-is; the
        # StandardScaler pipeline built by train() is not used here.
        model.fit(X_train, y_train)

        dev_preds = model.predict(X_dev)
        dev_MSE = mean_squared_error(y_dev, dev_preds)
        dev_MAE = mean_absolute_error(y_dev, dev_preds)
        dev_MSEs.append(dev_MSE)

        if dev_MSE < best_MSE:
            best_MSE = dev_MSE
            best_model = model
            best_config = (kernel, c)

        print("Dev set MSE, MAE")
        print(dev_MSE, dev_MAE)

        train_preds = model.predict(X_train)
        train_MSE = mean_squared_error(y_train, train_preds)
        train_MAE = mean_absolute_error(y_train, train_preds)
        train_MSEs.append(train_MSE)

        print("Train set MSE, MAE")
        print(train_MSE, train_MAE)

        print("\n===========\n")

    # One figure per kernel: dev vs. train MSE as a function of C.
    fig, ax = plt.subplots()
    ax.plot(C_values, dev_MSEs, 'r--', C_values, train_MSEs, 'bs')
    ax.set_title("SVR (" + kernel + " kernel)")
    ax.set_xlabel('C')
    ax.set_ylabel('Mean Squared Error')
    ax.legend(['Dev Set', 'Train Set'])
    fig.savefig(kernel+"_svr.jpg")
    plt.close(fig)

# save the model to disk
# `filename` is read again by the cell that reloads the model.
filename = os.path.join(DATA_DIR, 'deepmag_svr_fullfit_model.pkl')
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("Best Hyperparameters")
print(best_config)

rbf 100
Dev set MSE, MAE
0.5504768314647414 0.5684983634118775
Train set MSE, MAE
0.44935004002319223 0.42283377164292646


rbf 50
Dev set MSE, MAE
0.5383674989470537 0.5761106909191419
Train set MSE, MAE
0.5202169910319906 0.4707130361821292


rbf 25
Dev set MSE, MAE
0.5536945538117554 0.5871787941948763
Train set MSE, MAE
0.582333649172706 0.5022230657202832


rbf 10
Dev set MSE, MAE
0.525198237779151 0.565160300402474
Train set MSE, MAE
0.6564761691674198 0.5339158546300166


rbf 1
Dev set MSE, MAE
0.4952096787095395 0.5564175808703012
Train set MSE, MAE
0.7655232548193227 0.5785747977554212


rbf 0.1
Dev set MSE, MAE
0.475164054308165 0.5416276825772883
Train set MSE, MAE
0.826657016291105 0.6059255083158177


poly 100
Dev set MSE, MAE
0.8900923640009782 0.6704935430485444
Train set MSE, MAE
0.5835647934366182 0.4660948965421785


poly 50
Dev set MSE, MAE
0.7236437859313254 0.6209390693419087
Train set MSE, MAE
0.6334471623458839 0.48246763724298697


poly 25
Dev set MSE, MAE
0.706

In [15]:
# Reload the pickled best model and predict on the held-out test partition.
# NOTE(security): unpickling can execute arbitrary code -- only load files
# that this notebook wrote itself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
prediction = loaded_model.predict(X_test)
print(prediction.shape)

(26,)


In [16]:
# Report MSE and MAE of the reloaded model on the held-out test partition.
print("Test Performance")
MSE = mean_squared_error(y_true=y_test, y_pred=prediction)
MAE = mean_absolute_error(y_true=y_test, y_pred=prediction)
print(MSE, MAE)

Test Performance
1.0288481014421338 0.6977819268531756
