In [1]:
%autosave 60
print(__doc__)

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.metrics
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.metrics import *
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

Autosaving every 60 seconds
Automatically created module for IPython interactive environment


In [2]:
# Directory holding the feature/label arrays and the index pickle read by load_dataset.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATA_DIR = "C:/Wasif/PD Motor Feature Extraction/TASK2_FEATURES_04_21/"
# File names, relative to DATA_DIR, of the feature matrix, the labels and the index pickle.
X_file = "x_repeat_removed_raw_pixels.npy"
y_file = "y_repeat_removed_raw_pixels.npy"
X_index_file = "index_repeat_removed.pickle"
# Seed applied to NumPy's global random generator at the end of this cell.
SEED = 5543
# train_test_split() uses the first entry as the training fraction; the rest is the test set.
TRAIN_TEST_SPLIT = [0.80, 0.20]
# train_test_dev_split() uses the first two entries as the train and test fractions;
# the dev set receives whatever samples remain.
TRAIN_TEST_DEV_SPLIT = [0.70, 0.15, 0.15]
# Number of folds passed to k_fold_CV.
NUM_FOLDS = 10
# Seed the global NumPy RNG so the shuffles/choices below are repeatable on a fresh run.
np.random.seed(seed=SEED)

In [3]:
def load_dataset(DATA_DIR):
    """Load the feature matrix, labels and per-sample index records from disk.

    The file names come from the module-level ``X_file``, ``y_file`` and
    ``X_index_file`` constants; they are resolved relative to ``DATA_DIR``.

    Parameters
    ----------
    DATA_DIR : str or os.PathLike
        Directory containing the three files. A trailing path separator is
        optional because the paths are built with ``os.path.join``.

    Returns
    -------
    tuple
        ``(X, y, X_index)``: the feature array restricted to its first 65536
        columns, the label array, and the unpickled index object.
    """
    X = np.load(os.path.join(DATA_DIR, X_file))
    # Keep only the first 65536 columns. The complementary slice, X[:, 65536:],
    # was labelled "frequency features" by the original author, so the columns
    # kept here are presumably the raw-pixel features -- TODO confirm.
    X = X[:, :65536]

    y = np.load(os.path.join(DATA_DIR, y_file))

    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at index files produced by a trusted pipeline.
    with open(os.path.join(DATA_DIR, X_index_file), 'rb') as handle:
        X_index = pickle.load(handle)

    return (X, y, X_index)

In [4]:
def SMOTE_oversample(X,y):
    """Resample ``(X, y)`` with a default-configured SMOTE sampler.

    Returns the resampled ``(X, y)`` pair produced by ``SMOTE.fit_resample``.
    """
    sampler = SMOTE()
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return (X_resampled, y_resampled)

In [5]:
def dataset_property(X, y):
    """Print how many labels in ``y`` equal 1.0 and how many equal 0.0.

    ``X`` is accepted for signature symmetry with the other helpers but is not
    inspected. Nothing is returned.
    """
    # Count each label value in turn: positives (1.0) first, then negatives (0.0).
    n_positive, n_negative = (np.sum(y == label) for label in (1.0, 0.0))
    print(n_positive, n_negative)

In [6]:
def train(X_train, y_train, model):
    """Fit a standardise-then-classify pipeline on the training data.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    model : scikit-learn estimator
        Unfitted estimator used as the final pipeline step. It is cloned
        before fitting, so the object passed in is left untouched.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Fitted pipeline of ``StandardScaler`` followed by a copy of ``model``.
    """
    # SMOTE oversampling is currently disabled; to enable it, resample
    # (X_train, y_train) with SMOTE_oversample before fitting.

    # Fit a fresh copy of the estimator so that repeated calls (for example one
    # per cross-validation fold) never share or overwrite fitted state on the
    # caller's object.
    clf = make_pipeline(StandardScaler(), clone(model))
    clf.fit(X_train, y_train)
    return clf

In [7]:
def evaluate(y, pred_y, y_score=None):
    """Compute classification metrics for one set of predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth labels.
    pred_y : array-like
        Predicted labels.
    y_score : array-like, optional
        Continuous scores (e.g. ``decision_function`` or positive-class
        probability) used for the ROC-AUC. When omitted, the ROC-AUC is
        computed from ``pred_y`` as before; with hard binary predictions that
        value is identical to the balanced accuracy, so pass scores whenever
        a real ranking-based AUC is wanted.

    Returns
    -------
    dict
        ``'keys'`` maps to the list of metric names; every metric name maps to
        its value.
    """
    # Fall back to the hard predictions to stay compatible with existing callers.
    auc_input = pred_y if y_score is None else y_score

    performance = {}
    performance['keys'] = ['accuracy', 'auc-roc', 'f1', 'precision', 'recall', 'balanced_accuracy']
    performance['accuracy'] = accuracy_score(y, pred_y)
    performance['auc-roc'] = roc_auc_score(y, auc_input)
    performance['f1'] = f1_score(y, pred_y)
    performance['precision'] = precision_score(y, pred_y)
    performance['recall'] = recall_score(y, pred_y)
    performance['balanced_accuracy'] = balanced_accuracy_score(y, pred_y)

    return performance

In [8]:
def k_fold_CV(model, X, y, X_index, k):
    """Estimate classification performance with shuffled k-fold cross-validation.

    The sample indices are shuffled once with NumPy's global RNG and cut into
    ``k`` contiguous folds of ``N // k`` samples; the last fold also receives
    the remainder. Each fold serves once as the test set while the remaining
    samples are used for training. Per-fold metrics are printed and their mean
    over the folds is returned.

    Parameters
    ----------
    model : scikit-learn estimator
        Estimator handed to ``train`` for every fold.
    X : numpy.ndarray
        Feature matrix with one row per sample.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``.
    X_index : sequence
        Per-sample index records. Currently unused; kept in the signature for
        callers and for future inspection of misclassified samples.
    k : int
        Number of folds, ``2 <= k <= X.shape[0]``.

    Returns
    -------
    dict
        ``'keys'`` maps to the list of metric names; every metric name maps to
        its average over the ``k`` folds.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of samples, which
        would leave a fold with no training or no test samples.
    """
    N = X.shape[0]
    if k < 2 or k > N:
        raise ValueError(
            "k must be between 2 and the number of samples (%d), got %r" % (N, k)
        )

    metric_keys = ['accuracy', 'auc-roc', 'f1', 'precision', 'recall', 'balanced_accuracy']
    performance = {'keys': metric_keys}
    for key in metric_keys:
        performance[key] = 0.0

    samples_per_fold = N // k

    shuffled_indices = np.arange(0, N)
    np.random.shuffle(shuffled_indices)

    for fold_no in range(k):
        # Fold `fold_no` is the test set; the last fold absorbs the remainder.
        start = samples_per_fold * fold_no
        stop = N if fold_no == (k - 1) else samples_per_fold * (fold_no + 1)
        test_indices = shuffled_indices[start:stop]

        # Sorted complement of the test fold, computed in one vectorised call
        # instead of a per-sample membership scan.
        train_indices = np.setdiff1d(np.arange(N), test_indices)

        (X_train, y_train) = (X[train_indices, :], y[train_indices])
        (X_test, y_test) = (X[test_indices, :], y[test_indices])

        clf = train(X_train, y_train, model)

        # Test performance on the held-out fold.
        fold_performance = evaluate(y_test, clf.predict(X_test))

        print("Fold %d"%(fold_no))
        print(fold_performance)

        for key in metric_keys:
            performance[key] += fold_performance[key]

    # Turn the accumulated sums into per-fold averages.
    for key in metric_keys:
        performance[key] = performance[key] / k

    return performance

In [9]:
def train_test_split():
    """Load the dataset and split it randomly into train and test partitions.

    The training partition receives ``int(N * TRAIN_TEST_SPLIT[0])`` samples
    and the test partition the rest. Test samples are drawn without
    replacement using NumPy's global RNG.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_index_train, X_test, y_test, X_index_test)``.
        Training rows are in ascending sample order; test rows follow the
        order in which they were drawn.
    """
    (X, y, X_index) = load_dataset(DATA_DIR)
    N = X.shape[0]
    N_train = int(N * TRAIN_TEST_SPLIT[0])
    N_test = N - N_train

    test_indices = np.random.choice(N, N_test, replace=False)
    # Sorted complement of the test draw, computed in one vectorised call
    # instead of a per-sample membership scan.
    train_indices = np.setdiff1d(np.arange(N), test_indices)

    (X_train, y_train) = (X[train_indices, :], y[train_indices])
    (X_test, y_test) = (X[test_indices, :], y[test_indices])

    X_index_train = [X_index[i] for i in train_indices]
    X_index_test = [X_index[i] for i in test_indices]

    return (X_train, y_train, X_index_train, X_test, y_test, X_index_test)

In [10]:
#train_test_split()

In [11]:
def train_test_dev_split():
    """Load the dataset and split it randomly into train, test and dev partitions.

    The train and test sizes are ``int(N * TRAIN_TEST_DEV_SPLIT[0])`` and
    ``int(N * TRAIN_TEST_DEV_SPLIT[1])``; the dev partition receives the
    remaining samples. The three sizes are printed. Held-out samples are drawn
    without replacement using NumPy's global RNG; the first ``N_test`` drawn go
    to the test partition and the rest to dev.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_index_train, X_test, y_test, X_index_test,
        X_dev, y_dev, X_index_dev)``. Training rows are in ascending sample
        order; test and dev rows follow the order in which they were drawn.
    """
    (X, y, X_index) = load_dataset(DATA_DIR)
    N = X.shape[0]
    N_train = int(N * TRAIN_TEST_DEV_SPLIT[0])
    N_test = int(N * TRAIN_TEST_DEV_SPLIT[1])
    N_dev = N - N_train - N_test

    print(N_train, N_test, N_dev)

    test_dev_indices = np.random.choice(N, (N_test + N_dev), replace=False)

    test_indices = test_dev_indices[0:N_test]
    dev_indices = test_dev_indices[N_test:]
    # Sorted complement of the held-out draw, computed in one vectorised call
    # instead of a per-sample membership scan.
    train_indices = np.setdiff1d(np.arange(N), test_dev_indices)

    (X_train, y_train) = (X[train_indices, :], y[train_indices])
    (X_test, y_test) = (X[test_indices, :], y[test_indices])
    (X_dev, y_dev) = (X[dev_indices, :], y[dev_indices])

    X_index_train = [X_index[i] for i in train_indices]
    X_index_test = [X_index[i] for i in test_indices]
    X_index_dev = [X_index[i] for i in dev_indices]

    return (X_train, y_train, X_index_train, X_test, y_test, X_index_test, X_dev, y_dev, X_index_dev)

In [12]:
#train_test_dev_split()

In [13]:
# Load the dataset and check that features, labels and index records line up.
(X, y, X_index) = load_dataset(DATA_DIR)
assert X.shape[0] == y.shape[0] == len(X_index), \
    "X, y and X_index must describe the same number of samples"
print("Dataset Loaded")

# Cross-validate an SVM classifier; k_fold_CV builds the train/test split for
# every fold itself. SVC is imported explicitly at the top of the notebook
# rather than reached through `sklearn.svm`, which `import sklearn` alone does
# not guarantee to load.
model = SVC(C=100, max_iter=10000)
performance = k_fold_CV(model, X, y, X_index, NUM_FOLDS)

print("\nOverall Performance")
print(performance)

Dataset Loaded
Fold 0
{'keys': ['accuracy', 'auc-roc', 'f1', 'precision', 'recall', 'balanced_accuracy'], 'accuracy': 0.5256410256410257, 'auc-roc': 0.4649999999999999, 'f1': 0.27450980392156865, 'precision': 0.30434782608695654, 'recall': 0.25, 'balanced_accuracy': 0.465}
Fold 1
{'keys': ['accuracy', 'auc-roc', 'f1', 'precision', 'recall', 'balanced_accuracy'], 'accuracy': 0.5384615384615384, 'auc-roc': 0.4919071076706545, 'f1': 0.3333333333333333, 'precision': 0.36, 'recall': 0.3103448275862069, 'balanced_accuracy': 0.4919071076706545}
Fold 2
{'keys': ['accuracy', 'auc-roc', 'f1', 'precision', 'recall', 'balanced_accuracy'], 'accuracy': 0.48717948717948717, 'auc-roc': 0.4605978260869566, 'f1': 0.3333333333333333, 'precision': 0.35714285714285715, 'recall': 0.3125, 'balanced_accuracy': 0.46059782608695654}
Fold 3
{'keys': ['accuracy', 'auc-roc', 'f1', 'precision', 'recall', 'balanced_accuracy'], 'accuracy': 0.5128205128205128, 'auc-roc': 0.47459893048128343, 'f1': 0.24, 'precision': 0