In [31]:
%matplotlib inline
%pylab inline
import numpy as np
from scipy.io import loadmat
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


In [55]:
TRAIN_DATA_PATH = "/Users/svegal/Downloads/train_{}/"
TRAIN_DATA_CHUNKS = (1, )
PREICTAL = 1
INTERICTAL = 0

### Get the data

Load samples of interictal and preictal data

In [12]:
def mat_to_pandas(path):
    mat = loadmat(path, verify_compressed_data_integrity=False)
    names = mat['dataStruct'].dtype.names
    ndata = {n: mat['dataStruct'][n][0, 0] for n in names}
    return pd.DataFrame(ndata['data'], columns=ndata['channelIndices'][0])

In [59]:
import random

IGNORED_FILES = set(['.DS_Store'])

def is_class(class_id, filename):
    if filename in IGNORED_FILES:
        return False
    return int(filename.rstrip('.mat')[-1]) == class_id

def filenames_for_class(path, class_id, n_filenames):
    return random.sample([f for f in os.listdir(path) if is_class(class_id, f)], n_filenames)

In [64]:
path = TRAIN_DATA_PATH.format(TRAIN_DATA_CHUNKS[0])
print("Interictal", len([f for f in os.listdir(path) if is_class(0, f)]))
print("Preictal", len([f for f in os.listdir(path) if is_class(1, f)]))

Interictal 1152
Preictal 150


In [160]:
def add_correlations(channels_data):
    correlations = None
    for i in range(16):
        correlations_i = np.array([])
        for j in range (16):
            if i != j:
                corr_i = correlate(channels_data[i], channels_data[j], mode='same')
                correlations_i = np.concatenate([correlations_i, corr_i])
        correlations = np.vstack([correlations, correlations_i]) if correlations is not None else correlations_i
    return np.column_stack([channels_data, correlations])


def generate_features(data):
    channels_data = resample(data.transpose(), 600, axis=1, window=400)  # 1 entry per second
    channels_data = add_correlations(channels_data)
    return channels_data

In [161]:
from scipy.signal import resample

def train_dataset(n_files, n_preictal):
    """Create a train dataset from randomly chosen n_files, where the share of preictal data is share_preictal"""
    PATH = TRAIN_DATA_PATH.format(TRAIN_DATA_CHUNKS[0])
    n_interictal = n_files - n_preictal
    X = None
    y = None
    for i, n in enumerate([n_interictal, n_preictal]):
        filenames = filenames_for_class(PATH, i, n)
        for f in filenames:
            channels_data = generate_features(mat_to_pandas(PATH + '/' + f))
            y_data = np.repeat(i, 16, axis=0)
            X = np.vstack([X, channels_data]) if X is not None else channels_data
            y = np.concatenate([y, y_data]) if y is not None else y_data
    return X, y


In [163]:
%time X_half, y_half = train_dataset(300, 150)
print(X_half.shape, y_half.shape)

CPU times: user 1min 15s, sys: 46.8 s, total: 2min 2s
Wall time: 2min 10s
(4800, 9600) (4800,)


### Preprocessing

In [173]:
from sklearn.preprocessing import normalize

# Normalizes the data
normalize(X_half, copy=False)

array([[  2.71441032e-05,   2.25850828e-05,   2.64450612e-05, ...,
          1.29462891e-03,   2.01110609e-03,   1.60979130e-03],
       [ -1.72742677e-05,  -1.94196783e-05,  -1.02782028e-05, ...,
         -1.56526192e-04,  -3.73398047e-04,  -6.34001805e-04],
       [  1.17846002e-06,   1.74172701e-05,   1.20380717e-05, ...,
         -2.98124539e-03,  -2.38665110e-03,  -1.48251133e-03],
       ..., 
       [ -2.62965026e-05,   6.79575209e-05,   2.47284413e-05, ...,
          2.24133631e-03,   9.69818857e-04,  -2.37030196e-03],
       [  1.04465608e-04,   4.76027187e-05,  -3.18499957e-05, ...,
          1.33280056e-03,   4.47960997e-03,   1.22885411e-03],
       [  9.92667152e-05,   5.96727001e-06,   7.01727021e-05, ...,
          6.11471182e-03,   7.84554719e-04,   4.06444971e-03]])

### Building the model

In [124]:
from sklearn.cross_validation import train_test_split
from sklearn import linear_model

In [154]:
class Model(object):
    def __init__(self, X, y):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        self.clf = linear_model.LogisticRegression(C=16, n_jobs=3, verbose=5)
        self.clf.fit(self.X_train, self.y_train)
        self.y_pred = self.clf.predict(self.X_test)

In [169]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def metrics(model):
    funcs = dict(
        accuracy_score=accuracy_score, 
        precision_score=precision_score, 
        recall_score=recall_score,
        f1_score=f1_score,
        roc_auc_score=roc_auc_score,
    )
    result = {k: v(model.y_test, model.y_pred) for k, v in funcs.items()}
    for k, v in result.items():
        print(k.title(), v)
    return result

In [174]:
model_part = Model(X_half, y_half)

[LibLinear]

In [175]:
half_metrics = metrics(model_part)

Accuracy_Score 0.619166666667
Recall_Score 0.664429530201
Precision_Score 0.606431852986
F1_Score 0.634107285829
Roc_Auc_Score 0.61946642073
