https://github.com/leopiney/nih-seizure-prediction/blob/master/work/Playground.ipynb
https://s3.amazonaws.com/MLMastery/machine_learning_mastery_with_python_sample.pdf
https://github.com/leopiney/nih-seizure-prediction/blob/master/work/NIH%20Seizure%20Prediction.ipynb
https://github.com/QuantScientist/Seizure-Prediction-2016/blob/master/sh_lib_fm_seizure_prediction.ipynb
https://www.kaggle.com/c/melbourne-university-seizure-prediction/data?sample_submission.csv


In [2]:

import scipy.io
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [3]:
def get_class_from_name(name):
    """
    Gets the class from the file name.
    
    The class is defined by the last number written in the file name.
    For example:
    
    Input: ".../1_1_1.mat"
    Output: 1.0
    
    Input: ".../1_1_0.mat"
    Output: 0.0
    """
    try:
        return float(name[-5])
    except:
        return 0.0

assert get_class_from_name('/train_1/1_1_0.mat') == 0.0
assert get_class_from_name('/train_1/1_1_1.mat') == 1.0

In [4]:
from scipy.fftpack import rfft


def get_X_files_and_y(base_dir, train_samples=600):
    ignored_files = ['.DS_Store', '1_45_1.mat']

    X_files = np.array([])
    y = np.array([])

    for i, filename in enumerate(os.listdir(base_dir)):
        if filename in ignored_files:
            continue

        X_files = np.append(X_files, str(filename))
        y = np.append(y, get_class_from_name(filename))  # The number of readings

        if i >= train_samples:
            break
    
    return X_files, y


def get_X_from_files(base_dir, files, show_progress=True):
    """
    Given a list of filenames, returns the Standard deviation of the content of each file as a row.
    """
    X = None
    n = 128

    total_files = len(files)

    for i, filename in enumerate(files):
        if show_progress and i % int(total_files / 10) == 0:
            print(u'%{}: Loading file {}'.format(int(i * 100 / total_files), filename))

        try:
            mat_data = scipy.io.loadmat(''.join([base_dir, filename]))
        except ValueError as ex:
            print(u'Error loading MAT file {}: {}'.format(filename, str(ex)))
            continue

        data = mat_data['dataStruct'][0][0][0]
        x_fft = rfft(data, n=n, axis=0)

        X = np.vstack([X, x_fft]) if X is not None else x_fft
    
    return X

In [5]:
import os
import numpy as np
from sklearn.cross_validation import train_test_split

base = r'C:\data\seizure\train_1\\'
base_tests = r'C:\data\seizure\test_1\\'
    
X_files, y = get_X_files_and_y(base, train_samples=1000)
X_train_files, X_test_files, y_train, y_test = train_test_split(X_files, y, test_size=0.33, random_state=42)

In [25]:
print(len(X_files)) #just file names
print(len(y))
print(len(X_train_files)) #file names
print(len(X_test_files)) #file names
print(len(y_train)) #binary values
print(len(y_test)) #binary values

#print(y_train[:5])

1000
1000
670
330
670
330


In [60]:
def get_X_from_files_data(base_dir, filename, show_progress=True):
    output = None
    n = 128
    mat_data = scipy.io.loadmat(''.join([base_dir, filename]))
    data = mat_data['dataStruct'][0][0][0]
    print(data)
    x_fft = rfft(data, n=n, axis=0)
    #print(x_fft)
    output = np.vstack([output, x_fft]) if output is not None else x_fft
    return output

In [58]:
#X_train_see = get_X_from_files(base_dir=base, files=X_train_files[:1])
fname1 = X_train_files[:1][0]
#print(fname1)
fname2 = '1_3_1.mat'

In [59]:
n = 128
res = get_X_from_files_data(base_dir=base, filename=fname1)
print(res)

[[  3.08377228e+02   7.61208130e+02  -3.33036377e+02 ...,   3.02578064e+02
   -1.03572375e+03   9.08074341e+02]
 [ -1.12125854e+03  -8.55451355e+02  -2.88266846e+03 ...,   5.86717529e+03
    4.26075781e+03  -3.96341992e+03]
 [ -3.25292627e+03  -2.72353223e+03  -3.32947803e+03 ...,  -7.94843262e+03
    5.92156494e+03   1.21478748e+03]
 ..., 
 [  2.46030884e+01   4.97979736e+00   5.21492920e+01 ...,   4.72304688e+01
    2.93537598e+01   2.83518066e+01]
 [ -3.22143555e+01  -2.00600586e+01  -1.67559814e+01 ...,  -8.17927246e+01
   -2.51879883e+01   1.11975098e+00]
 [  3.90364075e+01   3.12818604e+01   5.64648590e+01 ...,   7.82789307e+01
    1.71020508e-01   6.06293030e+01]]


In [7]:
%time X_train = get_X_from_files(base_dir=base, files=X_train_files)
%time X_test = get_X_from_files(base_dir=base, files=X_test_files)
print(u'X_train shape: {} - y_train shape: {}'.format(X_train.shape, y_train.shape))
print(u'X_test shape: {} - y_test shape: {}'.format(X_test.shape, y_test.shape))

%0: Loading file 1_510_0.mat
%10: Loading file 1_298_0.mat
%20: Loading file 1_254_0.mat
%30: Loading file 1_1058_0.mat
%40: Loading file 1_3_1.mat
%50: Loading file 1_133_1.mat
%60: Loading file 1_69_1.mat
%70: Loading file 1_629_0.mat
%80: Loading file 1_342_0.mat
%90: Loading file 1_140_0.mat
Wall time: 4min 30s
%0: Loading file 1_361_0.mat
%10: Loading file 1_343_0.mat
%20: Loading file 1_479_0.mat
%30: Loading file 1_611_0.mat
%40: Loading file 1_151_0.mat
%50: Loading file 1_483_0.mat
%60: Loading file 1_437_0.mat
%70: Loading file 1_74_1.mat
%80: Loading file 1_227_0.mat
%90: Loading file 1_286_0.mat
Wall time: 1min 53s
X_train shape: (85760L, 16L) - y_train shape: (670L,)
X_test shape: (42240L, 16L) - y_test shape: (330L,)


In [8]:
print(len(X_train))
print(len(y_train))
print('--------')
print(len(X_test))
print(len(y_test))

85760
670
--------
42240
330


In [9]:
X_train[:5]

array([[  308.37722778,   761.20812988,  -333.03637695, -1329.16625977,
         1021.93902588,   508.41217041,  1463.21826172,   458.79901123,
         -120.94632721,  -580.20782471,  -314.07348633,  -213.30230713,
        -1688.46362305,   302.57806396, -1035.72375488,   908.07434082],
       [-1121.25854492,  -855.45135498, -2882.66845703, -4577.93847656,
        -2770.00341797,  -347.56015015,  5068.32617188,  1703.92041016,
          532.19494629, -1839.23364258, -2710.484375  ,  -345.62872314,
         4111.14892578,  5867.17529297,  4260.7578125 , -3963.41992188],
       [-3252.92626953, -2723.53222656, -3329.47802734, -4080.19726562,
          265.92858887,  1673.14770508,  4734.2265625 ,   627.47485352,
        -2102.30712891, -1505.67260742,  3201.19262695,   264.6618042 ,
         7189.05273438, -7948.43261719,  5921.56494141,  1214.78747559],
       [  478.37713623,   680.8470459 ,   750.37316895,  -542.20751953,
         -965.28112793, -2325.27856445, -2823.27880859, -4091

In [10]:
#len(y_train[:6])

In [11]:
y_train[:5]

array([ 0.,  1.,  0.,  1.,  0.])

In [12]:
# base_dir = 'c:\\data\\seizure\\train1'
# for i, filename in enumerate(os.listdir(base_dir)):
#     print(fileName)

In [13]:
# os.listdir('./seizure/train1')

In [14]:
# import os
# startpath = r'c:data\seizure\test_1\'
# corpus_path = sorted([os.path.join("c:", "data", directories) for directories in os.listdir(startpath)])
# print(corpus_path)

In [15]:
#this works
# from os import listdir
# from os.path import isfile, join
# mypath = r"C:\data\seizure\test_1"
# onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# print(onlyfiles)

In [16]:
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [17]:
num_folds = 10
num_instances = len(X_train)
kfold_seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=kfold_seed) 
model = LogisticRegression()

In [18]:
def cross_validation_score(mod, X, Y, kfold, scoring):
    score = cross_validation.cross_val_score(mod, X, Y, cv=kfold, scoring=scoring) 
    return (("Scoring: %s; Accuracy: %.3f (%.3f)") % (scoring, score.mean(), score.std()), score)

In [19]:
cross_val_scoring = ['accuracy','log_loss', 'roc_auc' ] #classifiction metrics
#['r2','mean_absolute_error','mean_squared_error'] - regression metrics
for scoring in cross_val_scoring:    
    result = cross_validation_score(model, X_train[:670], y_train, kfold, scoring)
    print(result[0])

IndexError: index 8576 is out of bounds for size 670