https://github.com/leopiney/nih-seizure-prediction/blob/master/work/Playground.ipynb
https://s3.amazonaws.com/MLMastery/machine_learning_mastery_with_python_sample.pdf
https://github.com/leopiney/nih-seizure-prediction/blob/master/work/NIH%20Seizure%20Prediction.ipynb
https://github.com/QuantScientist/Seizure-Prediction-2016/blob/master/sh_lib_fm_seizure_prediction.ipynb
https://www.kaggle.com/c/melbourne-university-seizure-prediction/data?sample_submission.csv


In [42]:

import scipy.io
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [3]:
def get_class_from_name(name):
    """
    Gets the class from the file name.
    
    The class is defined by the last number written in the file name.
    For example:
    
    Input: ".../1_1_1.mat"
    Output: 1.0
    
    Input: ".../1_1_0.mat"
    Output: 0.0
    """
    try:
        return float(name[-5])
    except:
        return 0.0

assert get_class_from_name('/train_1/1_1_0.mat') == 0.0
assert get_class_from_name('/train_1/1_1_1.mat') == 1.0

In [4]:
from scipy.fftpack import rfft


def get_X_files_and_y(base_dir, train_samples=600):
    ignored_files = ['.DS_Store', '1_45_1.mat']

    X_files = np.array([])
    y = np.array([])

    for i, filename in enumerate(os.listdir(base_dir)):
        if filename in ignored_files:
            continue

        X_files = np.append(X_files, str(filename))
        y = np.append(y, get_class_from_name(filename))  # The number of readings

        if i >= train_samples:
            break
    
    return X_files, y


def get_X_from_files(base_dir, files, show_progress=True):
    """
    Given a list of filenames, returns the Standard deviation of the content of each file as a row.
    """
    X = None
    n = 128

    total_files = len(files)

    for i, filename in enumerate(files):
        if show_progress and i % int(total_files / 10) == 0:
            print(u'%{}: Loading file {}'.format(int(i * 100 / total_files), filename))

        try:
            mat_data = scipy.io.loadmat(''.join([base_dir, filename]))
        except ValueError as ex:
            print(u'Error loading MAT file {}: {}'.format(filename, str(ex)))
            continue

        data = mat_data['dataStruct'][0][0][0]
        x_fft = rfft(data, n=n, axis=0)

        X = np.vstack([X, x_fft]) if X is not None else x_fft
    
    return X

In [51]:
import os
import numpy as np
from sklearn.cross_validation import train_test_split

base = r'C:\data\seizure\train_1\\'
base_tests = r'C:\data\seizure\test_1\\'
    
X_files, y = get_X_files_and_y(base, train_samples=1000)
X_train_files, X_test_files, y_train, y_test = train_test_split(X_files, y, test_size=0.33, random_state=42)

In [52]:
print(len(X_files))
print(len(y))

1000
1000


In [53]:
%time X_train = get_X_from_files(base_dir=base, files=X_train_files)
%time X_test = get_X_from_files(base_dir=base, files=X_test_files)
print(u'X_train shape: {} - y_train shape: {}'.format(X_train.shape, y_train.shape))
print(u'X_test shape: {} - y_test shape: {}'.format(X_test.shape, y_test.shape))

%0: Loading file 1_510_0.mat
%10: Loading file 1_298_0.mat
%20: Loading file 1_254_0.mat
%30: Loading file 1_1058_0.mat
%40: Loading file 1_3_1.mat
%50: Loading file 1_133_1.mat
%60: Loading file 1_69_1.mat
%70: Loading file 1_629_0.mat
%80: Loading file 1_342_0.mat
%90: Loading file 1_140_0.mat
Wall time: 2min 4s
%0: Loading file 1_361_0.mat
%10: Loading file 1_343_0.mat
%20: Loading file 1_479_0.mat
%30: Loading file 1_611_0.mat
%40: Loading file 1_151_0.mat
%50: Loading file 1_483_0.mat
%60: Loading file 1_437_0.mat
%70: Loading file 1_74_1.mat
%80: Loading file 1_227_0.mat
%90: Loading file 1_286_0.mat
Wall time: 1min 4s
X_train shape: (85760L, 16L) - y_train shape: (670L,)
X_test shape: (42240L, 16L) - y_test shape: (330L,)


In [55]:
print(len(X_train))
print(len(y_train))
print('--------')
print(len(X_test))
print(len(y_test))

85760
670
--------
42240
330


In [49]:
X_train[:5]

array([[ -522.83203125, -2230.59814453,  1213.36413574, -6526.05859375,
        -1849.34411621,   460.59753418,  -730.86401367,  -184.24256897,
         -376.07785034,  2654.484375  ,   867.40881348,   770.71661377,
         2509.0925293 ,  3169.86572266,   835.84240723,   187.69282532],
       [ 1018.17175293,  1230.95703125,  -809.96130371, -1067.27197266,
        -2720.84667969, -2980.80371094, -1877.52319336, -1476.08032227,
         2715.11450195,  1885.99450684,  -581.54443359,    79.69972229,
         -655.93640137,  6191.70214844,   748.34313965, -2019.03833008],
       [-1347.70117188,  -533.51928711,  -583.28344727, -1643.6887207 ,
           24.52093315, -1588.49865723,  2331.15161133,  -763.27905273,
        -1485.48095703,  -747.85113525,  2346.11181641, -1469.6229248 ,
         2892.92529297, -1577.02380371,  5108.41601562,  -907.89154053],
       [ -552.44030762,  -672.64904785,  -104.58300781,  1196.57897949,
          595.27740479,   298.974823  ,  -286.58587646,    36

In [81]:
#len(y_train[:6])

In [56]:
y_train[:5]

array([ 0.,  1.,  0.,  1.,  0.])

In [57]:
# base_dir = 'c:\\data\\seizure\\train1'
# for i, filename in enumerate(os.listdir(base_dir)):
#     print(fileName)

In [58]:
# os.listdir('./seizure/train1')

In [64]:
# import os
# startpath = r'c:data\seizure\test_1\'
# corpus_path = sorted([os.path.join("c:", "data", directories) for directories in os.listdir(startpath)])
# print(corpus_path)

In [65]:
#this works
# from os import listdir
# from os.path import isfile, join
# mypath = r"C:\data\seizure\test_1"
# onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# print(onlyfiles)

In [67]:
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [71]:
num_folds = 10
num_instances = len(X_train)
kfold_seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=kfold_seed) 
model = LogisticRegression()

In [72]:
def cross_validation_score(mod, X, Y, kfold, scoring):
    score = cross_validation.cross_val_score(mod, X, Y, cv=kfold, scoring=scoring) 
    return (("Scoring: %s; Accuracy: %.3f (%.3f)") % (scoring, score.mean(), score.std()), score)

In [75]:
cross_val_scoring = ['accuracy','log_loss', 'roc_auc' ] #classifiction metrics
#['r2','mean_absolute_error','mean_squared_error'] - regression metrics
for scoring in cross_val_scoring:    
    result = cross_validation_score(model, X_train[:670], y_train, kfold, scoring)
    print(result[0])

IndexError: index 8576 is out of bounds for size 670