In [2]:
from glob import glob
import os
import mne
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
all_file_path = glob('dataset/*.edf')

print(len(all_file_path))

28


In [8]:
healthy_file_path = [i for i in all_file_path if 'h' in i.split('\\')[1]]
patient_file_path = [i for i in all_file_path if 's' in i.split('\\')[1]]

print(len(healthy_file_path),len(patient_file_path))

14 14


In [15]:
def read_data(file_path):
    data = mne.io.read_raw_edf(file_path,preload=True)
    data.set_eeg_reference()
    data.filter(l_freq=0.5,h_freq=45)
    epochs = mne.make_fixed_length_epochs(data,duration=5,overlap=1)
    
    array = epochs.get_data()
    
    return array

In [16]:
sample_data = read_data(healthy_file_path[0])

Extracting EDF parameters from C:\Users\Piyush\Machine Learning CAC2\dataset\h01.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 231249  =      0.000 ...   924.996 secs...
EEG channel type selected for re-referencing
Applying average reference.
Applying a custom ('EEG',) reference.
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 45 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 45.00 Hz
- Upper transition bandwidth: 11.25 Hz (-6 dB cutoff frequency: 50.62 Hz)
- Filter length: 1651 samples (6.604 s)

Not setting metadata
231 matching events found
No baseline correction applied
0 projection items 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:    0.0s finished


In [17]:
sample_data.shape #no of epochs, channels, lenght of signal

(231, 19, 1250)

In [19]:
%%capture
control_epochs_array = [read_data(i) for i in healthy_file_path ]
patient_epochs_array = [read_data(i) for i in patient_file_path ]

In [22]:
control_epochs_array[0].shape , control_epochs_array[1].shape

((231, 19, 1250), (227, 19, 1250))

In [27]:
control_epoch_labels = [len(i)*[0] for i in control_epochs_array]
patient_epoch_labels = [len(i)*[1] for i in patient_epochs_array]

len(control_epoch_labels),len(patient_epoch_labels)

(14, 14)

In [28]:
data_list = control_epochs_array+patient_epochs_array
label_list = control_epoch_labels+patient_epoch_labels

In [29]:
group_list = [[i]*len(j) for i,j in enumerate(data_list)]

len(group_list)

28

In [30]:
data_array = np.vstack(data_list)
label_array = np.hstack(label_list)
group_array = np.hstack(group_list)

print(data_array.shape , label_array.shape , group_array.shape)


(7201, 19, 1250) (7201,) (7201,)


In [32]:
from scipy import stats

def mean(x):
    return np.mean(data,axis=-1)

def std(data):
    return np.std(data,axis=-1)

def ptp(data):
    return np.ptp(data,axis=-1)

def var(data):
        return np.var(data,axis=-1)

def minim(data):
      return np.min(data,axis=-1)


def maxim(data):
      return np.max(data,axis=-1)

def argminim(data):
      return np.argmin(data,axis=-1)


def argmaxim(data):
      return np.argmax(data,axis=-1)

def mean_square(data):
      return np.mean(data**2,axis=-1)

def rms(data): #root mean square
      return  np.sqrt(np.mean(data**2,axis=-1))  

def abs_diffs_signal(data):
    return np.sum(np.abs(np.diff(data,axis=-1)),axis=-1)


def skewness(data):
    return stats.skew(data,axis=-1)

def kurtosis(data):
    return stats.kurtosis(data,axis=-1)

def concatenate_features(data):
    return np.concatenate((mean(data),std(data),ptp(data),var(data),minim(data),maxim(data),argminim(data),argmaxim(data),
                          mean_square(data),rms(data),abs_diffs_signal(data),
                          skewness(data),kurtosis(data)),axis=-1)
    

In [41]:
features = []
for d in data_array:
    features.append(concatenate_features(d))

In [42]:
features_array = np.array(features)
features_array.shape

(7201, 247)

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold,GridSearchCV,cross_val_score,cross_validate 

In [49]:
clf = LogisticRegression(max_iter=10000)
gkf = GroupKFold(5)
pipe = Pipeline([('scaler',StandardScaler()),('clf',clf)])
param_grid = {'clf__C':[0.1,0.5,0.7,1,3,5,7]}
gscv = GridSearchCV(pipe,param_grid,cv=gkf,n_jobs=12)
gscv.fit(features_array,label_array,groups=group_array)

In [50]:
gscv.best_score_

0.6687967529031762