In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [2]:
%matplotlib inline

### Read the CSV data file into a DataFrame

In [2]:
# Load the seizure dataset; the path is resolved relative to the working directory.
csv_path = 'data_seizure.csv'
data = pd.read_csv(csv_path)

In [4]:
# Inspect the (rows, columns) dimensions of the freshly loaded frame.
data.shape

(11500, 180)

### Target variable frequency

In [5]:
# Frequency of each original class label in the target column.
data['y'].value_counts()

5    2300
4    2300
3    2300
2    2300
1    2300
Name: y, dtype: int64

In [6]:
# Preview the first ten rows to see the column layout (id column, X1..X178, y).
data.head(10)

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y
0,X21.V1.791,135,190,229,223,192,125,55,-9,-33,...,-17,-15,-31,-77,-103,-127,-116,-83,-51,4
1,X15.V1.924,386,382,356,331,320,315,307,272,244,...,164,150,146,152,157,156,154,143,129,1
2,X8.V1.1,-32,-39,-47,-37,-32,-36,-57,-73,-85,...,57,64,48,19,-12,-30,-35,-35,-36,5
3,X16.V1.60,-105,-101,-96,-92,-89,-95,-102,-100,-87,...,-82,-81,-80,-77,-85,-77,-72,-69,-65,5
4,X20.V1.54,-9,-65,-98,-102,-78,-48,-16,0,-21,...,4,2,-12,-32,-41,-65,-83,-89,-73,5
5,X14.V1.56,55,28,18,16,16,19,25,40,52,...,-12,-31,-42,-54,-60,-64,-60,-56,-55,5
6,X3.V1.191,-55,-9,52,111,135,129,103,72,37,...,-125,-99,-79,-62,-41,-26,11,67,128,4
7,X11.V1.273,1,-2,-8,-11,-12,-17,-15,-16,-18,...,-79,-91,-97,-88,-76,-72,-66,-57,-39,2
8,X19.V1.874,-278,-246,-215,-191,-177,-167,-157,-139,-118,...,-400,-379,-336,-281,-226,-174,-125,-79,-40,1
9,X3.V1.491,8,15,13,3,-6,-8,-5,4,25,...,49,31,11,-5,-17,-19,-15,-15,-11,4


In [7]:
# Remove the leftover identifier column so only the signal columns and the
# target remain. The frame is rebound instead of mutated in place, and
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

### Map the multi-class target to binary (to classify seizure vs. non-seizure)

In [8]:
# Binary relabelling table: original class 1 becomes the positive class (1),
# classes 2-5 are all collapsed into the negative class (0).
map_y = {label: int(label == 1) for label in (1, 2, 3, 4, 5)}

In [9]:
# Collapse the five original classes into a binary flag using map_y.
# Guarded so the cell is safe to re-run: once y holds only the mapped values,
# applying the map a second time would turn every 0 into NaN (0 is not a key
# of map_y) and silently corrupt the target.
if not data['y'].isin(set(map_y.values())).all():
    data['y'] = data['y'].map(map_y)

In [10]:
# Group rows by the binary target and show how many fall into each class.
seizure_group = data.groupby(by='y')
seizure_group.size()

y
0    9200
1    2300
dtype: int64

In [11]:
# Target vector: the binary label column.
data_y = data['y']

In [12]:
# Select the 178 signal columns X1..X178 by name rather than by position, so
# the feature matrix does not depend on how many leading columns earlier cells
# have dropped (a positional 0:178 slice silently shifts if that changes).
feature_cols = ['X{}'.format(i) for i in range(1, 179)]
data_x = data.loc[:, feature_cols]

### Apply standard scaling to bring all features to the same scale

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Standardize the feature columns and wrap the result back into a DataFrame
# (columns become positional integers because no column names are passed).
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so held-out rows contribute to the fitted statistics —
# consider fitting on the training split only.
scaler = StandardScaler()
x_scaled = pd.DataFrame(scaler.fit_transform(data_x))

In [15]:
from sklearn.decomposition import PCA

### Try PCA for dimensionality reduction

In [16]:
# Project the standardized features onto the first 50 principal components and
# report the fraction of total variance they retain.
# random_state is pinned (same seed as the split and the model below) because
# PCA's 'auto' solver may select the randomized SVD, which otherwise can give
# slightly different components on every run.
# NOTE(review): PCA is fitted on all rows, i.e. before the train/test split, so
# held-out rows influence the projection — consider fitting on the training
# split only.
pca = PCA(n_components=50, random_state=354)
principalComponents = pca.fit_transform(x_scaled)
pca.explained_variance_ratio_.sum()

0.9868045289149014

In [17]:
# Confirm the reduced matrix keeps every row and has one column per component.
principalComponents.shape

(11500, 50)

### Split data into train and test sets

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 30% of the PCA-reduced rows for evaluation; the seed keeps the
# split reproducible.
# NOTE(review): the target is imbalanced (9200 vs 2300); passing
# stratify=data_y would guarantee the same class ratio in both splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    principalComponents,
    data_y,
    test_size=0.30,
    random_state=354,
)

### Try a gradient-boosted tree classifier

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
# Gradient-boosted trees with stochastic boosting: each of the 350 trees is
# fitted on an 80% subsample of the training rows (subsample=0.8).
# verbose=1 makes fit() print per-iteration progress.
best_gbm = GradientBoostingClassifier(
    n_estimators=350,
    learning_rate=0.05,
    max_depth=15,
    min_samples_split=30,
    min_samples_leaf=45,
    subsample=0.8,
    random_state=354,
    verbose=1,
)

In [22]:
# Display the estimator's repr to review the full hyper-parameter set,
# including the defaults that were not passed explicitly.
best_gbm

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=15,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=45, min_samples_split=30,
              min_weight_fraction_leaf=0.0, n_estimators=350,
              presort='auto', random_state=354, subsample=0.8, verbose=1,
              warm_start=False)

In [23]:
# Train on the training split; because the model was built with verbose=1,
# this prints a per-iteration progress table.
best_gbm.fit(X_train,Y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.9283           0.0715           35.00s
         2           0.8564           0.0605           38.17s
         3           0.7985           0.0518           37.67s
         4           0.7643           0.0415           37.69s
         5           0.7161           0.0394           38.06s
         6           0.6718           0.0350           38.58s
         7           0.6332           0.0351           39.13s
         8           0.6072           0.0291           40.01s
         9           0.5786           0.0270           40.44s
        10           0.5493           0.0256           39.92s
        20           0.3583           0.0124           34.70s
        30           0.2573           0.0073           31.84s
        40           0.1925           0.0041           30.70s
        50           0.1485           0.0020           29.63s
        60           0.1185           0.0016           28.67s
       

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=15,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=45, min_samples_split=30,
              min_weight_fraction_leaf=0.0, n_estimators=350,
              presort='auto', random_state=354, subsample=0.8, verbose=1,
              warm_start=False)

In [24]:
# Mean accuracy on the rows the model was trained on (an optimistic figure;
# compare against the held-out score in the next cell).
best_gbm.score(X_train,Y_train)

1.0

In [25]:
# Mean accuracy on the held-out split. With an 80/20 class imbalance, accuracy
# alone can flatter the model, so also check per-class precision and recall.
best_gbm.score(X_test,Y_test)

0.9791304347826087

In [26]:
# Predicted binary labels for the held-out rows, used for the report below.
pred_quality = best_gbm.predict(X_test)

### Create classification report

In [27]:
from sklearn.metrics import classification_report,confusion_matrix

In [28]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# pre-formatted string, so it is printed rather than displayed as a repr.
report = classification_report(Y_test, pred_quality)
print(report)

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      2746
          1       0.96      0.94      0.95       704

avg / total       0.98      0.98      0.98      3450

