In [303]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chbmit/chbmit_preprocessed_data.csv


# ***Reading the Dataset***

In [304]:
df=pd.read_csv("/kaggle/input/chbmit/chbmit_preprocessed_data.csv")
df.shape

(2097150, 24)

# ***Reducing the dataset due to computational problems***

In [305]:
non_epileptic_dataset = df[df['Outcome'] == 0]
epileptic_dataset = df[df['Outcome'] == 1]
sample_size = round((df.shape[0])/100)
sample_size

20972

# ***Sampling of Data***

In [306]:
non_epileptic_dataset_new = non_epileptic_dataset.sample(sample_size,random_state = 7)
epileptic_dataset_new = epileptic_dataset.sample(sample_size,random_state = 7)
non_epileptic_dataset_new.shape
final_dataset = pd.concat([non_epileptic_dataset_new,epileptic_dataset_new])

# non_epileptic_dataset.sample(5,24,287)

# ***Data Cleaning and Preprocessing***

In [307]:
final_dataset.isnull().sum()

# FP1-F7    0
C3-P3       0
C4-P4       0
CZ-PZ       0
F3-C3       0
F4-C4       0
F7-T7       0
F8-T8       0
FP1-F3      0
FP2-F4      0
FP2-F8      0
FT10-T8     0
FT9-FT10    0
FZ-CZ       0
P3-O1       0
P4-O2       0
P7-O1       0
P7-T7       0
P8-O2       0
T7-FT9      0
T7-P7       0
T8-P8-0     0
T8-P8-1     0
Outcome     0
dtype: int64

In [308]:
## Slicing the final dataset for data and target
data = final_dataset.drop(['Outcome'],axis=1)
target = final_dataset['Outcome']

In [309]:
## Splitting the data in Train_Test split
from sklearn import model_selection
x_train,x_test,y_train,y_test = model_selection.train_test_split(data,target)

In [310]:
## Checking for class imbalance
from collections import Counter
Counter(target)

Counter({0.0: 20972, 1.0: 20972})

# ***As there is NO Class Imabalance we are good to go!***

In [311]:
## tranforming values to using standard scalar

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
scaler.transform(x_train)
scaler.transform(x_test)

array([[-0.04226211, -0.08369388, -0.74548827, ..., -0.21916012,
        -0.04862136, -0.04862136],
       [-0.48273516,  2.31879507,  2.46642665, ...,  3.60003242,
         1.13077066,  1.13077066],
       [ 2.62888016,  0.20935232, -0.20933788, ..., -0.15307056,
         0.03298936,  0.03298936],
       ...,
       [ 4.98916526, -1.33767368,  2.7624795 , ..., -0.41435486,
         3.47982163,  3.47982163],
       [ 0.27198628, -0.20457169, -0.15658605, ..., -0.12002578,
        -0.03557702, -0.03557702],
       [ 0.2503566 , -2.77135048,  0.38244172, ..., -5.79816386,
        -0.34228618, -0.34228618]])

In [312]:
## Creating a dataset hybrid_model_dataset for our evaluation of hybrid model later
hybrid_model_dataset = pd.DataFrame()

## Adding target column in our hybrid Dataset

hybrid_model_dataset['Outcome'] = target
hybrid_model_dataset.shape

# creating a score list to plot at last to compare models

accuracy = []

# ***Logistic Regression***

In [331]:
## Applying Logistic Regression Classsifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

lr = LogisticRegression()
lr.fit(x_train,y_train)
lr.score(x_test,y_test)
print(lr.score(x_test,y_test)*100)

## Adding score to accuracy list

accuracy.append(lr.score(x_test,y_test)*100)

# Adding the predicted values of Complete dataset (train + test) in hybrid_dataset

hybrid_model_dataset['logistic_regression'] = lr.predict(data)
hybrid_model_dataset.shape

49.675758153728786


(41944, 7)

# ***Decision Tree***

In [332]:
## Applying Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
print(dt.score(x_test,y_test)*100)

## Adding score to accuracy list

accuracy.append(dt.score(x_test,y_test)*100)

# Adding the predicted values of Complete dataset (train + test) in hybrid_dataset

hybrid_model_dataset['Decision TREE'] = dt.predict(data)
hybrid_model_dataset.shape

72.95441541102421


(41944, 7)

# ***Random Forest***

In [333]:
## Applying RandomForest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# param_grid = {
#     "n_estimators": [100, 200],
#     "max_depth": [3, 4, 5],
#     "min_samples_split": [2, 3],
#     "min_samples_leaf": [1, 2],
# }
# grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
# grid_search.fit(x_train, y_train)
# print(grid_search.best_params_)
#**grid_search.best_params_
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
rf.score(x_test,y_test)
print(rf.score(x_test,y_test)*100)

## Adding score to accuracy list

accuracy.append(rf.score(x_test,y_test)*100)

# Adding the predicted values of Complete dataset (train + test) in hybrid_dataset

hybrid_model_dataset['Random_forest'] = rf.predict(data)
hybrid_model_dataset.shape

82.44325767690253


(41944, 7)

# ***Support Vector Machine***

In [334]:
## Applying Support Vector Machine Classifier
from sklearn import svm
# param_grid = {
#     "C": [1e2, 1e3, 1e4, 4e5, 1e5],
#     "gamma": [1e-3, 5e-4, 1e-4,5e-3]
# }
# grid_search = GridSearchCV(svm.SVC(), param_grid, cv=3)
# grid_search.fit(x_train, y_train)
# print(grid_search.best_params_)
sv = svm.SVC()
sv.fit(x_train,y_train)
sv.score(x_test,y_test)
print(sv.score(x_test,y_test)*100)

## adding score to accuracy list

accuracy.append(sv.score(x_test,y_test)*100)

## adding the predictions to hybrid dataset

hybrid_model_dataset['SVM'] = sv.predict(data)
hybrid_model_dataset.shape

81.84245660881176


(41944, 7)

# ***KNN***

In [337]:
## Applying K Nearest Neighbour Classifier
from sklearn.neighbors import KNeighborsClassifier
# param_grid = {
#     "n_neighbors": [1, 3, 5, 7, 9],
#    "metric": ["euclidean", "manhattan", "minkowski"],
# }
# grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3)
# grid_search.fit(x_train, y_train)
# print(grid_search.best_params_)
knn = KNeighborsClassifier()
knn.fit(x_train,y_train)
knn.score(x_test,y_test)
print(knn.score(x_test,y_test)*100)

## adding score to accuracy list
accuracy.append(knn.score(x_test,y_test)*100)

## adding the predictions to hybrid dataset

hybrid_model_dataset['KNN'] = knn.predict(data)
hybrid_model_dataset.shape

77.96109097844746


(41944, 7)

# ***XG Booost***

In [338]:
## Applying Extreme Gradient Boost Classifier
import xgboost
from sklearn.metrics import roc_auc_score
model_xgboost = xgboost.XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='auc',
                                      verbosity=1)

eval_set = [(x_test, y_test)]

model_xgboost.fit(x_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_set=eval_set,
                  verbose=True)
y_train_pred = model_xgboost.predict_proba(x_train)[:,1]
y_test_pred = model_xgboost.predict_proba(x_test)[:,1]

print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_test, y_test_pred)))
## adding score to accuracy list
accuracy.append(dt.score(x_test,y_test)*100)

## adding the predictions to hybrid dataset

hybrid_model_dataset['XGBoost'] = model_xgboost.predict(data)
hybrid_model_dataset.shape

[0]	validation_0-auc:0.78227
[1]	validation_0-auc:0.83171
[2]	validation_0-auc:0.84353
[3]	validation_0-auc:0.84963
[4]	validation_0-auc:0.85772
[5]	validation_0-auc:0.86397




[6]	validation_0-auc:0.86922
[7]	validation_0-auc:0.87281
[8]	validation_0-auc:0.87330
[9]	validation_0-auc:0.87411
[10]	validation_0-auc:0.87435
[11]	validation_0-auc:0.87547
[12]	validation_0-auc:0.87638
[13]	validation_0-auc:0.87688
[14]	validation_0-auc:0.87767
[15]	validation_0-auc:0.87826
[16]	validation_0-auc:0.87833
[17]	validation_0-auc:0.87907
[18]	validation_0-auc:0.87988
[19]	validation_0-auc:0.88051
[20]	validation_0-auc:0.88059
[21]	validation_0-auc:0.88142
[22]	validation_0-auc:0.88210
[23]	validation_0-auc:0.88236
[24]	validation_0-auc:0.88269
[25]	validation_0-auc:0.88299
[26]	validation_0-auc:0.88329
[27]	validation_0-auc:0.88340
[28]	validation_0-auc:0.88377
[29]	validation_0-auc:0.88392
[30]	validation_0-auc:0.88411
[31]	validation_0-auc:0.88410
[32]	validation_0-auc:0.88434
[33]	validation_0-auc:0.88451
[34]	validation_0-auc:0.88477
[35]	validation_0-auc:0.88489
[36]	validation_0-auc:0.88503
[37]	validation_0-auc:0.88543
[38]	validation_0-auc:0.88555
[39]	validatio

(41944, 7)

# ***Analysing the data set for hybrid model***

In [339]:
## Checking the hybrid_model_dataset
hybrid_model_dataset.head()

Unnamed: 0,Outcome,logistic_regression,Decision TREE,Random_forest,SVM,KNN,XGBoost
497803,0.0,1.0,0.0,0.0,0.0,1.0,0
522706,0.0,1.0,0.0,0.0,1.0,0.0,1
704218,0.0,1.0,0.0,0.0,0.0,1.0,0
857273,0.0,1.0,0.0,1.0,1.0,0.0,1
823617,0.0,1.0,0.0,0.0,0.0,0.0,0


In [340]:
## slicing hybrid_model_dataset for train_test split

X = hybrid_model_dataset.drop(['Outcome'],axis=1)
Y = hybrid_model_dataset['Outcome']

## Performing the train test split

from sklearn import model_selection
xx_train,xx_test,yy_train,yy_test = model_selection.train_test_split(X,Y)

# ***Training the Hybrid Model***

In [341]:
## Applying XG Boost Classifier to our Hybrid Model

from sklearn.metrics import roc_auc_score
hybrid_model_xgboost = xgboost.XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='auc',
                                      verbosity=1)

eval_set = [(xx_test, yy_test)]

hybrid_model_xgboost.fit(xx_train,
                  yy_train,
                  early_stopping_rounds=10,
                  eval_set=eval_set,
                  verbose=True)
yy_train_pred = hybrid_model_xgboost.predict_proba(xx_train)[:,1]
yy_test_pred = hybrid_model_xgboost.predict_proba(xx_test)[:,1]

print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(yy_train, yy_train_pred),
                                                    roc_auc_score(yy_test, yy_test_pred)))
## adding score to accuracy list
accuracy.append(roc_auc_score(yy_test, yy_test_pred)*100)


[0]	validation_0-auc:0.96716
[1]	validation_0-auc:0.97471
[2]	validation_0-auc:0.97520
[3]	validation_0-auc:0.97549
[4]	validation_0-auc:0.97516
[5]	validation_0-auc:0.97504
[6]	validation_0-auc:0.97542
[7]	validation_0-auc:0.97542
[8]	validation_0-auc:0.97542
[9]	validation_0-auc:0.97542
[10]	validation_0-auc:0.97543
[11]	validation_0-auc:0.97543
[12]	validation_0-auc:0.97560
[13]	validation_0-auc:0.97558
[14]	validation_0-auc:0.97539




[15]	validation_0-auc:0.97529
[16]	validation_0-auc:0.97535
[17]	validation_0-auc:0.97539
[18]	validation_0-auc:0.97541
[19]	validation_0-auc:0.97541
[20]	validation_0-auc:0.97541
[21]	validation_0-auc:0.97541
AUC Train: 0.9759
AUC Valid: 0.9756


# ***Printing confusion matrix and Classification report***

In [342]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = lr.predict(x_test)

print('CONFUSION MATRIX')
print(confusion_matrix(y_test, y_pred))
print('-------------------------------------------------------------------------')

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Print the classification report
print('CLASSIFICATION REPORT')

print(report)


CONFUSION MATRIX
[[   0 5277]
 [   0 5209]]
-------------------------------------------------------------------------
CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74      5314
         1.0       0.73      0.73      0.73      5172

    accuracy                           0.73     10486
   macro avg       0.73      0.73      0.73     10486
weighted avg       0.73      0.73      0.73     10486



In [323]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

y_pred = dt.predict(x_test)

print('CONFUSION MATRIX')
print(confusion_matrix(y_test, y_pred))
print('-------------------------------------------------------------------------')

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Print the classification report
print('CLASSIFICATION REPORT')

print(report)



CONFUSION MATRIX
[[3901 1376]
 [1404 3805]]
-------------------------------------------------------------------------
CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74      5314
         1.0       0.73      0.73      0.73      5172

    accuracy                           0.73     10486
   macro avg       0.73      0.73      0.73     10486
weighted avg       0.73      0.73      0.73     10486



In [324]:
y_pred = rf.predict(x_test)
print('CONFUSION MATRIX')

print(confusion_matrix(y_test, y_pred))
print('-------------------------------------------------------------------------')

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Print the classification report
print('CLASSIFICATION REPORT')

print(report)



CONFUSION MATRIX
[[4255 1022]
 [ 868 4341]]
-------------------------------------------------------------------------
CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74      5314
         1.0       0.73      0.73      0.73      5172

    accuracy                           0.73     10486
   macro avg       0.73      0.73      0.73     10486
weighted avg       0.73      0.73      0.73     10486



In [325]:
y_pred = sv.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print('-------------------------------------------------------------------------')

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Print the classification report
print('CLASSIFICATION REPORT')

print(report)



[[4642  635]
 [1269 3940]]
-------------------------------------------------------------------------
CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74      5314
         1.0       0.73      0.73      0.73      5172

    accuracy                           0.73     10486
   macro avg       0.73      0.73      0.73     10486
weighted avg       0.73      0.73      0.73     10486



In [326]:
y_pred = knn.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print('-------------------------------------------------------------------------')

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Print the classification report
print('CLASSIFICATION REPORT')

print(report)



[[4849  428]
 [1883 3326]]
-------------------------------------------------------------------------
CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74      5314
         1.0       0.73      0.73      0.73      5172

    accuracy                           0.73     10486
   macro avg       0.73      0.73      0.73     10486
weighted avg       0.73      0.73      0.73     10486



In [327]:
y_pred = model_xgboost.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print('-------------------------------------------------------------------------')

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Print the classification report
print('CLASSIFICATION REPORT')

print(report)



[[4465  812]
 [1053 4156]]
-------------------------------------------------------------------------
CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74      5314
         1.0       0.73      0.73      0.73      5172

    accuracy                           0.73     10486
   macro avg       0.73      0.73      0.73     10486
weighted avg       0.73      0.73      0.73     10486



In [328]:
y_pred = hybrid_model_xgboost.predict(xx_test)
print(confusion_matrix(y_test, y_pred))
print('-------------------------------------------------------------------------')

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Print the classification report
print('CLASSIFICATION REPORT')

print(report)



[[2628 2649]
 [2536 2673]]
-------------------------------------------------------------------------
CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74      5314
         1.0       0.73      0.73      0.73      5172

    accuracy                           0.73     10486
   macro avg       0.73      0.73      0.73     10486
weighted avg       0.73      0.73      0.73     10486



# ***Plotting to compare our different models***

In [343]:
accuracy

[49.675758153728786,
 73.4884608048827,
 81.97596795727637,
 81.84245660881176,
 77.96109097844746,
 73.4884608048827,
 97.55323196620526,
 49.675758153728786,
 72.95441541102421,
 82.44325767690253,
 81.84245660881176,
 77.96109097844746,
 77.96109097844746,
 77.96109097844746,
 72.95441541102421,
 97.56003962502473]

In [330]:
accuracy
X_values = list(X)
Y_values = accuracy
Y_axis
plt.bar(X_axis,Y_axis,color = 'brown',width = 0.5)
plt.xlabel('MODELS USED')
plt.ylabel('ACCURACY ACHEIVED')
plt.title("COMPARING OUR MODELS")
plt.legend()
for i, (x, y) in enumerate(zip(X_values, Y_values)):
    plt.text(x, y, str(y), ha='center', va='bottom', color='blue',fontsize = 7)


plt.show

<function matplotlib.pyplot.show(close=None, block=None)>

Error in callback <function install_repl_displayhook.<locals>.post_execute at 0x7a9c04ab9200> (for post_execute):


ConversionError: Failed to convert value(s) to axis units: 'logistic_regression'

ConversionError: Failed to convert value(s) to axis units: 'logistic_regression'

<Figure size 800x550 with 1 Axes>