In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from matplotlib import colors 
from matplotlib.ticker import PercentFormatter 
import numpy as np 
import matplotlib.pyplot as plt 

import os
# List the files available in the input directory as a quick sanity check.
print(os.listdir("../input"))

# Neither CSV has a header row, so columns get integer labels 0..N.
abnormal = pd.read_csv("../input/ptbdb_abnormal.csv", header=None)
normal = pd.read_csv("../input/ptbdb_normal.csv", header=None)

# Drop column 187 from both frames.
# NOTE(review): presumably this is the class-label column -- confirm against
# the dataset description. Labels are rebuilt later from the file of origin.
abnormal = abnormal.drop(columns=[187])
normal = normal.drop(columns=[187])

# Call head() (the original referenced the bound method without calling it,
# which displays the method repr instead of the first rows).
abnormal.head()

In [None]:
# 2-D histogram of the abnormal rows: position within the window vs. value.
# Only columns 5..69 of every row are used (65 values per row).
flatten_ab_y = abnormal.values[:, 5:70].flatten()

print(flatten_ab_y.shape)

# flatten() is row-major, so the x positions 0..64 repeat once per row.
ab_x = np.tile(np.arange(0, 65), abnormal.shape[0])

plt.hist2d(ab_x, flatten_ab_y, bins=(65, 100), cmap=plt.cm.jet)
# Label the figure so it can be read on its own; note that the x axis is an
# offset into the selected window, not the original column number.
plt.colorbar(label="Count")
plt.xlabel("Offset within columns 5-69 (0 = column 5)")
plt.ylabel("Value")
plt.title("Abnormal rows: value distribution per position")

plt.show()

**From the above histogram color map for PTB data marked as abnormal, you can infer that most of the ECG features are widely distributed in the range 0 - 0.4 and that there is no fixed pattern in this data.**

In [None]:
# Draw columns 5..69 of five individual abnormal rows, one figure per row.
abnormal_rows = abnormal.values
for row_idx in (0, 50, 117, 1111, 100):
    plt.plot(abnormal_rows[row_idx][5:70])
    plt.show()

**From the above, you can see that PTB data marked as abnormal do not have any fixed pattern.**

In [None]:
# 2-D histogram of the normal rows: position within the window vs. value.
# Only columns 5..69 of every row are used (65 values per row).
flatten_norm_y = normal.values[:, 5:70].flatten()

# flatten() is row-major, so the x positions 0..64 repeat once per row.
norm_x = np.tile(np.arange(0, 65), normal.shape[0])

plt.hist2d(norm_x, flatten_norm_y, bins=(65, 100), cmap=plt.cm.jet)
# Label the figure so it can be read on its own; note that the x axis is an
# offset into the selected window, not the original column number.
plt.colorbar(label="Count")
plt.xlabel("Offset within columns 5-69 (0 = column 5)")
plt.ylabel("Value")
plt.title("Normal rows: value distribution per position")
plt.show()

**From the above histogram color map for PTB data marked as normal, you can infer that the graph of ECG features follows a standard bell shape and peaks between features 20-30.**

In [None]:
# Draw columns 5..69 of five individual normal rows, one figure per row.
normal_rows = normal.values
for row_idx in (0, 50, 117, 1111, 100):
    plt.plot(normal_rows[row_idx][5:70])
    plt.show()

**From the above, you can see that PTB data marked as normal have a fixed bell shape that peaks between features 20-30.**

In [None]:
# Label frames: 1.0 for every abnormal row, 0.0 for every normal row.
y_abnormal = pd.DataFrame(np.ones(abnormal.shape[0]))
y_normal = pd.DataFrame(np.zeros(normal.shape[0]))

# Stack features and labels in the same order (abnormal first, then normal)
# so that row i of X matches row i of y.
X = pd.concat([abnormal, normal], sort=True)
y = pd.concat([y_abnormal, y_normal], sort=True)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the per-column dtypes of both frames side by side.
print(abnormal.dtypes, normal.dtypes)

In [None]:
# (rows, columns) of the abnormal frame.
abnormal.shape

In [None]:
# (rows, columns) of the normal frame.
normal.shape

**Check whether any of the features contain null values**

In [None]:
# Per-column NaN counts for the training features; the result is truthy if
# any column has at least one NaN.
np.any(X_train.isna().sum())

In [None]:
# Per-column NaN counts for the test features; the result is truthy if
# any column has at least one NaN.
np.any(X_test.isna().sum())

***Evaluate several ML models against the test data; we find that XGBoost performs the best***

In [None]:
# One seed shared by every estimator that accepts one, so that a fresh
# "Restart & Run All" reproduces the same scores.
seed = 123

# Candidate models to compare on the same train/test split.
classifiers = [
    LogisticRegression(class_weight='balanced', random_state=seed),
    KNeighborsClassifier(3, n_jobs=-1),
    SVC(gamma='auto', class_weight='balanced', random_state=seed),
    RandomForestClassifier(random_state=seed, n_estimators=1000),
    # random_state added: the MLP was the only unseeded stochastic model,
    # which made its score change from run to run.
    MLPClassifier(alpha=1, max_iter=1000, random_state=seed),
    # n_jobs / random_state are the scikit-learn-style parameter names;
    # nthread / seed are the deprecated aliases for the same settings.
    XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
                  colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, scale_pos_weight=1,
                  random_state=seed),
]

# Display names, in the same order as `classifiers`.
names = ["Logistic", "Nearest Neighbors", "RBF SVM", "Random Forest", "Neural Net", "XGB"]

# zip() would silently drop models if the two lists ever got out of step.
assert len(names) == len(classifiers), "names and classifiers must have the same length"

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.utils.validation import column_or_1d

# Flatten the single-column label frame to a 1-D array once, up front.
# The conversion does not depend on the classifier, so it does not belong
# inside the loop.
y_train = column_or_1d(y_train, warn=True)

# Fit every candidate on the training split and report its test accuracy.
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, clf.predict(X_test))
    print(f"{name}: {round(test_accuracy,3)}")

In [None]:
# Re-create and fit the XGBoost model on its own so the following cells can
# inspect it.
# n_jobs / random_state are the scikit-learn-style parameter names;
# nthread / seed are the deprecated aliases for the same settings.
clf = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
                    colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, scale_pos_weight=1,
                    random_state=seed)

clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features with the fitted classifier.
y_pred = clf.predict(X_test)

In [None]:
# Turn the predictions into a single-column 2-D array, then show its shape.
y_pred = y_pred.reshape((len(y_pred), 1))

y_pred.shape

In [None]:
# Recompute the test-set predictions, replacing any earlier y_pred.
y_pred = clf.predict(X_test)

# Score the predictions against the held-out labels.
truth_and_pred = (y_test, y_pred)
precision = precision_score(*truth_and_pred)
recall = recall_score(*truth_and_pred)
accuracy = accuracy_score(*truth_and_pred)
f1 = f1_score(*truth_and_pred)

# Report each metric on its own line.
for metric_label, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("Accuracy:", accuracy),
    ("f1:", f1),
):
    print(metric_label, metric_value)

**Display a feature importance graph. The top 18 features by importance are as follows:**

*4  32 137  50  43 112  29 124 108 113 165  33  84 151  46 149  69  31*

In [None]:
# feature importance
from matplotlib import pyplot as plt

# argsort is ascending, so flip it when printing to list the feature indices
# from most to least important.
feature_imp = np.argsort(clf.feature_importances_)
print(np.flip(feature_imp))

# plot: one bar per feature, in original column order
plt.figure(figsize=(20,8))

plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel("Feature (column) index")
plt.ylabel("Importance")
plt.title("XGBoost feature importances")
plt.show()