In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sklearn
# import pandas_profiling
# import sweetviz as sv
# import dtale as dtale
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Load the heart dataset from the CSV shipped in ./Dataset.
df_heart_original = pd.read_csv(filepath_or_buffer='./Dataset/heart.csv')

In [None]:
# Load the oxygen-saturation CSV from ./Dataset.
# NOTE(review): this frame is not referenced by any of the cells shown here.
df_o2_saturation = pd.read_csv(filepath_or_buffer='./Dataset/o2Saturation.csv')

In [None]:
## Description
# Age - Age of the person
# Sex - Gender of the person
# cp - Chest Pain type 
# trtbps - Resting Blood pressure in mm Hg
# chol - Cholesterol in mg/dl fetched via BMI sensor
# fbs - Fasting blood sugar > 120 (in mg/dl)(1 - true ; 0 - false)
# restecg - resting electrocardiographic results
# thalachh - Maximum heart rate achieved
# exng - exercise induced angina (1 - true ; 0 - false)
# oldpeak - previous peak
# slp - Slope
# caa - number of major vessels
# thall - Thal rate
# Output - Target variable

In [None]:
# ID column creation is currently disabled:
# df_heart_original = df_heart_original.assign(id=[*range(0,len(df_heart_original))])
# Replace the raw CSV headers with descriptive names. The assignment is
# positional, so the list must hold exactly one name per column, in file order.
df_heart_original.columns = [
    'age', 'sex', 'chest_pain', 'bp', 'chol', 'fbs', 'rest_ecg',
    'max_heart_rate', 'exer_ang', 'old_peak', 'slope', 'no_vessels',
    'thal_rate', 'output',
]

In [None]:
# Snapshot the current column names as a plain list. The steps that would move
# an 'id' column to the front are disabled (no 'id' column is created above).
cols = df_heart_original.columns.tolist()
# cols.pop(cols.index('id'))
# df_heart_original = df_heart_original[['id']+cols] 

In [None]:
# Preview the first five rows after renaming.
df_heart_original.head(n=5)

# EDA

In [None]:
# Pandas Profiling
# profile = pandas_profiling.ProfileReport(train_data)
# profile

In [None]:
# Sweetviz
# my_report = sv.analyze(train_data)
# my_report.show_html()

In [None]:
# D-tale
# d_tale_op = dtale.show(train_data)
# d_tale_op.open_browser()

# Bucketing

In [None]:
# Bucket cholesterol into ordinal levels. pd.cut intervals are right-inclusive
# by default, so the edges below read as:
#   1 - Normal       (0, 149]
#   2 - Mildly High  (149, 199]
#   3 - High         (199, 499]
#   4 - Very_High    (499, 1000]
df_heart_original['chol_level'] = pd.cut(
    x=df_heart_original['chol'],
    bins=[0, 149, 199, 499, 1000],
    labels=[1, 2, 3, 4],
)
# Convert the categorical codes to plain integers so the column can be used
# as a numeric feature. This raises if any value fell outside the bins.
df_heart_original['chol_level'] = df_heart_original['chol_level'].astype('int64')

In [None]:
# Bucket resting blood pressure into ordinal levels (right-inclusive edges):
#   1 - Normal    (0, 119]
#   2 - Elevated  (119, 129]
#   3 - High      (129, 1000]
df_heart_original['bp_level'] = pd.cut(
    x=df_heart_original['bp'],
    bins=[0, 119, 129, 1000],
    labels=[1, 2, 3],
)
# Categorical -> integer so the level can be fed to the models as a number.
df_heart_original['bp_level'] = df_heart_original['bp_level'].astype('int64')

In [None]:
def heart_rate_ok(age, max_heart_rate):
    """Flag whether a maximum heart rate is below the age-based limit.

    Args:
        age: Age used to compute the limit ``220 - age``.
        max_heart_rate: Maximum heart rate to compare against the limit.

    Returns:
        1 if ``max_heart_rate`` is strictly below ``220 - age``, otherwise 0.
    """
    return 1 if max_heart_rate < 220 - age else 0

def map_impl(df):
    """Compute the max-heart-rate flag for every row of ``df``.

    Applies the same rule as ``heart_rate_ok`` column-wise: a row gets 1 when
    its ``max_heart_rate`` is strictly below ``220 - age`` and 0 otherwise.

    The frame passed in is the one that is read (the previous version ignored
    the argument and always used the global ``df_heart_original``), and the
    result keeps ``df``'s index so assigning it back as a column aligns row
    for row even when the index is not a default 0..n-1 range.

    Args:
        df: DataFrame with ``age`` and ``max_heart_rate`` columns.

    Returns:
        An int64 Series of 0/1 flags indexed like ``df``.
    """
    below_limit = df['max_heart_rate'] < (220 - df['age'])
    return below_limit.astype('int64')

In [None]:
# Derive the 0/1 max-heart-rate flag for every row and store it as a feature.
heart_rate_flags = map_impl(df_heart_original)
df_heart_original['max_heart_rate_ok'] = heart_rate_flags

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# every metric computed from it further down, is the same on each
# Restart & Run All instead of changing from run to run.
train_data, test_data = train_test_split(
    df_heart_original,
    test_size=0.20,
    random_state=0,
)

# 'output' is the target; every other column (raw and derived) is a candidate
# feature at this point. Feature selection narrows this down afterwards.
Y_col = ['output']
X_col = [i for i in df_heart_original if i not in Y_col]

X_train = train_data[X_col]
Y_train = train_data[Y_col]

X_test = test_data[X_col]
Y_test = test_data[Y_col]

## Feature Selection 

In [None]:
# Recursive feature elimination (RFE): the logistic regression is refit
# repeatedly and features are pruned until five remain. Note that this is
# backward elimination, not forward selection.
logreg = LogisticRegression(solver='lbfgs', max_iter=100000)
rfe = RFE(estimator=logreg, n_features_to_select=5).fit(
    X_train,
    Y_train.values.ravel(),  # flatten the single-column frame to a 1-D target
)
# print(rfe.support_)
# print(rfe.ranking_)

# rfe.support_ is a boolean mask over the columns; apply it to both splits so
# train and test keep exactly the same selected features.
X_train = X_train.loc[:, rfe.support_]
X_test = X_test.loc[:, rfe.support_]
print(X_train.columns)

## Modelling

In [None]:
# Model statistics: fit a statsmodels Logit on the selected features to get
# coefficient estimates and their significance.
# sm.Logit does not add an intercept on its own, whereas the scikit-learn
# model trained below does. A constant column is therefore added explicitly so
# the summary describes the same kind of model. X_train itself is left
# untouched for the scikit-learn cells.
# (statsmodels is already imported as `sm` in the imports cell.)
X_train_const = sm.add_constant(X_train)
logit_model = sm.Logit(Y_train, X_train_const)
result = logit_model.fit()
print(result.summary2())

In [None]:
# Train the final classifier on the selected features.
# max_iter matches the estimator used during feature selection; the library
# default of 100 iterations can stop before the optimiser has converged, which
# would leave the final model fitted less tightly than the one that chose
# the features.
logistic_reg_model = LogisticRegression(max_iter=100000, random_state=0)
logistic_reg_model.fit(X_train, Y_train.values.ravel())

# Predicted class labels for the held-out rows (displayed as the cell output).
Y_pred = logistic_reg_model.predict(X_test)
Y_pred

In [None]:
# Score of the trained classifier on the held-out test split.
logistic_reg_model.score(X=X_test, y=Y_test)

In [None]:
# Confusion matrix with the positive class (1) listed first.
# The matrix is a bare array, so without explicit tick labels the heatmap
# would print positions 0 and 1 on the axes, which is the reverse of the
# [1, 0] class order used here. Pass the labels through so each tick shows
# the class it really stands for.
cm_labels = [1, 0]
ax = sns.heatmap(
    confusion_matrix(Y_test, Y_pred, labels=cm_labels),
    annot=True,
    fmt="d",
    xticklabels=cm_labels,
    yticklabels=cm_labels,
)
# confusion_matrix puts true classes on rows and predictions on columns.
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Per-class precision / recall / F1 for the test predictions, with the two
# classes displayed under the names '0' and '1'.
print(
    classification_report(
        y_true=Y_test,
        y_pred=Y_pred,
        target_names=['0', '1'],
    )
)