In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier

from sklearn.metrics import SCORERS
from sklearn.metrics import plot_confusion_matrix,plot_roc_curve,classification_report,accuracy_score,confusion_matrix

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-disease-dataset-uci/HeartDiseaseTrain-Test.csv',
)

## Feature Separation

In [None]:
# Column names treated as categorical in the EDA plots below.
# (Spellings are kept exactly as they appear in this notebook.)
Categorical_features = [
    'sex',
    'chest_pain_type',
    'fasting_blood_sugar',
    'rest_ecg',
    'exercise_induced_angina',
    'slope',
    'vessels_colored_by_flourosopy',
    'thalassemia',
]

# Column names treated as numerical in the EDA plots below.
Numerical_features = [
    'age',
    'resting_blood_pressure',
    'cholestoral',
    'Max_heart_rate',
    'oldpeak',
]

In [None]:
# Print a concise summary of the loaded DataFrame via DataFrame.info().
df.info()

## EDA

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Check whether the dataset is balanced: plot the number of rows per target class.
sns.countplot(x='target', data=df)
# Conclusion (author's reading of the plot): the classes are balanced.

In [None]:
# Check whether age is normally distributed.
# This cell gives the visual check (histogram with a KDE overlay); the
# hypothesis test itself is run in a following cell.

sns.histplot(df, x='age', kde=True)

In [None]:
# Shapiro test on age: a p-value above 0.05 is reported as normally
# distributed, anything else as not normal.
stat, Pvalue = stats.shapiro(df['age'])
print("Normally distributed" if Pvalue > 0.05 else "Not a normal distribution")

In [None]:
# Q-Q plot of age against the normal distribution, as a second normality check.
stats.probplot(df['age'], plot=plt, dist="norm")
plt.show()

In [None]:
# Chi-square test on the sex x target contingency table, to see whether
# sex has an effect on the target.
contigency_data = pd.crosstab(index=df['sex'], columns=df['target'])

stat, pvalue, dof, exp = stats.chi2_contingency(contigency_data)

print('stat=%.3f, p=%.3f' % (stat, pvalue))
# p above 0.05 is reported as "no effect"; anything else as "effect".
verdict = (
    'Same distribution no effect of sex on heart disease'
    if pvalue > 0.05
    else 'There is a effect of sex on heart disease'
)
print(verdict)
    

In [None]:
# Resting blood pressure against age, coloured by target class.
sns.scatterplot(x='age', y='resting_blood_pressure', hue='target', data=df)

In [None]:
# One count plot per categorical feature, split by target class.
for feature in Categorical_features:
    sns.countplot(x=feature, hue='target', data=df)
    plt.show()

# Author's observations from the plots above:
# - Sex: females had more heart disease than males.
# - Chest pain: no pain mostly means no heart disease, but some such cases still had it.
# - Fasting blood sugar: both groups show about the same amount of heart disease.
# - Rest ECG: a value of 1 means heart disease is more likely.
# - Angina: those who had angina without exercise were more likely to have heart disease.
# - Downslope: highly likely to have heart disease.
# - 0 coloured vessels: highly likely to have heart disease.

In [None]:
# One box plot per numerical feature, grouped by target class.
for feature in Numerical_features:
    sns.catplot(kind='box', x='target', y=feature, data=df)

# Model Training and testing with Train-Test-Validation split

In [None]:
# One-hot encode the DataFrame, dropping the first level of each encoded
# column (drop_first=True).
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Separate the label column from the feature matrix.
y = df['target']
X = df.drop(columns='target')

# First split: 70% train, 30% held out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)
# Second split: halve the held-out 30% into a test part and a validation part.
X_test, X_validate, y_test, y_validate = train_test_split(
    X_test, y_test, test_size=0.50, random_state=42
)



In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to all three splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled, X_validate_scaled = (
    scaler.transform(split) for split in (X_train, X_test, X_validate)
)

In [None]:
def trainModel(model,X_train,X_test,y_train,y_test):
    """Fit ``model`` on the training data and report how it does on the test data.

    Prints the classification report for the test predictions and draws the
    confusion matrix and the ROC curve for the fitted model.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; fitted in place.
    X_train, y_train : features and labels the model is fitted on.
    X_test, y_test : features and labels the model is evaluated on.

    Returns
    -------
    None
    """
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    print(classification_report(y_test,y_pred))
    # The plot helpers are called for the figures they draw. Printing their
    # return value only added the display object's repr to the cell output,
    # so the print() wrappers are dropped.
    # NOTE: plot_confusion_matrix / plot_roc_curve were removed in
    # scikit-learn 1.2; newer versions provide
    # ConfusionMatrixDisplay.from_estimator / RocCurveDisplay.from_estimator.
    plot_confusion_matrix(model,X_test,y_test)
    plot_roc_curve(model,X_test,y_test)
    
def trainModelGrid(model,X_train,X_test,y_train,y_test):
    """Fit a search object on the training data and report the result on the test data.

    Prints the selected parameters (``model.best_params_``) and the
    classification report for the test predictions, then draws the
    confusion matrix.

    Parameters
    ----------
    model : search estimator that exposes ``best_params_`` after ``fit``
        (for example a ``GridSearchCV`` instance); fitted in place.
    X_train, y_train : features and labels the search is fitted on.
    X_test, y_test : features and labels the fitted search is evaluated on.

    Returns
    -------
    None
    """
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(model.best_params_)
    print(classification_report(y_test,y_pred))
    # Called for the figure it draws. Printing its return value only added
    # the display object's repr to the output, so print() is dropped.
    # NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; newer
    # versions provide ConfusionMatrixDisplay.from_estimator.
    plot_confusion_matrix(model,X_test,y_test)

In [None]:
# Baseline: logistic regression with default settings on the scaled features.
model = LogisticRegression()
trainModel(
    model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Extra positive predictions are more acceptable here than false negatives,
# so the decision threshold is lowered to 0.4.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Probability of the second class for every test row.
prob = model.predict_proba(X_test_scaled)[:, 1]
# Custom threshold: predict 1 when the probability is at least 0.4, else 0.
y_pred = (prob >= 0.4).astype(int)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Support vector classifier with default settings on the scaled features.
model = SVC()
trainModel(
    model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Grid search over SVC hyper-parameters. It is disabled by wrapping it in a
# string literal, so nothing below is executed.
# According to the author, running it selected C=50; that value is used in
# the next cell.
# NOTE(review): per the scikit-learn SVC docs, 'degree' is only used by the
# 'poly' kernel, which is not in this grid, so the two 'degree' values would
# only duplicate fits if the search is re-enabled.
"""model = SVC()

param = {'C':[1.0,2,5,10,50,100],
    'kernel': ['rbf','linear'],
    'degree':[3,4],
    'gamma':['scale','auto']}

gridModel = GridSearchCV(model,param_grid=param,scoring='accuracy',cv=5,verbose=2)
trainModelGrid(gridModel,X_train_scaled,X_test_scaled,y_train,y_test)"""

In [None]:
# SVC with C=50 and probability estimates enabled so a custom threshold can
# be applied.
# With probability=True the probability estimates depend on randomly shuffled
# data, so random_state is fixed to make the thresholded metrics reproducible
# from run to run.
model = SVC(C=50,probability=True,random_state=42)
model.fit(X_train_scaled,y_train)
# Probability of the second class for every test row.
prob = model.predict_proba(X_test_scaled)[:,1]
y_pred = np.where(prob >=0.4,1,0) # Custom threshold: predict 1 at probability >= 0.4
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

In [None]:
# Nearest-neighbour classifier using a single neighbour.
model = KNeighborsClassifier(n_neighbors=1)
trainModel(
    model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Random forest with default settings. The forest is built with randomness,
# so random_state is fixed to make the reported metrics reproducible from run
# to run (the data splits in this notebook are already seeded).
model = RandomForestClassifier(random_state=42)
trainModel(model,X_train_scaled,X_test_scaled,y_train,y_test)

In [None]:
# AdaBoost with default settings. random_state is fixed so the reported
# metrics are reproducible from run to run (the data splits in this notebook
# are already seeded).
model = AdaBoostClassifier(random_state=42)
trainModel(model,X_train_scaled,X_test_scaled,y_train,y_test)

In [None]:
# Gradient boosting with default settings. random_state is fixed so the
# reported metrics are reproducible from run to run (the data splits in this
# notebook are already seeded).
model = GradientBoostingClassifier(random_state=42)
trainModel(model,X_train_scaled,X_test_scaled,y_train,y_test)

In [None]:
# Gradient boosting with 130 estimators and learning rate 1, evaluated with a
# lowered decision threshold.
# random_state is fixed so the thresholded metrics are reproducible from run
# to run.
model = GradientBoostingClassifier(n_estimators=130,learning_rate=1,random_state=42)
model.fit(X_train_scaled,y_train)
# Probability of the second class for every test row.
prob = model.predict_proba(X_test_scaled)[:,1]
y_pred = np.where(prob >=0.3,1,0) # Custom threshold: predict 1 at probability >= 0.3
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

In [None]:
# Final evaluation: no further tuning is done here. The model is scored once
# on the validation split, and that score is taken as the final accuracy.
# random_state is fixed so this final figure is reproducible; without it the
# forest, and therefore the reported score, changes on every run.

model = RandomForestClassifier(n_estimators=120,random_state=42)
model.fit(X_train_scaled,y_train)
# Probability of the second class for every validation row.
prob = model.predict_proba(X_validate_scaled)[:,1]

y_pred = np.where(prob >=0.4,1,0) # Custom threshold: predict 1 at probability >= 0.4

print(classification_report(y_validate,y_pred))
print(confusion_matrix(y_validate,y_pred))