In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn 

In [None]:
# Load the heart-disease table from the Kaggle input directory and preview the first rows.
data = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')
data.head(5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
## No null values found, so we can proceed to the EDA part

 ## **EXPLORATORY DATA ANALYSIS**

# ***Univariate Analysis***

In [None]:
# Number of rows in each class of the target variable.

data['target'].value_counts()


In [None]:
# Percentage of rows in each target class, computed from the data instead of
# hard-coded counts so the figures stay correct if the dataset changes.
# NOTE(review): assumes target == 1 marks heart disease and target == 0 marks
# no heart disease, matching the labels printed below - confirm against the data dictionary.
target_share = data['target'].value_counts(normalize=True) * 100

print("Patients with Heart Disease : ", round(target_share.get(1, 0.0), 2))

print("Patients Not with Heart Disease : ", round(target_share.get(0, 0.0), 2))






In [None]:
# Box plot of trestbps to spot outliers. The series is passed as `x=` explicitly:
# recent seaborn releases treat a lone positional argument as `data`, not `x`,
# which changes the orientation of the plot.
sn.boxplot(x=data.trestbps)

In [None]:
# Distribution of chol: density-normalised histogram with a KDE overlay.
# `distplot` is deprecated in seaborn; `histplot` with these options is the
# documented replacement for its default output.
sn.histplot(data=data, x='chol', kde=True, stat='density')

In [None]:
# Inspect the rows whose trestbps equals 200.
data.loc[data['trestbps'] == 200.00]

In [None]:
# Box plot of thalach to spot outliers. The series is passed as `x=` explicitly:
# recent seaborn releases treat a lone positional argument as `data`, not `x`,
# which changes the orientation of the plot.
sn.boxplot(x=data.thalach)

In [None]:
# Number of rows for each value of the sex column.
data['sex'].value_counts()

> **Imbalance:** The number of males is almost twice the number of females.

# *Bivariate Analysis*

In [None]:
# Grouped bar chart: number of patients per target class for each cp value.
cp_counts = pd.crosstab(data['cp'], data['target'])
ax = cp_counts.plot.bar(figsize=(20, 6))
ax.set_title('CP and No of Patient with heart disease')
ax.set_xlabel('CP')
ax.set_ylabel('Frequency')
plt.show()

> People with chest pain type 0 (cp = 0) are less prone to having heart disease.

In [None]:
# Grouped bar chart: number of patients per target class for each sex value.
sex_counts = pd.crosstab(data['sex'], data['target'])
ax = sex_counts.plot.bar(figsize=(20, 6))
ax.set_title('Sex and No of Patient with heart disease')
ax.set_xlabel('Sex')
ax.set_ylabel('Frequency')
plt.show()

> Women are more likely to have heart disease than men when we compare the ratio of Target(1) to Target(0) within each sex.

In [None]:
# Scatter plot with a fitted regression line: age (x) against trestbps (y).
# Keyword arguments are required: recent seaborn releases accept only `data`
# positionally, so the positional x/y form raises a TypeError.
sn.regplot(x='age', y='trestbps', data=data)

> The above plot shows a positive correlation between Resting BP & Age.  That is, Resting BP increases with Age 

In [None]:
# Mean of target for each ca value (barplot's default estimator is the mean).
sn.barplot(data=data, x='ca', y='target')

In [None]:
# Inspect the rows whose ca value is 4.
data.loc[data['ca'] == 4]

> 'ca' is supposed to lie between 0 and 3, so 4 appears to be a value entered by mistake. Judging by the target average for these rows, the true value seems to be 0. Therefore, replace ca = 4 with ca = 0.

In [None]:
# Recode the out-of-range ca value 4 as 0, writing the result back to the column.
data['ca'] = data['ca'].replace({4: 0})

In [None]:
# Grouped bar chart: number of patients per target class for each ca value.
ca_counts = pd.crosstab(data['ca'], data['target'])
ax = ca_counts.plot.bar(figsize=(20, 6))
ax.set_title('CA and No of Patient with heart disease')
ax.set_xlabel('CA')
ax.set_ylabel('Frequency')
plt.show()

> Insight : People with higher CA are less likely to have a heart disease 

In [None]:
# Grouped bar chart: number of patients per target class for each age.
age_counts = pd.crosstab(data['age'], data['target'])
ax = age_counts.plot.bar(figsize=(20, 6))
ax.set_title('Age and No of Patient with heart disease')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

> People above the age of 40 are more likely to have a heart disease. 

In [None]:
# Scatter plot with a fitted regression line: age (x) against thalach (y).
# Keyword arguments are required: recent seaborn releases accept only `data`
# positionally, so the positional x/y form raises a TypeError.
sn.regplot(x='age', y='thalach', data=data)

> Insight: Heart rate decreases with age.

# Predictive Modelling 

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(15, 6))
sn.heatmap(
    corr_matrix,
    annot=True,       # write the coefficient inside each cell
    fmt=".2f",
    linewidth=0.5,
    cmap="YlGnBu",
    ax=ax,
)

All variables seem significant

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pandas import DataFrame
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn import metrics

In [None]:
# Columns holding category codes rather than measurements; cast them to the
# pandas 'category' dtype in a single astype call.
variables = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'thal', 'ca', 'slope']

data = data.astype(dict.fromkeys(variables, 'category'))

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# One-hot encode: with default arguments get_dummies expands the object/category
# columns into indicator columns and passes the remaining columns through.
# Rebinds `data`, so the pre-encoding frame is no longer available afterwards.
data=pd.get_dummies(data)

In [None]:
# Separate the label column from the feature matrix.
y = data['target']
x = data.drop(columns=['target'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Learn the min/max scaling on the training split only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
feature_scaler = MinMaxScaler().fit(x_train)
x_train = feature_scaler.transform(x_train)
x_test = feature_scaler.transform(x_test)

In [None]:
# Seed the forest so the cross-validated scores and the selected
# hyperparameters are the same on every run.
radm_clf = RandomForestClassifier(random_state=5)

# Search grid. 'auto' was dropped from max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated work) and newer scikit-learn releases
# reject it with an error.
parameters = {
    'n_estimators': [150, 175, 200, 225, 250, 300, 325, 350, 375, 400],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}

# 5-fold cross-validated grid search scored by ROC AUC; n_jobs=-1 runs the
# fits in parallel without affecting the results.
clf = GridSearchCV(radm_clf, parameters, scoring='roc_auc', cv=5, n_jobs=-1)
clf.fit(x_train, y_train)

# Mean cross-validated ROC AUC of the best parameter combination.
clf.best_score_

In [None]:
# Parameter combination that achieved the best mean cross-validated score in the grid search.
clf.best_params_

In [None]:
# Build the final model from the hyperparameters the grid search selected,
# rather than hard-coding them, so it cannot drift out of sync with the search.
# The fixed seed makes the reported test metrics reproducible.
model = RandomForestClassifier(**clf.best_params_, random_state=5)

In [None]:
# Train the final model on the scaled training split.
model.fit(x_train,y_train)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, model.predict(x_test))
print(accuracy)

In [None]:
import scikitplot as skplt

# Predicted labels for the held-out split (also used by the classification report below).
pred = model.predict(x_test)

# Raw confusion-matrix counts (rows = true class, columns = predicted class).
# Previously this name held only the (y_test, pred) tuple, not a matrix.
matrix6 = confusion_matrix(y_test, pred)

# Visual rendering of the same confusion matrix.
skplt.metrics.plot_confusion_matrix(y_test, pred, figsize=(10, 5))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split, printed after the overall accuracy.
cr = classification_report(y_test, pred)
print("Testing Accuracy :", model.score(x_test, y_test))
print(cr)

#  *Accuracy of the model = 91.80%*