In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier,ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC 
import xgboost as xgb
from scipy import stats

Getting the Dataset 

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame.
patient_data = pd.read_csv(
    filepath_or_buffer='../input/heart-disease-uci/heart.csv',
)

**Print first few rows of the Data**

In [None]:
# Preview the first five rows (the pandas default) of the raw data.
patient_data.head(n=5)

**Print last few rows of the Data**

In [None]:
# Preview the last five rows (the pandas default) of the raw data.
patient_data.tail(n=5)

**Getting information about the Data**

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
patient_data.info()

**Checking for any missing values**

In [None]:
# Count missing entries per column (isna is the canonical name for isnull).
missing_per_column = patient_data.isna().sum()
missing_per_column

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns.
patient_data.describe()

In [None]:
# Replace the file's original headers with descriptive names.
# The assignment is positional, so this list must follow the CSV's column
# order exactly and contain one entry per column.
# NOTE(review): confirm that each descriptive name matches the meaning of the
# source column it replaces (in particular 'calcium_heart_score').
readable_column_names = [
    'age',
    'sex',
    'pain_type',
    'blood_pressure_value',
    'cholesterol',
    'blood_sugar_value',
    'rest_ecg',
    'max_heart_rate',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'calcium_heart_score',
    'thalassemia_value',
    'target',
]
patient_data.columns = readable_column_names

In [None]:
# Check the renamed columns on the first five rows.
patient_data.head(n=5)

In [None]:

# Frequency of each chest-pain category, most common first.
patient_data.pain_type.value_counts()

In [None]:
# Frequency of each ST-slope category, most common first.
patient_data.st_slope.value_counts()


In [None]:
# Remove every row whose st_slope is 0, then re-check the category counts.
# NOTE(review): this treats st_slope == 0 as an invalid value; in some
# encodings of this dataset 0 is a legitimate slope category — confirm before
# discarding these rows.
zero_slope_index = patient_data.index[patient_data['st_slope'] == 0]
patient_data.drop(index=zero_slope_index, inplace=True)
patient_data['st_slope'].value_counts()

**EDA**

In [None]:
# (rows, columns) of the data after the st_slope filter above.
patient_data.shape

In [None]:
# Summary statistics restricted to numeric columns
# ('number' is the string alias pandas accepts for np.number).
numeric_summary = patient_data.describe(include=['number'])
numeric_summary

**Distribution of Age and Gender**

In [None]:
# Side-by-side overview: gender share (pie, left) and age distribution (right).
fig, (ax_gender, ax_age) = plt.subplots(1, 2, figsize=(18, 6))

# Label the wedges from the actual category codes rather than relying on the
# frequency order that value_counts() happens to return.
# NOTE(review): assumes sex is coded 1 = male, 0 = female — confirm against the
# dataset description. Unknown codes keep their raw value as the label.
gender_counts = patient_data["sex"].value_counts().rename(index={1: "Male", 0: "Female"})
gender_counts.plot.pie(
    ax=ax_gender,
    autopct="%1.0f%%",
    colors=sns.color_palette("prism", 4),
    startangle=60,
    wedgeprops={"linewidth": 2, "edgecolor": "k"},
    explode=[.1] * len(gender_counts),
    shadow=True,
)
ax_gender.set_title("Distribution of Gender")

# sns.displot is a figure-level function: it opens its own figure and ignores
# the current subplot, leaving the right-hand panel empty. The axes-level
# histplot/rugplot pair draws the same histogram + rug into the given axes.
sns.histplot(data=patient_data, x="age", ax=ax_age)
sns.rugplot(data=patient_data, x="age", ax=ax_age)
ax_age.set_title("Age wise distribution")
plt.show()

In [None]:
# Split the patients by outcome; both frames are reused by later cells.
attr_1 = patient_data[patient_data['target'] == 1]
attr_0 = patient_data[patient_data['target'] == 0]

# One figure per outcome group: age distribution (left), gender counts (right).
# The group captions follow the notebook's own labelling of target 0 / 1.
for group_frame, group_caption in (
    (attr_0, 'NORMAL PATIENTS'),
    (attr_1, 'HEART DISEASE PATIENTS'),
):
    fig, (ax_age, ax_sex) = plt.subplots(1, 2, figsize=(20, 10))

    # histplot(kde=True, stat='density') is the documented replacement for the
    # deprecated sns.distplot (density histogram with a KDE overlay).
    sns.histplot(data=group_frame, x='age', kde=True, stat='density', ax=ax_age)
    ax_age.set_title('AGE DISTRIBUTION OF ' + group_caption, fontsize=20, weight='bold')

    # Pass the column via x=/data=: the first positional argument of countplot
    # is `data` in current seaborn, so a bare Series is not read as `x`.
    sns.countplot(data=group_frame, x='sex', palette='viridis', ax=ax_sex)
    ax_sex.set_title('GENDER DISTRIBUTION OF ' + group_caption, fontsize=20, weight='bold')

    plt.show()

**Chest pain type**

In [None]:
# Chest-pain category counts: target == 0 patients (left) vs target == 1 (right).
fig, (ax_normal, ax_disease) = plt.subplots(1, 2, figsize=(20, 10))

# Use the same category order in both panels so the bars are directly
# comparable even when one group lacks a category.
pain_type_order = sorted(patient_data['pain_type'].dropna().unique())

# The column is passed via x=/data=: countplot's first positional argument is
# `data` in current seaborn, so a bare Series is not interpreted as `x`.
sns.countplot(data=attr_0, x='pain_type', order=pain_type_order, ax=ax_normal)
ax_normal.set_title('CHEST PAIN OF NORMAL PATIENTS', fontsize=20, weight='bold')

sns.countplot(data=attr_1, x='pain_type', order=pain_type_order, palette='Accent', ax=ax_disease)
ax_disease.set_title('CHEST PAIN OF HEART PATIENTS', fontsize=20, weight='bold')
plt.show()

**Distribution of ECG**

In [None]:


# Resting-ECG category counts: target == 0 patients (left) vs target == 1 (right).
fig, (ax_normal, ax_disease) = plt.subplots(1, 2, figsize=(20, 10))

# Use the same category order in both panels so the bars are directly
# comparable even when one group lacks a category.
rest_ecg_order = sorted(patient_data['rest_ecg'].dropna().unique())

# The column is passed via x=/data=: countplot's first positional argument is
# `data` in current seaborn, so a bare Series is not interpreted as `x`.
sns.countplot(data=attr_0, x='rest_ecg', order=rest_ecg_order, ax=ax_normal)
ax_normal.set_title('ECG OF NORMAL PATIENTS', fontsize=20, weight='bold')

sns.countplot(data=attr_1, x='rest_ecg', order=rest_ecg_order, palette='viridis', ax=ax_disease)
ax_disease.set_title('ECG OF HEART PATIENTS', fontsize=20, weight='bold')
plt.show()

In [None]:
# Keep only the four measurement columns used for the outlier check below.
measurement_columns = ['age', 'blood_pressure_value', 'cholesterol', 'max_heart_rate']
patient_data_numeric = patient_data[measurement_columns]

In [None]:

# Preview the first five rows of the numeric subset.
patient_data_numeric.head(n=5)

**Calculating z-scores to detect outliers**

In [None]:
# Absolute z-score of every value in the numeric subset; entries further than
# `threshold` standard deviations from their column mean are flagged.
threshold = 3
a = np.abs(stats.zscore(patient_data_numeric))
print(a)
# np.where returns the (row positions, column positions) of the flagged cells.
# The named threshold is used here instead of repeating the literal 3.
print(np.where(a > threshold))

**Encoding**

In [None]:
# One-hot encode the multi-class categorical features.
# Without `columns`, get_dummies only converts object/category/string columns.
# The categorical codes here appear to be stored as integers (st_slope is
# compared with 0 earlier), so they have to be listed explicitly or the call
# leaves the frame unchanged.
# NOTE(review): confirm this is the intended set of nominal features.
categorical_columns = ['pain_type', 'rest_ecg', 'st_slope', 'thalassemia_value']

# Only encode columns that are still present, so re-running this cell after
# the frame has already been encoded is a harmless no-op instead of a KeyError.
columns_to_encode = [name for name in categorical_columns if name in patient_data.columns]
patient_data = pd.get_dummies(
    patient_data,
    columns=columns_to_encode,
    drop_first=True,  # drop one level per feature to avoid redundant dummies
    dtype=int,        # 0/1 integers keep the features numeric for later cells
)

patient_data.head()

In [None]:
# Separate the feature matrix from the label column.
X = patient_data.drop(['target'], axis=1)
y = patient_data['target']

# Correlation of each feature with the target.
# The title previously said "Diabetes", but this notebook's target is heart disease.
X.corrwith(y).plot.bar(
        figsize = (16, 4), title = "Correlation with Heart Disease", fontsize = 15,
        rot = 90, grid = True)

**Train Test Split**

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=5,
)


In [None]:
# Show the class balance in each partition to confirm the stratified split.
# The printed headings previously misspelled "target" as "traget".
print('Distribution of target variable in training set:')
print(y_train.value_counts())

print('Distribution of target variable in test set:')
print(y_test.value_counts())

**Model Developing**

In [None]:
# Multi-layer perceptron baseline.
# MLPClassifier initialises its weights randomly, so the seed is fixed (same
# value as the train/test split) to make the predictions reproducible under
# Restart & Run All.
# NOTE(review): the features are not standardised; MLPs are sensitive to
# feature scale, so consider adding a scaler in front of this model.
mlp = MLPClassifier(random_state=5)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

In [None]:
# Random forest with the entropy split criterion and 100 trees.
# Bootstrap sampling and feature sub-sampling are random, so the seed is fixed
# (same value as the train/test split) to make the predictions reproducible.
rf_ent = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=5)
rf_ent.fit(X_train, y_train)
y_pred_rfe = rf_ent.predict(X_test)