In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Loading data

In [None]:
# Read the heart-disease CSV from the Kaggle input folder and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../input/heart-disease-data/heart_disease_uci.csv')
df.head()

# Short Summary about the data and data types

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

%matplotlib inline

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
df.isna().sum().div(len(df)).mul(100)

In [None]:
# Descriptive statistics of the raw frame (DataFrame.describe with default settings).
df.describe()

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

# Data Visualization and Handling Missing Values

In [None]:
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set_style(style='whitegrid')

In [None]:
# Distribution of age, coloured by sex.
sns.displot(data=df, x='age', hue='sex')

In [None]:
# Distribution of sex, coloured by the target column 'num'.
sns.displot(data=df, x='sex', hue='num')

In [None]:
# Box plot of 'trestbps' for each value of 'ca'.
sns.boxplot(x='ca', y='trestbps', data=df)

In [None]:
# Contingency table: 'cp' categories (rows) against 'ca' values (columns).
pd.crosstab(index=df['cp'], columns=df['ca'])

In [None]:
# Kernel density estimate of 'trestbps', one curve per 'ca' value.
sns.kdeplot(data=df, x='trestbps', hue='ca')

In [None]:
# Kernel density estimate of 'oldpeak', one curve per 'ca' value.
sns.kdeplot(data=df, x='oldpeak', hue='ca')

In [None]:
# 'chol' against 'trestbps'; colour encodes 'ca' and marker size encodes 'oldpeak'.
sns.scatterplot(data=df, x='chol', y='trestbps', hue='ca', size='oldpeak')

In [None]:
# Work on an independent copy so the raw frame `df` stays untouched.
df_copy = df.copy(deep=True)

In [None]:
# No feature was found that could help fill the missing 'ca' values, so the column is dropped.
df_copy = df_copy.drop(columns=['ca'])

In [None]:
# 'age' against 'chol' on the raw frame.
sns.scatterplot(data=df, x='age', y='chol')

In [None]:
# Mean of each column computed over the rows whose value is not 0.
for column in ('trestbps', 'chol'):
    print(df.loc[df[column] != 0, column].mean())

In [None]:
# Replace 0 readings of resting blood pressure and cholesterol with the mean of
# the non-zero readings. The mean is computed here rather than hard-coded as a
# rounded number, so the fill value always matches the data actually loaded.
for column in ('trestbps', 'chol'):
    nonzero_mean = df_copy.loc[df_copy[column] != 0, column].mean()
    df_copy[column] = df_copy[column].replace(0, nonzero_mean)

In [None]:
# 'age' against 'chol' after zero replacement, coloured by 'slope'.
sns.scatterplot(data=df_copy, x='age', y='chol', hue='slope')

In [None]:
# 'age' against 'chol', coloured by 'slope'.
# NOTE(review): this cell draws the same plot as the cell before it.
sns.scatterplot(data=df_copy, x='age', y='chol', hue='slope')

In [None]:
# 'age' against 'chol', coloured by 'thal'.
sns.scatterplot(data=df_copy, x='age', y='chol', hue='thal')

In [None]:
# No feature was found that could help fill the missing 'slope' and 'thal' values,
# so both columns are dropped.
df_copy = df_copy.drop(columns=['thal', 'slope'])

In [None]:
# The author notes that every remaining column has under 10% missing values,
# so the rows that still contain a null are simply discarded.
df_copy = df_copy.dropna()

# Data Preprocessing

In [None]:
# Encode the categorical columns 'cp', 'restecg' and 'exang' as integers.
# Each result is assigned back to its column. Calling
# `df_copy[col].replace(..., inplace=True)` is a chained in-place update on a
# column selection, which pandas deprecates and which no longer modifies
# `df_copy` under Copy-on-Write.
cp_codes = {'typical angina': 0, 'atypical angina': 1, 'non-anginal': 2, 'asymptomatic': 3}
restecg_codes = {'normal': 0, 'st-t abnormality': 1, 'lv hypertrophy': 2}
exang_codes = {True: 1, False: 0}

for column, codes in (('cp', cp_codes), ('restecg', restecg_codes), ('exang', exang_codes)):
    # Values without a code are passed through unchanged, so re-running the
    # cell on already-encoded data is a no-op.
    df_copy[column] = df_copy[column].map(lambda value, codes=codes: codes.get(value, value))

In [None]:
# Encode 'fbs' (True/False) and 'sex' ('Male'/'Female') as 1/0.
# Each result is assigned back to its column. Calling
# `df_copy[col].replace(..., inplace=True)` is a chained in-place update on a
# column selection, which pandas deprecates and which no longer modifies
# `df_copy` under Copy-on-Write.
fbs_codes = {True: 1, False: 0}
sex_codes = {'Male': 1, 'Female': 0}

for column, codes in (('fbs', fbs_codes), ('sex', sex_codes)):
    # Values without a code are passed through unchanged, so re-running the
    # cell on already-encoded data is a no-op.
    df_copy[column] = df_copy[column].map(lambda value, codes=codes: codes.get(value, value))

df_copy

In [None]:
# Remove the 'id' and 'dataset' columns before building the feature matrix.
df_copy = df_copy.drop(columns=['id', 'dataset'])

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target 'num'. Target vector: 'num'.
X = df_copy.drop(columns='num')
y = df_copy['num']


In [None]:
# Standardise the continuous columns to z-scores: Z = (X - mean) / std.
numeric_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
numeric_part = X[numeric_cols]
X[numeric_cols] = (numeric_part - numeric_part.mean()) / numeric_part.std()
X

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Guard against test-set leakage: the continuous columns were standardised
# earlier with the mean/std of the full dataset, so test rows influenced the
# scaling. Re-standardise both partitions with statistics from the training
# rows only. Because standardisation is an affine transform, this gives the
# same result as scaling the raw values with training-only statistics.
scaled_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
train_mean = X_train[scaled_cols].mean()
train_std = X_train[scaled_cols].std()

# Explicit copies so the column assignments below never write through to `X`.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scaled_cols] = (X_train[scaled_cols] - train_mean) / train_std
X_test[scaled_cols] = (X_test[scaled_cols] - train_mean) / train_std

# Model Selection

Since the target takes more than two distinct values (a multi-class problem), we select the K-Nearest Neighbours classifier and the Multi-layer Perceptron classifier.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Sweep K = 1..9: fit a KNN classifier for each K and record the fraction of
# test rows it misclassifies.
errors = []
K = np.arange(1, 10)
for k in K:
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred = model.predict(X_test)
    error_rate = np.mean(pred != y_test)
    errors.append(error_rate)

# Error rate as a function of K.
plt.plot(K, errors)
plt.xlabel('Number of Neighbours, K')
plt.ylabel('error_rate')



In [None]:
# Refit KNN with the K that produced the lowest error in the sweep. K is read
# from `errors` rather than hard-coded, so it stays in step with the sweep's
# actual results (the original notebook run reported K=5).
# NOTE(review): the sweep scores each K on the test set, so the error printed
# here is an optimistic estimate. Confirm whether a validation split is wanted.
best_k = int(K[np.argmin(errors)])

model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(X_train, y_train)
pred = model.predict(X_test)
error_rate = np.mean(pred != y_test)
print(error_rate)

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Multi-layer perceptron with three hidden layers (10, 15 and 8 units).
# `random_state` fixes the weight initialisation so the reported error is
# reproducible between runs. `max_iter` is raised above the library default
# because the notebook reports that the default budget does not converge.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(10, 15, 8),
    max_iter=1000,
    random_state=101,
)
mlp_classifier.fit(X_train, y_train)

In [None]:
# Fraction of test rows the MLP misclassifies.
predict2 = mlp_classifier.predict(X_test)
error_rate2 = (predict2 != y_test).mean()
print(error_rate2)

### We can see that although the MLPClassifier doesn't converge, it still performs better than KNearestNeighbours