In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from statistics import mean

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read the heart-attack CSV into `df` and preview the first rows.
heart_csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(heart_csv_path)
df.head()

Shape of dataframe

In [None]:
# (number of rows, number of columns) of the loaded dataframe.
df.shape

Let's check whether the dataframe has any null values

In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

so no null values

Now let's check what statistical data says

In [None]:
# Summary-statistics table produced by DataFrame.describe() for df.
df.describe()

Now let's check how much multicollinearity exists among columns

In [None]:
# Pairwise correlation matrix between the columns of df.
df.corr()

Raw numbers are hard to draw insights from, so we'll visualise the correlation matrix as a heatmap

In [None]:
# Draw the annotated correlation heatmap on an explicitly created 20x10 figure.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)

The lighter the color the more positive correlation, and the darker the color the more negative correlation is visible among columns

But here we want to predict the 'output' column in df, so we will check just the correlation of 'output' with the other columns

In [None]:
# Correlation of every column with 'output', largest value first.
output_corr = df.corr()['output']
output_corr.sort_values(ascending=False)

As we can see, no column is strongly correlated with 'output', so we will plot a pairplot to get a better idea

In [None]:
# Pairwise scatter/distribution grid of all columns, coloured by the 'output' class.
sns.pairplot(data=df, hue='output')

As the correlations already suggested, no strong relation is visible, and there is not much more to analyse, so we can move on to the prediction part.

For that we'll have a look at dataframe again

In [None]:
# Show the first 10 rows to eyeball the value ranges before scaling.
df.head(10)

As we can see data is not scaled properly, so we will choose columns to scale

In [None]:
# For each column, report how many distinct values it has and its max/min,
# to decide which columns need scaling.
for col in df:
    series = df[col]
    print(f"Column {col} is having unique values {series.nunique()} with max and min as {series.max()!s} and {series.min()!s}")

So from above it seems the columns which need scaling are age, trtbps, chol, thalach, oldpeak

In [None]:
# Positional indices of the columns picked for scaling.
scale_positions = [0, 3, 4, 7, 9]
columns_to_scale = df.iloc[:, scale_positions]
columns_to_scale

In [None]:
# Standardise the selected columns and wrap the result back into a DataFrame
# that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.
ss = StandardScaler()
scaled_values = pd.DataFrame(
    ss.fit_transform(columns_to_scale),
    columns=columns_to_scale.columns,
)
scaled_values

Now we will concatenate scaled_values with remaining part of df

In [None]:
# Put the scaled columns side by side with the columns that were left unscaled.
unscaled_part = df.iloc[:, [1, 2, 5, 6, 8, 10, 11, 12, 13]]
scaled_df = pd.concat([scaled_values, unscaled_part], axis=1)
scaled_df

Now as we have scaled our values we will divide our rows in train and test dataframe for prediction.

Y will be 'output' and rest are X variables

In [None]:
# Build the modelling inputs from the *scaled* frame so that the StandardScaler
# step actually feeds the models (slicing the raw `df` here discarded it).
# scaled_df keeps df's last column as its last column, so the positional
# target/feature split is unchanged.
Y = scaled_df.iloc[:, [-1]]  # target, kept as a one-column DataFrame
X = scaled_df.iloc[:, :-1]   # every remaining column is a feature

In [None]:
# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.2,random_state=42)

For classification prediction we will 1st use **Logistic Regression**

In [None]:
# Fit a logistic regression on the training split and predict the test split.
# The one-column target frame is flattened to a 1-D array before fitting.
lr = LogisticRegression(max_iter=1000, solver='lbfgs')
model_lr = lr.fit(xtrain, ytrain.to_numpy().ravel())
predict_values_lr = model_lr.predict(xtest)
predict_values_lr

Now let's see what the accuracy and recall are

In [None]:
# Test-set accuracy, then recall, for the logistic regression predictions.
for metric in (accuracy_score, recall_score):
    print(metric(ytest, predict_values_lr))

We have got accuracy of 0.88, we will check what other models have to provide

So next is **Decision Tree Classifier**

In [None]:
# Fit an entropy-based decision tree on the training split and predict the test split.
# The target is flattened to 1-D, matching how every other model cell passes it.
dtc = DecisionTreeClassifier(random_state=42,criterion="entropy")
model_dtc = dtc.fit(xtrain,ytrain.values.ravel())
predict_values_dtc = model_dtc.predict(xtest)
predict_values_dtc

In [None]:
# Test-set accuracy, then recall, for the decision tree predictions.
for metric in (accuracy_score, recall_score):
    print(metric(ytest, predict_values_dtc))

The accuracy has decreased, so we will try changing the **min_samples_leaf** parameter of **DecisionTreeClassifier**

For that we will use **Grid Search method**

In [None]:
# 3-fold grid search over min_samples_leaf (2..239) for the entropy tree,
# scored by accuracy on the training split only.
# The target is flattened to 1-D, matching how the other grid searches pass it.
values_sample_leaf_dtc = {"min_samples_leaf":range(2,240,1)}
cv = GridSearchCV(dtc,values_sample_leaf_dtc,scoring="accuracy",cv=3)
model_cv = cv.fit(xtrain,ytrain.values.ravel())
model_cv.best_params_

In [None]:
# Refit the entropy tree with the min_samples_leaf picked by the grid search,
# read from best_params_ rather than hand-copied, so this cell stays correct
# if the search result changes.
# Assumes model_cv still holds the decision-tree search (run cells in order).
best_leaf_dtc = model_cv.best_params_["min_samples_leaf"]
dtc_2 = DecisionTreeClassifier(random_state=42,criterion="entropy",min_samples_leaf=best_leaf_dtc)
model_dtc_2 = dtc_2.fit(xtrain,ytrain.values.ravel())
predict_values_dtc_2 = model_dtc_2.predict(xtest)
predict_values_dtc_2

In [None]:
# Test-set accuracy, then recall, for the tuned decision tree predictions.
for metric in (accuracy_score, recall_score):
    print(metric(ytest, predict_values_dtc_2))

Not much improvement from LR model but still recall is increased from previous DTC model

Now let's try with **Random Forest Classifier**

In [None]:
# Fit a 30-tree random forest on the training split and predict the test split.
rfc = RandomForestClassifier(n_estimators=30, random_state=42)
model_rfc = rfc.fit(xtrain, ytrain.to_numpy().ravel())
predict_values_rfc = model_rfc.predict(xtest)
predict_values_rfc

In [None]:
# Test-set accuracy, then recall, for the random forest predictions.
for metric in (accuracy_score, recall_score):
    print(metric(ytest, predict_values_rfc))

Not much improvement, let's try with grid search

In [None]:
# 3-fold grid search over min_samples_leaf (2..239) for the random forest,
# scored by accuracy on the training split only.
values_sample_leaf_rfc = dict(min_samples_leaf=range(2, 240))
cv = GridSearchCV(estimator=rfc, param_grid=values_sample_leaf_rfc, scoring="accuracy", cv=3)
model_cv = cv.fit(xtrain, ytrain.to_numpy().ravel())
model_cv.best_params_

In [None]:
# Refit the 30-tree forest with the min_samples_leaf picked by the grid search,
# read from best_params_ rather than hand-copied, so this cell stays correct
# if the search result changes.
# Assumes model_cv still holds the random-forest search (run cells in order).
best_leaf_rfc = model_cv.best_params_["min_samples_leaf"]
rfc_2 = RandomForestClassifier(random_state=42,n_estimators=30, min_samples_leaf=best_leaf_rfc)
model_rfc_2 = rfc_2.fit(xtrain,ytrain.values.ravel())
predict_values_rfc_2 = model_rfc_2.predict(xtest)
predict_values_rfc_2

In [None]:
# Test-set accuracy, then recall, for the tuned random forest predictions.
for metric in (accuracy_score, recall_score):
    print(metric(ytest, predict_values_rfc_2))

Exactly same result as LR model

We will try K Neighbors model

In [None]:
# Fit a 4-nearest-neighbours classifier on the training split and predict the test split.
knn = KNeighborsClassifier(n_neighbors=4)
model_knn = knn.fit(xtrain, ytrain.to_numpy().ravel())
predict_values_knn = model_knn.predict(xtest)
predict_values_knn

In [None]:
# Test-set accuracy, then recall, for the KNN predictions.
for metric in (accuracy_score, recall_score):
    print(metric(ytest, predict_values_knn))

It seems this is not working well either

Trying KNN with different leaf_size in Grid Search CV

In [None]:
# 3-fold grid search over leaf_size (1..99) for a default KNN classifier,
# scored by accuracy on the training split only.
# NOTE(review): scikit-learn documents leaf_size as a BallTree/KDTree setting that
# affects construction/query speed and memory; the dict name suggests n_neighbors
# may have been the intended parameter - confirm.
knn_2 = KNeighborsClassifier()
values_n_neigh_knn = dict(leaf_size=range(1, 100))
cv = GridSearchCV(estimator=knn_2, param_grid=values_n_neigh_knn, scoring="accuracy", cv=3)
model_cv = cv.fit(xtrain, ytrain.to_numpy().ravel())
model_cv.best_params_

Trying different n_neighbors values and plotting n_neighbors against accuracy

In [None]:
# Sweep n_neighbors from 1 to 19, recording the test-set accuracy for each value.
# `nei` holds the neighbour counts and `acc` the matching accuracies.
nei = list(range(1, 20))
acc = []
for k in nei:
    knn = KNeighborsClassifier(n_neighbors=k)
    model_knn = knn.fit(xtrain, ytrain.to_numpy().ravel())
    predict_values_knn = model_knn.predict(xtest)
    acc.append(accuracy_score(ytest, predict_values_knn))

In [None]:
# Plot test accuracy against the number of neighbours, with labelled axes and a
# title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(nei, acc)
ax.set(xlabel="n_neighbors", ylabel="Test accuracy", title="KNN: test accuracy vs n_neighbors");

In [None]:
# Fit an 11-neighbour, distance-weighted KNN on the training split and predict the test split.
knn = KNeighborsClassifier(weights='distance', n_neighbors=11)
model_knn = knn.fit(xtrain, ytrain.to_numpy().ravel())
predict_values_knn = model_knn.predict(xtest)
predict_values_knn

In [None]:
# Test-set accuracy, then recall, for the distance-weighted KNN predictions.
for metric in (accuracy_score, recall_score):
    print(metric(ytest, predict_values_knn))

Seems Logistic Regression model is best suiting, let's see how it works for different **Random State**

In [None]:
# Re-split the data with random_state 1..99 and record the logistic regression
# test accuracy for each split. `rs` holds the seeds, `acc` the matching accuracies.
lr = LogisticRegression(solver='lbfgs', max_iter=10000)
rs = []
acc = []
for seed in range(1, 100):
    # Use loop-local split names so the notebook-level xtrain/xtest/ytrain/ytest
    # are not overwritten; earlier model cells then still see their own split
    # if they are re-run after this cell.
    x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size=0.2, random_state=seed)
    model_lr_rs = lr.fit(x_tr, y_tr.values.ravel())
    predict_values_lr_rs = model_lr_rs.predict(x_te)
    acc.append(accuracy_score(y_te, predict_values_lr_rs))
    rs.append(seed)

In [None]:
# Plot test accuracy against the split's random_state, with labelled axes and a
# title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(rs, acc)
ax.set(xlabel="random_state", ylabel="Test accuracy", title="Logistic regression: test accuracy vs random_state");

In [None]:
# Print each random_state next to the accuracy obtained with it.
for seed, score in zip(rs, acc):
    print(seed, score)

So the best accuracy for predicting heart attack chances is **93.4%.** But since accuracy varies with the random state, we should rely more on the average of all accuracies.

In [None]:
# Report the mean and the best accuracy over all random states. The best value
# is computed from `acc` instead of being typed in, so it cannot go stale.
average_pct = round(mean(acc)*100,2)
best_pct = round(max(acc)*100,2)
print("The average accuracy is "+str(average_pct)+"% with best as "+str(best_pct)+"%")

**That was a short notebook on how different classification models work**