In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings("ignore")
import seaborn as sns
import scipy

In [None]:
# Load the heart-attack dataset from the Kaggle input directory.
HEART_CSV_PATH = "../input/heart-attack-analysis-prediction-dataset/heart.csv"
dataset = pd.read_csv(HEART_CSV_PATH)

In [None]:
# Preview the first five rows of the raw dataset.
dataset.head()

In the given dataset, the categorical variables [sex, cp, fbs, restecg, exng, slp, caa, thall, output] are already encoded.

In the cell below, I select only the categorical variables and check the value counts of each one.

This shows clearly how the labels are distributed within each variable.

In [None]:
# Keep only the categorical columns, then print how often each label occurs
# in every one of them.
data = dataset.drop(columns=['age','trtbps','chol','thalachh','oldpeak'])
separator = "------------" * 5
for column in data.columns:
    print(column)
    print(dataset[column].value_counts())
    print(separator)

From the output above we observe that the features [sex, fbs, exng] are imbalanced between their labels, so they may introduce a bias problem.

In [None]:
# Show the first rows of the full dataset again.
dataset.head()

In [None]:
# Keep only the continuous variables by dropping every categorical column
# (including the target).
categorical_columns = ['sex','cp','fbs','restecg','exng','slp','caa','thall','output']
data1 = dataset.drop(columns=categorical_columns)

In [None]:
# Summary statistics of the continuous variables.
data1.describe()

In [None]:
# Column names, non-null counts and dtypes of the full dataset.
dataset.info()

# Correlation

Checking correlation between the variables.

In [None]:

# Pearson correlation between every pair of columns, drawn as an annotated
# heatmap on a fixed [-1, 1] colour scale.
corrPearson = dataset.corr(method="pearson")
figure, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrPearson, annot=True, cmap='RdYlGn', vmin=-1, vmax=1, ax=ax)
ax.set_title("PEARSON")
ax.set_xlabel("COLUMNS")
ax.set_ylabel("COLUMNS")
plt.show()

* From the heatmap above we observe that there is no strong correlation between the independent variables.

# Missing values

In [None]:
# Count the missing values in every column.
dataset.isnull().sum()

 * From above we observe that there are no missing values in the data.

# outliers

In [None]:
# Box plot of every continuous variable, used to spot outliers visually.
figure, ax = plt.subplots(figsize=(13, 8))
ax.boxplot(data1)
# boxplot places the boxes at positions 1..N and labels them with those
# numbers only; put the column names on the ticks so each box is identifiable.
ax.set_xticks(range(1, data1.shape[1] + 1))
ax.set_xticklabels(data1.columns)
ax.set_title("Continuous variables")
ax.set_ylabel("value")
# plt.show() renders the figure without echoing the dict of artists that
# boxplot returns.
plt.show()

* The boxplot above clearly shows that the data contains outliers.
* To treat them, the code below replaces each outlier with the median of its column.


In [None]:
# Replace outliers in the continuous columns with the column median.
#
# A value is treated as an outlier when it lies outside the Tukey fences
# [Q1 - k*IQR, Q3 + k*IQR]. k = 1.5 is the conventional multiplier and the
# same rule the boxplot whiskers use by default; the previous value of 0.5
# also replaced ordinary values that the boxplot does not mark as outliers.
#
# NOTE(review): the model features are built later from `dataset`, not from
# `data1`, so this treatment never reaches the classifiers -- confirm whether
# that is intended.
IQR_MULTIPLIER = 1.5

for i in data1:
    # 'age' is deliberately left untouched, as in the original column filter.
    if i not in ('trtbps', 'chol', 'thalachh', 'oldpeak'):
        continue
    print(i)
    q1 = data1[i].quantile(0.25)
    q3 = data1[i].quantile(0.75)
    iqr = q3 - q1
    low = q1 - IQR_MULTIPLIER * iqr
    high = q3 + IQR_MULTIPLIER * iqr
    # Take the median once, before anything is replaced, so the low and the
    # high outliers are both filled with the same value.
    median = data1[i].median()
    is_outlier = (data1[i] < low) | (data1[i] > high)
    # Assign the whole column back instead of the chained
    # `data1[i][mask] = ...`, which writes into an intermediate Series and is
    # not guaranteed to update `data1` (it never does under Copy-on-Write).
    data1[i] = data1[i].mask(is_outlier, median)
        

In [None]:
# Box plot of the continuous variables after the outlier replacement.
fig, ax = plt.subplots()
ax.boxplot(data1)
# Label each box with its column name instead of the default 1..N numbering.
ax.set_xticks(range(1, data1.shape[1] + 1))
ax.set_xticklabels(data1.columns)
ax.set_title("Continuous variables")
ax.set_ylabel("value")
# plt.show() renders the figure without echoing the dict of artists that
# boxplot returns.
plt.show()

* The boxplot above shows the data after the outliers have been replaced with the median.

# Visualisations

In [None]:
# Histogram of every column in the dataset.
# DataFrame.hist creates its own figure, so no separate plt.figure() call is
# made here: it would only render an additional, empty figure.
# The trailing semicolon hides the array-of-Axes repr.
dataset.hist(figsize=(18, 10));

In [None]:
# Distribution of the target variable.
# The column is passed as `x=` by keyword: recent seaborn releases accept only
# `data` positionally, so a positional 'output' collides with `data=` and
# raises a TypeError.
sns.countplot(x='output', data=dataset);

* From the visualisation above, we conclude that the two output classes are reasonably balanced.

In [None]:
# Number of records per sex, split by the target value.
sns.countplot(x='sex',hue='output',data=dataset)

* From the visualisation above, we observe that more males than females had a heart attack.

In [None]:
# Joint distribution of age and chol, coloured by the target value.
sns.jointplot(data = dataset, x = 'age', y = 'chol', hue = 'output', palette='dark', height = 10, s = 100, alpha = 0.5)

* The visualisation above shows the relationship between age and chol, coloured by output.
* Most of the people who had a heart attack are aged 40-60 and have a chol value between 200 and 270.

In [None]:
# Number of records per age, split by the target value, on a wide canvas so
# the age ticks stay readable.
figure, ax = plt.subplots(figsize=(16, 8))
sns.countplot(x='age', hue='output', data=dataset, ax=ax)

* Most of the people who had a heart attack are between 41 and 54 years old.

In [None]:
# Number of records per thalachh value, split by the target value.
figure, ax = plt.subplots(figsize=(22, 8))
sns.countplot(x='thalachh', hue='output', data=dataset, ax=ax)

In [None]:
# Mean chol for every (sex, output) combination.
dataset.groupby(['sex','output'])['chol'].mean()

* The code above groups the data by sex and output and computes the mean chol of each group.
* For sex 0 (female) and output 0 (no heart attack), the average chol is 241.29.
* For sex 0 (female) and output 1 (heart attack), the average chol is 239.54.
* For sex 1 (male) and output 0 (no heart attack), the average chol is 239.73.
* For sex 1 (male) and output 1 (heart attack), the average chol is 236.13.


* The cells below do the same for other variables.

In [None]:
# Mean thalachh for every (sex, output) combination.
dataset.groupby(['sex','output'])['thalachh'].mean()

In [None]:
# Mean age for every (sex, output) combination.
dataset.groupby(['sex','output'])['age'].mean()

# Skewness

Calculating skewness of the variables

In [None]:
# Print the skewness of every column (bias=True: no sample-bias correction).
for column_name, column_values in dataset.items():
    print(column_name, ' : ', scipy.stats.skew(column_values, axis=0, bias=True))
    

* We see that the dataset contains both positively and negatively skewed variables.

In [None]:
# Separate the dependent variable (target) from the independent variables.
# 'sex', 'fbs' and 'exng' are dropped from the features together with the
# target column itself.
excluded_columns = ['output','sex','fbs','exng']
x = dataset.drop(columns=excluded_columns)
y = dataset['output']

In [None]:
# First rows of the independent variables.
x.head()

# Transformations

* Below I apply some transformations to the features.

In [None]:
# Column-wise transformations of the features:
#   age    -> natural logarithm
#   trtbps -> reciprocal
#   chol   -> reciprocal
# thalachh is kept as it is (the former identity `apply` on it was a no-op).
#
# The maths is vectorised rather than run through `.apply(lambda x: ...)`,
# whose parameter shadowed the feature frame `x`. Every column is recomputed
# from the unmodified `dataset`, so re-running this cell yields the same
# values instead of transforming an already transformed column again.
x['age'] = np.log(dataset['age'])
x['trtbps'] = 1 / dataset['trtbps']
x['chol'] = 1 / dataset['chol']

In [None]:
# First rows of the features after the transformations.
x.head()

# scaling

* Here I apply MinMaxScaler to bring all variables into the range 0 to 1.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature to the [0, 1] range.
scale = MinMaxScaler()
# fit_transform returns a bare NumPy array; rebuild the frame with the original
# column names and index so the features stay identifiable downstream instead
# of being renumbered 0..N-1.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling parameters. To avoid this leakage,
# fit on the training split only once the split is moved above this cell.
x = pd.DataFrame(scale.fit_transform(x), columns=x.columns, index=x.index)

In [None]:
# First rows of the scaled features.
x.head()

# splitting

* Here the data is randomly split into a training set and a testing set: 80% of the rows are used for training and 20% for testing.

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict,GridSearchCV,StratifiedKFold
# 80/20 split, stratified on the target, with a fixed seed so the split is
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=15,
    stratify=y,
)

# modeling

In [None]:
# importing the all classification models from libraries
from sklearn.tree import DecisionTreeClassifier   #importing model
from sklearn.neighbors import KNeighborsClassifier #import method
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix,precision_score,classification_report, plot_confusion_matrix


# Below, I build all the classification models and select the best one.

# XGBoost

In [None]:
# XGBoost: fit on the training split, then report test and training metrics.
classifier = XGBClassifier()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)    # predictions on the held-out test split
y_pre = classifier.predict(x_train)    # predictions on the training split (over-fitting check)
print("validation confusion matrix")
print(confusion_matrix(y_test, y_pred))
print("---------------"*8)
print("cross validation score")
# Cross-validate on the training split: the test split (20% of the rows) must
# stay unseen until the final evaluation and is too small for 10 reliable folds.
print(cross_val_score(classifier, x_train, y_train, cv=10, verbose=False).mean())
print("---------------"*8)
print("Validation Classification report ")
print(classification_report(y_test, y_pred))
print("---------------"*8)
print("Training Classification report ")
print(classification_report(y_train, y_pre))


In [None]:
# Confusion matrix of the fitted classifier on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (replacement: ConfusionMatrixDisplay.from_estimator) -- confirm the version in use.
fig, ax = plt.subplots(dpi=100, figsize=(10, 6))
plot_confusion_matrix(classifier, x_test, y_test, ax=ax);

# Decision Tree

In [None]:

# Decision tree (entropy criterion): fit on the training split, then report
# test and training metrics.
# random_state is fixed (same seed as the train/test split) so the tree, and
# therefore every score below, is reproducible between runs.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=15)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)    # predictions on the held-out test split
y_pre = classifier.predict(x_train)    # predictions on the training split (over-fitting check)
print("validation confusion matrix")
print(confusion_matrix(y_test, y_pred))
print("---------------"*8)
print("cross validation score")
# Cross-validate on the training split: the test split (20% of the rows) must
# stay unseen until the final evaluation and is too small for 10 reliable folds.
print(cross_val_score(classifier, x_train, y_train, cv=10, verbose=False).mean())
print("---------------"*8)
print("Validation Classification report ")
print(classification_report(y_test, y_pred))
print("---------------"*8)
print("Training Classification report ")
print(classification_report(y_train, y_pre))



In [None]:
# Confusion matrix of the fitted classifier on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (replacement: ConfusionMatrixDisplay.from_estimator) -- confirm the version in use.
fig, ax = plt.subplots(dpi=100, figsize=(10, 6))
plot_confusion_matrix(classifier, x_test, y_test, ax=ax);

# KNN

In [None]:
# k-nearest neighbours (k = 50): fit on the training split, then report test
# and training metrics.
classifier = KNeighborsClassifier(n_neighbors=50)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)    # predictions on the held-out test split
y_pre = classifier.predict(x_train)    # predictions on the training split (over-fitting check)
print("validation confusion matrix")
print(confusion_matrix(y_test, y_pred))
print("---------------"*8)
print("cross validation score")
# Cross-validate on the training split: the test split (20% of the rows) must
# stay unseen until the final evaluation and is too small for 10 reliable folds.
print(cross_val_score(classifier, x_train, y_train, cv=10, verbose=False).mean())
print("---------------"*8)
print("Validation Classification report ")
print(classification_report(y_test, y_pred))
print("---------------"*8)
print("Training Classification report ")
print(classification_report(y_train, y_pre))


In [None]:
# Confusion matrix of the fitted classifier on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (replacement: ConfusionMatrixDisplay.from_estimator) -- confirm the version in use.
fig, ax = plt.subplots(dpi=100, figsize=(10, 6))
plot_confusion_matrix(classifier, x_test, y_test, ax=ax);

# Logistic regression

In [None]:
# Logistic regression: fit on the training split, then report test and
# training metrics.
classifier = LogisticRegression(random_state=10)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)    # predictions on the held-out test split
y_pre = classifier.predict(x_train)    # predictions on the training split (over-fitting check)
print("validation confusion matrix")
print(confusion_matrix(y_test, y_pred))
print("---------------"*8)
print("cross validation score")
# Cross-validate on the training split: the test split (20% of the rows) must
# stay unseen until the final evaluation and is too small for 10 reliable folds.
print(cross_val_score(classifier, x_train, y_train, cv=10, verbose=False).mean())
print("---------------"*8)
print("Validation Classification report ")
print(classification_report(y_test, y_pred))
print("---------------"*8)
print("Training Classification report ")
print(classification_report(y_train, y_pre))


In [None]:
# Confusion matrix of the fitted classifier on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (replacement: ConfusionMatrixDisplay.from_estimator) -- confirm the version in use.
fig, ax = plt.subplots(dpi=100, figsize=(10, 6))
plot_confusion_matrix(classifier, x_test, y_test, ax=ax);

# Naive Bayes

In [None]:
# Gaussian naive Bayes: fit on the training split, then report test and
# training metrics.
classifier = GaussianNB()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)    # predictions on the held-out test split
y_pre = classifier.predict(x_train)    # predictions on the training split (over-fitting check)
print("validation confusion matrix")
print(confusion_matrix(y_test, y_pred))
print("---------------"*8)
print("cross validation score")
# Cross-validate on the training split: the test split (20% of the rows) must
# stay unseen until the final evaluation and is too small for 10 reliable folds.
print(cross_val_score(classifier, x_train, y_train, cv=10, verbose=False).mean())
print("---------------"*8)
print("Validation Classification report ")
print(classification_report(y_test, y_pred))
print("---------------"*8)
print("Training Classification report ")
print(classification_report(y_train, y_pre))

In [None]:
# Confusion matrix of the fitted classifier on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (replacement: ConfusionMatrixDisplay.from_estimator) -- confirm the version in use.
fig, ax = plt.subplots(dpi=100, figsize=(10, 6))
plot_confusion_matrix(classifier, x_test, y_test, ax=ax);

# Random Forest

In [None]:
# Random forest (1500 entropy trees): fit on the training split, then report
# test and training metrics.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible; n_jobs=-1 builds the trees on all cores, which matters because
# the forest is refitted once per cross-validation fold below.
classifier = RandomForestClassifier(
    n_estimators=1500,
    criterion='entropy',
    random_state=15,
    n_jobs=-1,
)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)    # predictions on the held-out test split
y_pre = classifier.predict(x_train)    # predictions on the training split (over-fitting check)
print("validation confusion matrix")
print(confusion_matrix(y_test, y_pred))
print("---------------"*8)
print("cross validation score")
# Cross-validate on the training split: the test split (20% of the rows) must
# stay unseen until the final evaluation and is too small for 10 reliable folds.
print(cross_val_score(classifier, x_train, y_train, cv=10, verbose=False).mean())
print("---------------"*8)
print("Validation Classification report ")
print(classification_report(y_test, y_pred))
print("---------------"*8)
print("Training Classification report ")
print(classification_report(y_train, y_pre))



In [None]:
# Confusion matrix of the fitted classifier on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (replacement: ConfusionMatrixDisplay.from_estimator) -- confirm the version in use.
fig, ax = plt.subplots(dpi=100, figsize=(10, 6))
plot_confusion_matrix(classifier, x_test, y_test, ax=ax);

# SVM

In [None]:
# Support vector machine (polynomial kernel): fit on the training split, then
# report test and training metrics.
classifier = SVC(kernel='poly')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)    # predictions on the held-out test split
y_pre = classifier.predict(x_train)    # predictions on the training split (over-fitting check)
print("validation confusion matrix")
print(confusion_matrix(y_test, y_pred))
print("---------------"*8)
print("cross validation score")
# Cross-validate on the training split: the test split (20% of the rows) must
# stay unseen until the final evaluation and is too small for 10 reliable folds.
print(cross_val_score(classifier, x_train, y_train, cv=10, scoring='accuracy').mean())
print("---------------"*8)
print("Validation Classification report ")
print(classification_report(y_test, y_pred))
print("---------------"*8)
print("Training Classification report ")
print(classification_report(y_train, y_pre))


In [None]:
# Confusion matrix of the fitted classifier on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (replacement: ConfusionMatrixDisplay.from_estimator) -- confirm the version in use.
fig, ax = plt.subplots(dpi=100, figsize=(10, 6))
plot_confusion_matrix(classifier, x_test, y_test, ax=ax);

# Final model

The support vector machine (SVM) is chosen as the final model because it has a higher recall score than the other models.
Recall score = 94%