In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve,auc,roc_auc_score
from warnings import filterwarnings

In [None]:
# Load the heart-attack dataset into the working DataFrame used by every later cell.
heart = pd.read_csv(
    "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
)

In [None]:
# Silence every warning category for the rest of the session.
filterwarnings(action="ignore")

In [None]:
# Summary statistics of the numeric columns, shown as the cell output.
summary_stats = heart.describe()
summary_stats

In [None]:
# Number of missing values per column (isna is the canonical name of isnull).
heart.isna().sum()

# Count duplicate entries

In [None]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted).
heart.duplicated(keep="first").sum()

# Remove duplicate entries

In [None]:
# Keep the first occurrence of each row and drop the repeats, modifying `heart` in place.
heart.drop_duplicates(keep="first", inplace=True)

Let us look at the number of males and females in the dataset

In [None]:
# Pass the column through the `x` keyword: seaborn >= 0.12 accepts only `data`
# positionally, so countplot("sex", data=heart) raises a TypeError there.
sns.countplot(x="sex", data=heart)

Types of chest pain

In [None]:
# Column given via `x=`; a positional column name fails on seaborn >= 0.12,
# where the only positional parameter is `data`.
sns.countplot(x="cp", data=heart)

So there are basically four types of chest pain. 

In [None]:
# Column given via `x=`; a positional column name fails on seaborn >= 0.12,
# where the only positional parameter is `data`.
sns.countplot(x="fbs", data=heart)

We can see that very few persons suffer from high blood sugar.

In [None]:
# Column given via `x=`; a positional column name fails on seaborn >= 0.12,
# where the only positional parameter is `data`.
sns.countplot(x="restecg", data=heart)

# The number of resting electrocardiographic results showing probable or definite left ventricular hypertrophy by Estes' criteria is negligible, so we can remove these entries.

In [None]:
# Remove the rows whose "restecg" category occurs fewer than 5 times.
# The rows are dropped from `heart` directly instead of going through
# heart["restecg"].replace(..., inplace=True): that chained in-place call works on a
# temporary Series and leaves `heart` unchanged under pandas Copy-on-Write.
# A side effect of not inserting NaN is that the column keeps its original dtype.
value_count = heart["restecg"].value_counts()
to_remove = value_count[value_count < 5].index
rare_rows = heart.loc[heart["restecg"].isin(to_remove)].index
heart.drop(index=rare_rows, inplace=True)
# As in the original cell, also discard any row that still has a missing value.
heart.dropna(how="any", axis=0, inplace=True)


In [None]:
# Re-plot the category counts after the filtering step. The column is given via `x=`
# because seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x="restecg", data=heart)

In [None]:
# Column given via `x=`; a positional column name fails on seaborn >= 0.12,
# where the only positional parameter is `data`.
sns.countplot(x="exng", data=heart)

# angina due to exercise is seen in only 1/3rd of the people indicating that exercise is not the major cause of angina.

In [None]:
# Column given via `x=`; a positional column name fails on seaborn >= 0.12,
# where the only positional parameter is `data`.
sns.countplot(x="slp", data=heart)


# Number of Major Vessels

In [None]:
# Column given via `x=`; a positional column name fails on seaborn >= 0.12,
# where the only positional parameter is `data`.
sns.countplot(x="caa", data=heart)

# the persons having maximum number of major vessels equal to 4 are only 5 which is a very small number and can be removed


In [None]:
# Remove the rows whose "caa" category occurs fewer than 6 times.
# The rows are dropped from `heart` directly instead of going through
# heart["caa"].replace(..., inplace=True): that chained in-place call works on a
# temporary Series and leaves `heart` unchanged under pandas Copy-on-Write.
# A side effect of not inserting NaN is that the column keeps its original dtype.
value_count = heart["caa"].value_counts()
to_remove = value_count[value_count < 6].index
rare_rows = heart.loc[heart["caa"].isin(to_remove)].index
heart.drop(index=rare_rows, inplace=True)
# As in the original cell, also discard any row that still has a missing value.
heart.dropna(how="any", axis=0, inplace=True)

# Thall Rate

In [None]:
# Column given via `x=`; a positional column name fails on seaborn >= 0.12,
# where the only positional parameter is `data`.
sns.countplot(x="thall", data=heart)

# Only a handful of persons have thall rate 0, so these entries can be removed from the data

In [None]:
# Remove the rows whose "thall" category occurs fewer than 5 times.
# The rows are dropped from `heart` directly instead of going through
# heart["thall"].replace(..., inplace=True): that chained in-place call works on a
# temporary Series and leaves `heart` unchanged under pandas Copy-on-Write.
# A side effect of not inserting NaN is that the column keeps its original dtype.
value_count = heart["thall"].value_counts()
to_remove = value_count[value_count < 5].index
rare_rows = heart.loc[heart["thall"].isin(to_remove)].index
heart.drop(index=rare_rows, inplace=True)
# As in the original cell, also discard any row that still has a missing value.
heart.dropna(how="any", axis=0, inplace=True)

# I have removed all the categories that have extremely few entries, since they do not contribute much to the analysis

In [None]:
# One pie per "sex" value showing the split of "output" within that value
# (normalize="columns" makes every crosstab column sum to 1).
label = ["less chance", "more chance"]
df = pd.crosstab(columns=heart["sex"], index=heart["output"], normalize="columns")
df.columns = ["female", "male"]
df.index = ["", ""]  # blank wedge labels; the shared legend names the two outcomes
df.plot.pie(subplots=True, explode=(0.05, 0), autopct='%1.1f%%', shadow=True, startangle=140, legend=False, figsize=(11, 6))
plt.legend(label, bbox_to_anchor=(1, 0.5), loc="center right", fontsize=12, bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.8)
# Figure-level title: plt.title would attach the text to the last pie only.
plt.suptitle("Percentage of  Females and Males having less or more chance of heart attack", fontsize=16, fontname="Times New Roman", weight="bold")

# We can see that approximately 76% of the total females and 44.5% of total males have more chances of heart attack.

In [None]:
# One pie per "sex" value showing each (sex, output) group as a share of ALL rows
# (normalize="all" makes the whole crosstab, not each column, sum to 1).
label = ["less chance", "more chance"]
explode = (0.05, 0)
df = pd.crosstab(index=heart["output"], columns=heart["sex"], normalize="all")
df.columns = ["female", "male"]
df.index = ["", ""]  # blank wedge labels; the shared legend names the two outcomes
# normalize=False keeps the wedges as fractions of the whole population. Recent
# matplotlib otherwise rescales every pie to 100%, which would reproduce the
# per-sex percentages instead of the total-population ones. Each column sums to
# less than 1, so this draws a partial pie, which is the intended picture here.
df.plot.pie(subplots=True, explode=explode, autopct='%1.1f%%', shadow=True, startangle=140, legend=False, figsize=(11, 6), normalize=False)
plt.legend(label, bbox_to_anchor=(1, 0.5), loc="center right", fontsize=12, bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.8)
# Figure-level title: plt.title would attach the text to the last pie only.
plt.suptitle("Percentage of Females and Males having less or more chance of heart attack", fontsize=16, fontname="Times New Roman", weight="bold")

# If we look at the total population, approximately 24% of it are females and 30.5% are males with more chance of heart attack. The male share is larger because the number of males in the dataset is greater than the number of females.

In [None]:
# Swarm plot of age for each outcome, coloured by sex.
sns.catplot(
    data=heart,
    x="output",
    y="age",
    hue="sex",
    kind="swarm",
)

# Most of the heart attack cases are reported in both  males and females of age above 40 years.

# We can conclude that females are more vulnerable to heart attack than males

In [None]:
# Swarm plot of the "trtbps" column for each outcome, coloured by sex.
sns.catplot(
    data=heart,
    x="output",
    y="trtbps",
    hue="sex",
    kind="swarm",
)

# the blood pressure level is  between 120 and 140 in most of the heart attack cases reported.


In [None]:
# Swarm plot of the "chol" column for each outcome, coloured by sex.
sns.catplot(
    data=heart,
    x="output",
    y="chol",
    hue="sex",
    kind="swarm",
)

# We can see only few data points having chol value greater than 420. We can remove these data points as they are outliers.

In [None]:
# Remove the rows whose "chol" value exceeds 420 from the working frame, in place.
chol_outlier_rows = heart.loc[heart["chol"].gt(420)].index
heart.drop(index=chol_outlier_rows, inplace=True)

# We can also use boxplot to identify the outliers. Now let us again plot the output vs chol

In [None]:
# Same "chol" swarm plot, drawn again from the current state of `heart`.
sns.catplot(
    data=heart,
    x="output",
    y="chol",
    hue="sex",
    kind="swarm",
)

# The cholesterol level is between 200 and 300 in most of the heart attack cases reported.

In [None]:
# Swarm plot of the "thalachh" column for each outcome, coloured by sex.
sns.catplot(
    data=heart,
    x="output",
    y="thalachh",
    hue="sex",
    kind="swarm",
)

# only few of the heart attack cases have been reported with a low value of maximum heart rate achieved. In most of the cases, the maximum heart rate is above 140.

In [None]:
# Remove the rows whose "oldpeak" value exceeds 4.5, then plot what remains.
oldpeak_outlier_rows = heart.loc[heart["oldpeak"].gt(4.5)].index
heart.drop(index=oldpeak_outlier_rows, inplace=True)
sns.catplot(
    data=heart,
    x="output",
    y="oldpeak",
    hue="sex",
    kind="swarm",
)

# It can be clearly seen that in approximately 99% cases of heart attack reported the old peak is below 2.

In [None]:
# One pie per "cp" value showing the split of "output" within that value
# (normalize="columns" makes every crosstab column sum to 1).
label = ["less chance", "more chance"]
df = pd.crosstab(columns=heart["cp"], index=heart["output"], normalize="columns")
# The display names are assigned positionally, so exactly four "cp" categories are expected.
df.columns = ["typical angina", "atypical angina", "non-anginal pain", "asymptomatic"]
df.index = ["", ""]  # blank wedge labels; the shared legend names the two outcomes
df.plot.pie(subplots=True, explode=(0.05, 0), autopct='%1.1f%%', shadow=True, startangle=140, legend=False, figsize=(11, 6))
plt.legend(label, bbox_to_anchor=(1, 0.5), loc="center right", fontsize=12, bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.8)
# Figure-level title: plt.title would attach the text to the last pie only.
plt.suptitle("Effect of chest pain on heart attack", fontsize=16, fontname="Times New Roman", weight="bold")

# Persons whose chest pain is of the typical angina type are less vulnerable to heart attack than persons with the other types of chest pain.
# If a person has chest pain other than typical angina, there is a 69.6% to 81.6% chance of heart attack.

In [None]:
# Row-normalised outcome shares for every (sex, cp) combination, shown in percent.
cp_by_sex = pd.crosstab(
    index=[heart["sex"], heart["cp"]],
    columns=heart["output"],
    normalize="index",
)
cp_by_sex * 100

# Again, if we split by sex, a female having any of the 4 types of chest pain is more vulnerable to heart attack than a male. If a female has chest pain, there is a 48.6% to 100% chance that this pain will be followed by a heart attack, and if the pain is of any type other than typical angina the chance of heart attack is 88.8% to 100%. On the other hand, in the case of males there is a 20.7% to 77.4% chance that
# chest pain is followed by a heart attack. If the pain is of any type other than typical
# angina, the chance of heart attack is 63.1% to 77.4%.

In [None]:
# Row-normalised outcome shares (fractions) for every (sex, fbs) combination.
fbs_by_sex = pd.crosstab(
    index=[heart["sex"], heart["fbs"]],
    columns=heart["output"],
    normalize="index",
)
fbs_by_sex

# A female with a low or a high blood sugar level has
# an 80% or a 50% chance of heart attack, respectively.
# On the other hand, a male with a low or a high blood sugar level has
# a 42.6% or a 54.8% chance of heart attack, respectively.
# A low blood sugar level is a serious problem in the case of females and may
# result in heart attack

In [None]:
# One pie per "restecg" value showing the split of "output" within that value
# (normalize="columns" makes every crosstab column sum to 1).
label = ["less chance", "more chance"]
df = pd.crosstab(columns=heart["restecg"], index=heart["output"], normalize="columns")
# The display names are assigned positionally, so exactly two "restecg" categories are expected.
df.columns = ["normal", "ST-T wave abnormality"]
df.index = ["", ""]  # blank wedge labels; the shared legend names the two outcomes
df.plot.pie(subplots=True, explode=(0.05, 0), autopct='%1.1f%%', shadow=True, startangle=140, legend=False, figsize=(11, 6))
plt.legend(label, bbox_to_anchor=(1, 0.5), loc="center right", fontsize=12, bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.8)
# Figure-level title: plt.title would attach the text to the last pie only.
plt.suptitle("Electrocardiographic results analysis on heart attack", fontsize=16, fontname="Times New Roman", weight="bold")

# A person having  ST-T wave abnormality will have 63% chance of heart attack
# that means if  resting electrocardiographic shows ST-T wave abnormality
# then chances of heart attack will be more.

In [None]:
# Row-normalised outcome shares for every (sex, restecg) combination, shown in percent.
restecg_by_sex = pd.crosstab(
    index=[heart["sex"], heart["restecg"]],
    columns=heart["output"],
    normalize="index",
)
restecg_by_sex * 100

# In the case of females, ST-T wave abnormality increases the chance of
# heart attack to 81.6%, while in the case of males it increases the chance to 53.6%


In [None]:
# One pie per "caa" value showing the split of "output" within that value
# (normalize="columns" makes every crosstab column sum to 1).
label = ["less chance", "more chance"]
df = pd.crosstab(columns=heart["caa"], index=heart["output"], normalize="columns")
# The display names are assigned positionally, so exactly four "caa" categories are expected.
df.columns = ["0 vessel", "1 vessel", "2 vessels", "3 vessels"]
df.index = ["", ""]  # blank wedge labels; the shared legend names the two outcomes
df.plot.pie(subplots=True, explode=(0.05, 0), autopct='%1.1f%%', shadow=True, startangle=140, legend=False, figsize=(11, 6))
plt.legend(label, bbox_to_anchor=(1, 0.5), loc="center right", fontsize=12, bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.8)
# Figure-level title: plt.title would attach the text to the last pie only.
plt.suptitle("Role of number of major vessels on heart attack", fontsize=16, fontname="Times New Roman", weight="bold")

# A person having less vessels will have more chances of heart attack. The
# chances of heart attack decreases from 75.1% to 16.7% with increase in
# number of major vessels from 0 to 3

In [None]:
# Row-normalised outcome shares (fractions) for every (sex, caa) combination.
caa_by_sex = pd.crosstab(
    index=[heart["sex"], heart["caa"]],
    columns=heart["output"],
    normalize="index",
)
caa_by_sex

# A similar trend can be observed in case of both male and female
# lesser the number of vessels more is the chances of heart attack.

In [None]:
# Row-normalised outcome shares (fractions) for every (sex, thall) combination.
thall_by_sex = pd.crosstab(
    index=[heart["sex"], heart["thall"]],
    columns=heart["output"],
    normalize="index",
)
thall_by_sex

# when thall rate achieved is 2 the chances of heart attack will
# be more. However, when the thall rate is 1 and 3 the chances
# of heart attack are less.

# There is also correlation between different factors such as chest pain,
# blood sugar, cholesterol etc., which helps us in analysing the actual
# cause of the heart attack.

In [None]:
# Correlation heatmap of the columns left after excluding the ones listed below.
excluded_columns = [
    "sex", "cp", "fbs", "caa", "slp",
    "restecg", "exng", "thall", "output", "oldpeak",
]
df = heart.drop(columns=excluded_columns)
corrmat = df.corr()
sns.heatmap(corrmat, annot=True)

# We can see that with age, cholesterol and blood pressure increase while maximum heart rate decreases.
# Cholesterol and blood pressure have an almost negligible effect on maximum heart rate.
# Cholesterol and blood pressure have a correlation factor of only 0.12, so they are
# positively related to each other but do not affect each other significantly

 
# Conclusion: A person aged 40 years or more who has (i) chest pain other than
# typical angina, (ii) a low blood sugar level, (iii) a low number of major vessels
# (0 rather than 2 or 3) and (iv) a thall rate of 2 will be extremely vulnerable to
# heart attack. Vulnerability will be higher in the case of females than males.

# Regression Analysis

In [None]:
# Feature matrix (every column except "output") and target vector as NumPy arrays.
x = heart.drop(columns="output").values
y = heart["output"].values

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Class-weighted logistic regression; fit() returns the estimator itself,
# so `model` and `LR` refer to the same fitted object.
LR = LogisticRegression(solver="lbfgs", class_weight="balanced", max_iter=10000)
model = LR.fit(x_train, y_train)

prediction = LR.predict(x_test)
print("Confusion matrix", confusion_matrix(y_test, prediction), sep="\n")

In [None]:
# Per-class precision / recall / F1 on the held-out split, printed under a heading.
print("Classification Report", classification_report(y_test, prediction), sep="\n")

In [None]:
# 10-fold cross-validation of the classifier.
# * cross_val_score refits a clone of `model` on every fold, so it is run on the
#   training split; running it on the small held-out test split would train each
#   fold on only a handful of rows and spend the data reserved for final evaluation.
# * For a classifier the default score is accuracy, not R^2, so the metric is
#   requested and named explicitly.
cv_accuracy = cross_val_score(model, x_train, y_train, cv=10, scoring="accuracy").mean()
# Mean squared error of the predicted class labels; for 0/1 labels this equals the
# misclassification rate.
cv_mse = -cross_val_score(model, x_train, y_train, cv=10, scoring="neg_mean_squared_error").mean()
print("Mean 10-fold CV accuracy:", cv_accuracy.round(3))
print("RMSE of predicted class labels:", np.sqrt(cv_mse).round(3))
# Original variable names, kept for any later cell that still reads them.
R2, error = cv_accuracy, cv_mse

# Better predictions than these could be made with a larger dataset.

In [None]:
# Predicted probability of the second class (column 1 of predict_proba) for the held-out rows.
P = LR.predict_proba(x_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, P)

# ROC curve with the chance diagonal drawn as a dashed red line for reference.
plt.figure()
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], "r--")
plt.xlim(0, 1)
plt.ylim(0, 1.05)
plt.xlabel("False Positive Rate", fontsize=12)
plt.ylabel("True Positive Rate", fontsize=12)
plt.show()

In [None]:
# Area under the ROC curve computed from the fpr/tpr points, rounded to 3 decimals.
area = auc(fpr, tpr)
print("Area under the curve is", area.round(3), sep="\n")