# The World Health Organization estimates that 12 million deaths occur worldwide every year due to heart disease.

The summary of this notebook:

* Data cleaning.

* Relationship between education and cigsPerDay.

* Relationship between age and cigsPerDay, totChol, glucose.

* Which gender has more risk of coronary heart disease CHD.

* Which age group has more smokers.

* Relation between cigsPerDay and risk of coronary heart disease.

* Relation between sysBP and risk of CHD.

* Relation between diaBP and risk of CHD.

* Predicting the risk of CHD with Logistic Regression (85% accuracy).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load framingham.csv from the Kaggle input directory into a DataFrame.
df = pd.read_csv("../input/heart-disease-prediction-using-logistic-regression/framingham.csv")
# Preview only the first rows instead of rendering the whole frame.
df.head()

# Data cleaning

In [None]:
# Report, column by column, whether any entry is missing (NaN).
df.isna().any()

In [None]:
# Summary statistics for education, inspected before imputing its missing values.
df.education.describe()

In [None]:
# Impute missing education values with the column mean rounded to the nearest
# whole level (mean = 1.9 ~ 2). The value is computed from the data rather
# than hard-coded, so it stays correct if the dataset changes.
df['education'] = df['education'].fillna(np.round(df['education'].mean()))

In [None]:
# Summary statistics for cigsPerDay, inspected before imputing its missing values.
df.cigsPerDay.describe()

In [None]:
# Overwrite only the missing cigsPerDay entries with the mean of the observed values.
df.loc[df['cigsPerDay'].isna(), 'cigsPerDay'] = df['cigsPerDay'].mean()

In [None]:
# Summary statistics for BPMeds, inspected before imputing its missing values.
df.BPMeds.describe()

In [None]:
# NOTE(review): BPMeds appears to be a 0/1 indicator - confirm against the
# describe() output of the previous cell. Filling with the mean would insert a
# fractional value that is not a valid category, so the most frequent value
# (mode) is used instead.
df['BPMeds'] = df['BPMeds'].fillna(df['BPMeds'].mode().iloc[0])

In [None]:
# Summary statistics for totChol, inspected before imputing its missing values.
df.totChol.describe()

In [None]:
# Keep observed totChol values; substitute the column mean where the value is missing.
df['totChol'] = df['totChol'].where(df['totChol'].notna(), df['totChol'].mean())

In [None]:
# Summary statistics for BMI, inspected before imputing its missing values.
df.BMI.describe()

In [None]:
# Overwrite only the missing BMI entries with the mean of the observed values.
df.loc[df['BMI'].isna(), 'BMI'] = df['BMI'].mean()

In [None]:
# Summary statistics for heartRate, inspected before imputing its missing values.
df.heartRate.describe()

In [None]:
# Keep observed heartRate values; substitute the column mean where the value is missing.
df['heartRate'] = df['heartRate'].where(df['heartRate'].notna(), df['heartRate'].mean())

In [None]:
# Summary statistics for glucose, inspected before imputing its missing values.
df.glucose.describe()

In [None]:
# Overwrite only the missing glucose entries with the mean of the observed values.
df.loc[df['glucose'].isna(), 'glucose'] = df['glucose'].mean()

In [None]:
# Re-run the per-column missing-value check after the imputation steps.
df.isna().any()

Now our data is ready for further use: 

# Relationship between education and cigsPerDay

No linear relationship was found.
Education level 3 shows the lowest mean cigsPerDay.

# Relationship between age and cigsPerDay, totChol, glucose.

In [None]:
# Mean cigsPerDay, totChol and glucose for each age.
line1 = df.groupby('age').cigsPerDay.mean()
line2 = df.groupby('age').totChol.mean()
line3 = df.groupby('age').glucose.mean()

# Draw the three per-age mean curves on one set of axes.
plt.figure(figsize=(10,8))
sns.lineplot(data=line1, label='cigsPerDay')
sns.lineplot(data=line2, label='totChol')
sns.lineplot(data=line3, label='glucose')
plt.title('Cigarettes per day, Total Cholesterol and Glucose in every age group')
plt.xlabel('Age', size=15)
# The plotted values are per-age means, not counts, so label the axis accordingly.
plt.ylabel('Mean value', size=15)
plt.xticks(size=12)
# Trailing semicolon keeps the tick-object repr out of the cell output.
plt.yticks(size=12);

We see a minor relation between totChol and glucose.

# Which gender has more risk of coronary heart disease CHD

In [None]:
# CHD cases per value of the male flag.
# TenYearCHD keeps the summed case count that the bar chart below plots.
# CHD_rate adds the share of each group with CHD, because raw counts alone are
# not comparable when the two groups contain different numbers of people.
graph = df.groupby("male", as_index=False).agg(
    TenYearCHD=("TenYearCHD", "sum"),
    CHD_rate=("TenYearCHD", "mean"),
)
graph

In [None]:
# Bar chart of the summed TenYearCHD value for each value of the male flag.
fig = plt.figure(figsize=(10,6))
# The style is set before the axes exist so the whitegrid look applies to them.
sns.set(style="whitegrid")
ax = fig.add_subplot()
sns.barplot(data=graph, x='male', y='TenYearCHD', palette="Blues_d", ax=ax)
ax.set_title('Risk of Coronary Heart Disease (CHD) in males and females')
# The x-axis label doubles as the category legend for the two bars.
ax.set_xlabel('Female  Male', size=15)
ax.set_ylabel('TenYearCHD', size=15)
plt.xticks(size=12)
plt.yticks(size=12)



According to this dataset, males have a slightly higher risk of coronary heart disease (CHD).

# Which age group has more smokers.

In [None]:
# Sum the currentSmoker flag within each age to find which ages have the most smokers.
graph1 = df.groupby("age", as_index=False)["currentSmoker"].sum()
graph1

In [None]:
# Bar chart of the summed currentSmoker value at each age.
fig, ax = plt.subplots(figsize=(12,8))
sns.barplot(data=graph1, x='age', y='currentSmoker', ax=ax)
ax.set_title('Age group having more smokers')
ax.set_xlabel('Age', size=15)
ax.set_ylabel('Total Smokers', size=15)
plt.xticks(size=12)
plt.yticks(size=12)

Middle-aged groups have more smokers.

# Relation between cigsPerDay and risk of coronary heart disease.

In [None]:
# Mean cigsPerDay for each TenYearCHD outcome.
graph_2 = df.groupby("TenYearCHD", as_index=False)["cigsPerDay"].mean()

# Compare the two outcome groups side by side.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=graph_2, x="TenYearCHD", y="cigsPerDay", palette="rocket", ax=ax)
ax.set_title("Relation between cigsPerDay and risk of coronary heart disease.")
ax.set_xlabel("Risk of CHD", size=15)
ax.set_ylabel("cigsPerDay", size=15)
plt.xticks(size=12)
plt.yticks(size=12)

High cigsPerDay comes with higher risk of CHD.

# Relation between sysBP and risk of CHD.

In [None]:
# Mean sysBP for each TenYearCHD outcome.
graph_3 = df.groupby("TenYearCHD", as_index=False)["sysBP"].mean()

# Compare the two outcome groups side by side.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=graph_3, x="TenYearCHD", y="sysBP", palette="vlag", ax=ax)
ax.set_title("Graph showing the relation between sysBP and risk of CHD")
ax.set_xlabel("Risk of CHD", size=15)
ax.set_ylabel("sysBP", size=15)
plt.xticks(size=12)
plt.yticks(size=12)

A minor relation was found between higher sysBP and a higher risk of CHD.

# Relation between diaBP and risk of CHD

In [None]:
# Mean diaBP for each TenYearCHD outcome.
graph_4 = df.groupby("TenYearCHD", as_index=False)["diaBP"].mean()

# Compare the two outcome groups side by side.
fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(data=graph_4, x="TenYearCHD", y="diaBP", palette="deep", ax=ax)
ax.set_title("Graph showing the relation between diaBP and risk of CHD")
ax.set_xlabel("Risk of CHD", size=15)
ax.set_ylabel("diaBP", size=15)
plt.xticks(size=12)
plt.yticks(size=12)

A minor relation was found between higher diaBP and a higher risk of CHD.

# Predicting the risk of CHD with Logistic Regression. (85% accuracy)

In [None]:
# Target is the TenYearCHD column; features are every remaining column.
Y_train = df['TenYearCHD']
X_train = df.drop(columns=['TenYearCHD'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with a fixed seed and an iteration limit of 1000.
logreg = LogisticRegression(max_iter=1000, random_state=3)

In [None]:
# Hold out half of the rows as a test set.
# - Features and target are taken from df directly, so re-running this cell
#   always splits the full dataset rather than an already-split X_train.
# - random_state pins the shuffle so the reported accuracy is reproducible.
# - stratify keeps the TenYearCHD class proportions the same in both halves.
X_train, x_test, Y_train, y_test = train_test_split(
    df.drop(columns='TenYearCHD'),
    df['TenYearCHD'],
    test_size=0.5,
    random_state=3,
    stratify=df['TenYearCHD'],
)

In [None]:
# Train the classifier on the training split.
logreg.fit(X=X_train, y=Y_train)

In [None]:
# Predict labels for the held-out rows and report the accuracy on them.
y_pred = logreg.predict(x_test)
test_accuracy = logreg.score(x_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
# Display the labels predicted for the test split.
y_pred

In [None]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name so the imported confusion_matrix
# function is not overwritten and stays callable in later cells.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the test-set predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)