# The World Health Organization estimates that 12 million deaths occur worldwide every year due to heart disease.

The summary of this notebook:

* Feature Engineering.

* Relationship between education and cigsPerDay.

* Relationship between age and cigsPerDay, totChol, glucose.

* Which gender has more risk of coronary heart disease CHD.

* Which age group has more smokers.

* Relation between cigsPerDay and risk of coronary heart disease.

* Relation between sysBP and risk of CHD.

* Relation between diaBP and risk of CHD.

* Modelling.

In [None]:
#importing the necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load framingham.csv into a DataFrame and display it.

location = "../input/heart-disease-prediction-using-logistic-regression/framingham.csv"
data = pd.read_csv(filepath_or_buffer=location)
data

# Feature Engineering

In [None]:
# Count the missing (NaN) entries in every column of the raw dataset.

data.isna().sum()

In [None]:
# Show the dtype pandas inferred for each column of the raw dataset.
data.dtypes

**Using KNNImputer to fill the null values.**

In [None]:
from sklearn.impute import KNNImputer

# Build one 0/1 "<column> nan" flag per original column, marking where a value
# was missing before imputation. The flags are assembled in a separate frame
# instead of being written back into `data`, so re-running this cell does not
# pile up "<column> nan nan" columns.
col_names = data.columns
nan_flags = data[col_names].isnull().astype(int).add_suffix(" nan")
data_with_flags = pd.concat([data[col_names], nan_flags], axis=1)

# Fill every missing value from the 3 nearest rows.
imputer = KNNImputer(n_neighbors=3)
df = pd.DataFrame(
    imputer.fit_transform(data_with_flags),
    columns=data_with_flags.columns,
)

df

In [None]:
# Confirm that no NaN values remain after imputation.

df.isna().sum()

**Removing the outliers.**

In [None]:
# Replace extreme values in each numeric column with that column's mean.
# The fences are built from the 5th and 95th percentiles, widened by 1.5x
# their distance on each side.
numeric_columns = df.select_dtypes(exclude="O")

for feature in numeric_columns:
    column = df[feature]
    low_q = column.quantile(0.05)
    high_q = column.quantile(0.95)
    spread = high_q - low_q

    # When both percentiles coincide (e.g. a flag that is almost always 0) the
    # fences collapse to a single value and every other value would be treated
    # as an outlier, wiping out the minority class. Leave such columns as-is.
    if spread == 0:
        continue

    lower_tail = low_q - 1.5 * spread
    upper_tail = high_q + 1.5 * spread

    # One vectorised mask instead of a per-value replace loop; the mean is
    # taken from the column before any value is overwritten.
    is_outlier = (column < lower_tail) | (column > upper_tail)
    df.loc[is_outlier, feature] = column.mean()

Now our data is ready for further use: 

**Relationship between education and cigsPerDay**

In [None]:
# Mean cigsPerDay for every education value, returned as a two-column frame.

graph_1 = df.groupby("education", as_index=False)["cigsPerDay"].mean()

In [None]:

# Scatter the per-education means with a fitted regression line.
plt.figure(figsize=(12, 8))
sns.regplot(data=graph_1, x="education", y="cigsPerDay")
plt.title("Graph showing cigsPerDay in every level of education.")
# Explicit labels override the ones seaborn derives from the column names.
plt.xlabel("education", size=20)
plt.ylabel("cigsPerDay", size=20)
plt.xticks(size=12)
plt.yticks(size=12)

No clear linear relationship is found.
Education level 3 shows the lowest mean cigsPerDay.

**Relationship between age and cigsPerDay, totChol, glucose.**

In [None]:
# Line graph of the per-age mean of cigsPerDay, totChol and glucose, to check
# how each of them changes with age.

graph_3 = df.groupby("age").cigsPerDay.mean()
graph_4 = df.groupby("age").totChol.mean()
graph_5 = df.groupby("age").glucose.mean()

plt.figure(figsize=(12,8))
sns.lineplot(data=graph_3, label="cigsPerDay")
sns.lineplot(data=graph_4, label="totChol")
sns.lineplot(data=graph_5, label="glucose")
# The title names all three plotted series, and the y-axis is labelled as a
# mean because each line is a per-age average, not a count.
plt.title("Graph showing mean cigsPerDay, totChol and glucose in every age group.")
plt.xlabel("age", size=20)
plt.ylabel("mean value", size=20)
plt.xticks(size=12)
plt.yticks(size=12)

We see a minor relation between totChol and glucose.

**Which gender has more risk of coronary heart disease CHD**

In [None]:
# Total TenYearCHD per value of the "male" column (sum of the target by gender).

graph_6 = df.groupby("male", as_index=False)["TenYearCHD"].sum()

In [None]:
# Bar chart of the summed TenYearCHD values for each gender.

plt.figure(figsize=(12, 8))
sns.barplot(data=graph_6, x="male", y="TenYearCHD")
plt.title("Graph showing which gender has more risk of coronary heart disease CHD")
# Explicit labels override the ones seaborn derives from the column names.
plt.xlabel("0 is female and 1 is male", size=20)
plt.ylabel("total cases", size=20)
plt.xticks(size=12)
plt.yticks(size=12)

According to this dataset, males have a slightly higher risk of coronary heart disease (CHD).

**Which age group has more smokers.**

In [None]:
# Sum currentSmoker within each age and draw the totals as a bar chart.

graph_7 = df.groupby("age", as_index=False)["currentSmoker"].sum()

plt.figure(figsize=(12, 8))
sns.barplot(data=graph_7, x="age", y="currentSmoker")
plt.title("Graph showing which age group has more smokers.")
# Explicit labels override the ones seaborn derives from the column names.
plt.xlabel("age", size=20)
plt.ylabel("currentSmokers", size=20)
plt.xticks(size=12)
plt.yticks(size=12)

Mid-age groups have more smokers

**Relation between cigsPerDay and risk of coronary heart disease.**

In [None]:
# Mean cigsPerDay for each TenYearCHD value, drawn as a bar chart.
graph_8 = df.groupby("TenYearCHD", as_index=False).cigsPerDay.mean()

plt.figure(figsize=(12,8))
sns.barplot(x=graph_8["TenYearCHD"], y=graph_8["cigsPerDay"])
plt.title("Graph showing the relation between cigsPerDay and risk of coronary heart disease.")
# The x-axis shows the TenYearCHD target, i.e. the risk of CHD.
plt.xlabel("Risk of CHD", size=20)
plt.ylabel("cigsPerDay", size=20)
plt.xticks(size=12)
plt.yticks(size=12)

High cigsPerDay comes with higher risk of CHD.

**Relation between sysBP and risk of CHD.**

In [None]:
# Mean sysBP for each TenYearCHD value, drawn as a bar chart.

graph_9 = df.groupby("TenYearCHD", as_index=False).sysBP.mean()

plt.figure(figsize=(12,8))
sns.barplot(x=graph_9["TenYearCHD"], y=graph_9["sysBP"])
plt.title("Graph showing the relation between sysBP and risk of CHD")
# The x-axis shows the TenYearCHD target, i.e. the risk of CHD.
plt.xlabel("Risk of CHD", size=20)
plt.ylabel("sysBP", size=20)
plt.xticks(size=12)
plt.yticks(size=12)

A minor relation is found between higher sysBP and a higher risk of CHD.

**Relation between diaBP and risk of CHD**

In [None]:
# Mean diaBP for each TenYearCHD value, drawn as a bar chart.
# Note: this reassigns graph_9, which the sysBP cell also uses.

graph_9 = df.groupby("TenYearCHD", as_index=False).diaBP.mean()

plt.figure(figsize=(12,8))
sns.barplot(x=graph_9["TenYearCHD"], y=graph_9["diaBP"])
plt.title("Graph showing the relation between diaBP and risk of CHD")
# The x-axis shows the TenYearCHD target, i.e. the risk of CHD.
plt.xlabel("Risk of CHD", size=20)
plt.ylabel("diaBP", size=20)
plt.xticks(size=12)
plt.yticks(size=12)

A minor relation is found between higher diaBP and a higher risk of CHD.

# Modelling

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Features: every column except the target and its missing-value flag.
X = df.drop(columns=["TenYearCHD", "TenYearCHD nan"])

# y is the target variable (risk of CHD)
y = df["TenYearCHD"]

# Hold out 20% of the rows for evaluation.
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.2)

# Three candidate classifiers, all with default settings.
model1 = LogisticRegression()
model2 = XGBClassifier()
model3 = RandomForestClassifier()

model1.fit(train_X, train_y)
model2.fit(train_X, train_y)
model3.fit(train_X, train_y)

# Accuracy of each model on the held-out rows.
score1 = model1.score(test_X, test_y)
score2 = model2.score(test_X, test_y)
# score3 must come from the random forest (model3), not from XGBoost (model2);
# otherwise the random forest is never evaluated.
score3 = model3.score(test_X, test_y)

print("accuracy of logistic regression is ", score1, "accuracy of xgboost is ", score2, "accuracy of random forest Clssifier is", score3)

**We will go with Logistic regression since we are getting better accuracy.**

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the logistic-regression model, measured on the
# held-out split and displayed with the feature column names.
feature_labels = test_X.columns.tolist()
perm = PermutationImportance(model1, random_state=1)
perm = perm.fit(test_X, test_y)
eli5.show_weights(perm, feature_names=feature_labels)

These are the most related features.

In [None]:
# Columns kept as model inputs for the tuned model.
features = ["sysBP", "age", "prevalentHyp", "diaBP", "cigsPerDay"]

# NOTE: `df` is narrowed to the selected columns here, so cells above that
# read other columns of `df` will fail if re-run after this cell.
df = df[features]

new_train_x, new_train_y = df, y

# Fresh 80/20 split on the reduced feature set.
Train_X, Test_X, Train_y, Test_y = train_test_split(
    new_train_x,
    new_train_y,
    test_size=0.2,
)

In [None]:
model = LogisticRegression()

# Search space for the randomized search.
# Only settings that change the fitted model are sampled:
#   * C        - inverse regularisation strength
#   * max_iter - iteration budget of the solver
#   * solver   - optimisation algorithm
# The penalty is left at its default ("l2"), which every listed solver accepts,
# so no sampled candidate is an invalid solver/penalty combination.
# random_state, n_jobs and verbose are not sampled because they are not
# hyperparameters of the model.
params = {
    "C"        : [0.01, 0.1, 1, 10, 100],
    "max_iter" : [30, 40, 50, 100, 150, 200],
    "solver"   : ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    }

from sklearn.model_selection import RandomizedSearchCV

# Evaluate 5 randomly drawn candidates with 5-fold cross-validation.
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=5, cv=5)

random_search.fit(Train_X, Train_y)

In [None]:
# Display the estimator that the randomized search selected.
random_search.best_estimator_

In [None]:
# Use the estimator chosen by the randomized search rather than hand-copying
# its parameters: the search is random, so a pasted parameter list goes stale
# on the next run. With the search's default refit behaviour, best_estimator_
# is already fitted on Train_X / Train_y, so no extra fit call is needed.
model = random_search.best_estimator_

# Accuracy on the held-out split.
score = model.score(Test_X, Test_y)

print("The accuracy of our model is ", score)