Our goal is to predict whether a patient has a 10-year risk of future coronary heart disease (CHD). The dataset provides the patients’ information. It includes over 4,000 records and 15 attributes.

Variables:
* Sex: male or female (Nominal)
* Age: age of the patient
* Current Smoker: whether or not the patient is a current smoker (Nominal)
* Cigs Per Day: the number of cigarettes that the person smoked on average in one day.
* BP Meds: whether or not the patient was on blood pressure medication (Nominal)
* Prevalent Stroke: whether or not the patient had previously had a stroke (Nominal)
* Prevalent Hyp: whether or not the patient was hypertensive (Nominal)
* Diabetes: whether or not the patient had diabetes (Nominal)
* Tot Chol: total cholesterol level (Continuous)
* Sys BP: systolic blood pressure (Continuous)
* Dia BP: diastolic blood pressure (Continuous)
* BMI: Body Mass Index (Continuous)
* Heart Rate: heart rate (Continuous)
* Glucose: glucose level (Continuous)
* 10 year risk of coronary heart disease CHD (binary: “1”, means “Yes”, “0” means “No”) (Target Variable)

IMPORTING MODULES

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

READING DATA

In [None]:
# Load the Framingham dataset from the Kaggle input directory and preview the first rows.
DATA_PATH = "../input/heart-disease-prediction-using-logistic-regression/framingham.csv"
heart = pd.read_csv(DATA_PATH)
heart.head()

In [None]:
# Preview the last rows of the freshly loaded frame.
heart.tail()

In [None]:
# Summary statistics of the columns, used as a quick sanity check of value ranges.
heart.describe()

Renaming the column 'male' to 'sex' for convenience

In [None]:
# Rename the 'male' column to 'sex'; `heart` is rebound to the renamed frame
# rather than being modified in place.
heart = heart.rename(columns={"male": "sex"})

In [None]:
# (rows, columns) of the data before any rows with missing values are dropped.
heart.shape

DROPPING NULL VALUES

In [None]:
# Number of missing values in each column.
heart.isna().sum()

In [None]:
# Discard every row that has at least one missing value, then show the
# resulting (rows, columns).
heart = heart.dropna(axis="index", how="any")
heart.shape

In [None]:
# Column dtypes and non-null counts after the rows with missing values were dropped.
heart.info()

In [None]:
# Separate copy used for the relabelling and plots below; the modelling cells
# keep working on `heart` itself.
heartcopy=heart.copy()

Changing data type to 'categorical' for nominal variables for better visualisation

In [None]:
# Nominal columns (the target included) to be stored as pandas categoricals.
categorical = [
    "sex",
    "education",
    "currentSmoker",
    "BPMeds",
    "prevalentStroke",
    "prevalentHyp",
    "diabetes",
    "TenYearCHD",
]

# Cast them all in one astype call via a column -> dtype mapping; each column
# is converted on its own, so every column gets its own set of categories.
heartcopy = heartcopy.astype({column: "category" for column in categorical})
heartcopy.info()

In [None]:
# Replace the 0/1 codes of the categorical columns with readable labels for plotting.
#
# `cat.rename_categories` is the supported way to rename the categories of a
# categorical column; `Series.replace` on categorical data is deprecated for
# that purpose in recent pandas releases. The category order is preserved, so
# the code-0 label still comes first in every plot.
yes_no_labels = {0: "No", 1: "Yes"}
category_labels = {
    "sex": {0: "Female", 1: "Male"},
    "currentSmoker": yes_no_labels,
    "BPMeds": yes_no_labels,
    "prevalentStroke": yes_no_labels,
    "prevalentHyp": yes_no_labels,
    "diabetes": yes_no_labels,
    "TenYearCHD": yes_no_labels,
}

for column, labels in category_labels.items():
    heartcopy[column] = heartcopy[column].cat.rename_categories(labels)

heartcopy.head()

In [None]:
# One count plot per categorical risk factor, with bars split by ten-year CHD outcome.
# Each entry is (column in `heartcopy`, panel title).
panels = [
    ("sex", "Sex"),
    ("currentSmoker", "Current Smoker"),
    ("BPMeds", "BP Meds"),
    ("prevalentStroke", "Prevalent Stroke"),
    ("prevalentHyp", "Prevalent Hyp"),
    ("diabetes", "Diabetes"),
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle("Ten-year CHD outcome by categorical risk factor")

# axes.flat walks the 2x3 grid row by row, which matches the order of `panels`.
for ax, (column, title) in zip(axes.flat, panels):
    sns.countplot(ax=ax, x=column, hue="TenYearCHD", data=heartcopy)
    ax.set_title(title)

In [None]:
# Predictor columns fed to the first model: every column except the target.
feature_name = [
    "sex",
    "age",
    "education",
    "currentSmoker",
    "cigsPerDay",
    "BPMeds",
    "prevalentStroke",
    "prevalentHyp",
    "diabetes",
    "totChol",
    "sysBP",
    "diaBP",
    "BMI",
    "heartRate",
    "glucose",
]

In [None]:
# Divide each predictor by its own standard deviation (no mean-centering).
# NOTE(review): the std is computed over the full dataset, i.e. before the
# train/test split performed further down — confirm this is intended.
for column in feature_name:
    column_std = heart[column].std()
    heart[column] = heart[column].div(column_std)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing.
# random_state pins the shuffle so the split (and every score derived from it)
# is the same on each Restart & Run All; stratify keeps the share of CHD-positive
# rows equal in the train and test partitions.
target = heart["TenYearCHD"].values
x_train, x_test, y_train, y_test = train_test_split(
    heart[feature_name].values,
    target,
    test_size=0.25,
    random_state=42,
    stratify=target,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings, fitted on the training split.
# fit() hands back the estimator, so `model` ends up bound to the fitted object.
model = LogisticRegression()
model = model.fit(x_train, y_train)

In [None]:
# Accuracy on the training split, expressed as a percentage.
train_accuracy = model.score(x_train, y_train)
score = train_accuracy * 100
print(score)

In [None]:
# Intercept and per-feature coefficients of the fitted model.
# `w0` and `w` keep their names so any cell that uses them keeps working.
w0 = model.intercept_[0]
w = model.coef_[0]

# Build "y=w0+(w1*x1)+...+(wn*xn)" from however many coefficients the model
# has, instead of unpacking a fixed number of them by hand (which raises as
# soon as the feature list changes length).
terms = ["(%f*x%d)" % (coef, position) for position, coef in enumerate(w, start=1)]
equation = "+".join(["y=%f" % w0] + terms)
print(equation)

In [None]:
import math  # retained so any other cell relying on `math` still finds it

# exp(coefficient) is the factor by which the odds of the positive class are
# multiplied per unit increase of the corresponding (scaled) feature.
# The coefficients are read straight from the model so this cell does not
# depend on variables left behind by another cell.
# NOTE(review): sorting by exp(coefficient) ranks strongly negative
# coefficients lowest; confirm that is the intended notion of "importance".
coefficients = model.coef_[0]
feature_importance = pd.DataFrame(
    {"Features": feature_name, "Importance": np.exp(coefficients)}
)
feature_importance = feature_importance.sort_values(by=["Importance"])
feature_importance

In [None]:
# Horizontal bar per feature showing exp(coefficient), in the sorted order of
# `feature_importance`. All rows are plotted, so no slice is needed.
fig, ax = plt.subplots()
ax.barh(feature_importance["Features"], feature_importance["Importance"])
ax.set_xlabel("exp(coefficient)")
ax.set_ylabel("Feature")
ax.set_title("Logistic regression feature importance");

In [None]:
# Accuracy on the held-out test split, expressed as a percentage.
test_accuracy = model.score(x_test, y_test)
print(test_accuracy * 100)

In [None]:
# Reduced set of predictor columns used for the second model.
new_features = [
    "prevalentStroke",
    "prevalentHyp",
    "BPMeds",
    "currentSmoker",
    "totChol",
    "glucose",
    "cigsPerDay",
    "sex",
    "sysBP",
    "age",
]

In [None]:
# Divide each selected predictor by its own standard deviation.
# NOTE(review): every column in `new_features` also appears in `feature_name`
# and was already divided by its std in an earlier cell, so this pass should
# leave the values essentially unchanged — confirm whether it is still needed.
for column in new_features:
    column_std = heart[column].std()
    heart[column] = heart[column].div(column_std)

In [None]:
# Refit on the reduced feature set, holding out 20% of the rows for testing.
# random_state pins the shuffle so the scores are reproducible on re-run;
# stratify keeps the share of CHD-positive rows equal in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    heart[new_features].values,
    heart["TenYearCHD"],
    test_size=0.2,
    random_state=42,
    stratify=heart["TenYearCHD"],
)
new_model = LogisticRegression().fit(X_train, Y_train)

# Training accuracy (%) of the reduced-feature model.
new_model.score(X_train, Y_train) * 100

In [None]:
# Intercept and per-feature coefficients of the reduced-feature model.
# `k0` and `k` keep their names so any cell that uses them keeps working.
k0 = new_model.intercept_[0]
k = new_model.coef_[0]

# Build "y=k0+(k1*x1)+...+(kn*xn)" from however many coefficients the model
# has, instead of unpacking a fixed number of them by hand (which raises as
# soon as the feature list changes length).
terms = ["(%f*x%d)" % (coef, position) for position, coef in enumerate(k, start=1)]
equation = "+".join(["y=%f" % k0] + terms)
print(equation)

In [None]:
# Test-split accuracy (%) of the reduced-feature model.
new_model.score(X_test,Y_test)*100