<a href="https://colab.research.google.com/github/sanhiitaa/heart-health-prediction-classifier/blob/main/heart_health_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# data manipulation libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# data processing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# model building libraries
from sklearn.linear_model import LogisticRegression

# metrics libraries
from sklearn.metrics import classification_report

# setting randomseed
np.random.seed(1111)

# Pipeline and transformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# pickle for exporting the pipeline
import pickle

In [2]:
# loading the dataframe
df=pd.read_csv('/content/drive/MyDrive/datasets/heart_disease_health_indicators_BRFSS2015.csv')
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4


In [9]:
# dropping duplicates
df=df.drop_duplicates()

In [10]:
# splitting data into dependent and independent data
x=df.iloc[:,1:]
y=df.iloc[:,0:1]

In [11]:
# splitting data in train and test
x_train, x_test, y_train, y_test=train_test_split(x,y, test_size=0.4)

In [39]:
# test_input for pipeline testing in the next notebook
num=np.random.randint(1000) # random number for index value
a=x_train.iloc[num,1:]
{x : [value] for x ,value in a.items()}

{'HighChol': [0],
 'CholCheck': [1],
 'BMI': [26],
 'Smoker': [0],
 'Stroke': [0],
 'Diabetes': [0],
 'PhysActivity': [1],
 'Fruits': [0],
 'Veggies': [0],
 'HvyAlcoholConsump': [0],
 'AnyHealthcare': [1],
 'NoDocbcCost': [0],
 'GenHlth': [1],
 'MentHlth': [0],
 'PhysHlth': [0],
 'DiffWalk': [0],
 'Sex': [1],
 'Age': [6],
 'Education': [5],
 'Income': [8]}

In [43]:
y_train.iloc[num]

HeartDiseaseorAttack    0
Name: 242830, dtype: int64

In [15]:
# column transformer for scaling

coltoscale=['BMI','GenHlth','PhysHlth','Age', 'Income', 'Education']

scaling=ColumnTransformer([
    ('scale', StandardScaler(), coltoscale)
])

In [16]:
# creating an instance of logistic regression
logistic_regression= LogisticRegression()

In [17]:
# defining a pipleine
pipeline = Pipeline([
    ('processing', scaling),
    ('classification', logistic_regression)
])

In [18]:
# fitting the training data on the pipeline
pipeline.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [19]:
# making predictions
y_pred=pipeline.predict(x_test)

In [31]:
# classification report
classification_report(y_test, y_pred, output_dict=True)['0']

{'precision': 0.9003868046856113,
 'recall': 0.9940674059156092,
 'f1-score': 0.9449108563784393,
 'support': 82426}

In [32]:
# accuracy
classification_report(y_test, y_pred, output_dict=True)['accuracy']

0.8960538770358927

# exporting the pipeline


In [14]:
pickle.dump(pipeline, open('heart-health-prediction-model.pkl', 'wb'))