# Overview

This is a quick first pass at training a Machine Learning model. I simply picked a few features that stood out during EDA, scaled the numeric values, trained a Logistic Regression model, and then exported the model and the scaler.

The Logistic Regression model performed decently (about 76% accuracy on the test set), but I think with more iterations I can make it even better (feature engineering, exploring other models, cross-validation, etc.).

In [1]:
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing

import warnings
# Globally suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides library deprecation and data-conversion
# warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Load the Data

In [2]:
# Read the train and test CSV files into separate DataFrames.
train_csv = "../data/heart_train.csv"
test_csv = "../data/heart_test.csv"

data_train, data_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Preprocessing

In [3]:
# Model inputs and the target, selected by column name.
feature_columns = ['Age', 'MaxHR', 'Sex', 'ExerciseAngina']
target_columns = ['HeartDisease']

# Take copies so later column assignments do not write into the loaded frames.
X_train, y_train = data_train[feature_columns].copy(), data_train[target_columns].copy()
X_test, y_test = data_test[feature_columns].copy(), data_test[target_columns].copy()

In [4]:

# Columns handled by this cell: the numeric ones are min-max scaled and the
# two categorical ones are encoded as 0/1 indicators.
NUMERIC_COLUMNS = ['Age', 'MaxHR']
MODEL_COLUMNS = NUMERIC_COLUMNS + ['Sex', 'ExerciseAngina']


def preprocess_features(raw, scaler, fit=False):
    """Return a scaled and encoded copy of the model features in ``raw``.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least the 'Age', 'MaxHR', 'Sex' and
        'ExerciseAngina' columns. It is not modified.
    scaler : object
        Scaler exposing ``fit_transform`` and ``transform``
        (here a ``MinMaxScaler``).
    fit : bool, default False
        When True the scaler is fitted on ``raw`` before transforming;
        when False the already-fitted scaler is only applied.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Age' and 'MaxHR' rescaled by ``scaler``, 'Sex'
        encoded as 1 for 'M' (0 otherwise) and 'ExerciseAngina' encoded as
        1 for 'Y' (0 otherwise).
    """
    features = raw[MODEL_COLUMNS].copy()
    rescale = scaler.fit_transform if fit else scaler.transform
    features[NUMERIC_COLUMNS] = rescale(features[NUMERIC_COLUMNS])
    # astype(int) converts the boolean comparison to 1/0 directly instead of
    # relying on the implicit downcasting of Series.replace.
    features['Sex'] = (features['Sex'] == 'M').astype(int)
    features['ExerciseAngina'] = (features['ExerciseAngina'] == 'Y').astype(int)
    return features


min_max_scaler = preprocessing.MinMaxScaler()

# Build the model inputs from the raw frames so this cell can be re-run
# safely: re-encoding an already-encoded frame would turn every 'Sex' and
# 'ExerciseAngina' value into 0.
# Preprocess our training data (this is where the scaler is fitted)
X_train = preprocess_features(data_train, min_max_scaler, fit=True)

# Preprocess our testing data -- just transform, don't fit
X_test = preprocess_features(data_test, min_max_scaler)

X_train.head()

Unnamed: 0,Age,MaxHR,Sex,ExerciseAngina
0,0.583333,0.424,1,0
1,0.604167,0.744,1,0
2,0.666667,0.248,1,1
3,0.479167,0.216,1,1
4,0.416667,0.84,1,0


In [5]:
# Summary statistics of the preprocessed training features (sanity check that
# the scaled and encoded columns fall in the expected range).
X_train.describe()

Unnamed: 0,Age,MaxHR,Sex,ExerciseAngina
count,615.0,615.0,615.0,615.0
mean,0.512466,0.557737,0.769106,0.413008
std,0.199702,0.197985,0.421748,0.492775
min,0.0,0.0,0.0,0.0
25%,0.375,0.424,1.0,0.0
50%,0.520833,0.568,1.0,0.0
75%,0.65625,0.704,1.0,1.0
max,1.0,1.0,1.0,1.0


# Training / Machine Learning

In [6]:
from sklearn.linear_model import LogisticRegression

# Fixed random_state keeps the liblinear solver's result reproducible.
lr_classifier = LogisticRegression(solver='liblinear', random_state=42)

# y_train / y_test are single-column DataFrames; flatten them to 1-D so the
# estimator receives labels of shape (n_samples,) rather than a column vector.
lr_classifier.fit(X_train, y_train.to_numpy().ravel())

# Mean accuracy on the held-out test set.
lr_classifier.score(X_test, y_test.to_numpy().ravel())

0.7557755775577558

# Export Final Model and Scaler

In [7]:
import pickle

# Persist the fitted classifier and the fitted scaler side by side; the same
# scaler must be applied to new inputs before they are passed to the model.
# The context managers close each file handle as soon as the dump finishes,
# so the bytes are flushed to disk and no descriptor is left open.
with open("../models/production.sav", 'wb') as model_file:
    pickle.dump(lr_classifier, model_file)

with open("../models/min_max_scaler.sav", 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)