In [11]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.impute import KNNImputer

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'framingham.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [12]:
## Define input and target ##
X = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
Y = heart['TenYearCHD']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)
X_train.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,3390.0,3390.0,3350.0,3374.0,3389.0
mean,49.613569,0.488201,236.771343,25.811417,75.91856
std,8.605408,0.499934,44.077935,4.109377,12.040951
min,32.0,0.0,107.0,15.54,44.0
25%,42.0,0.0,206.0,23.07,68.0
50%,49.0,0.0,234.0,25.395,75.0
75%,56.0,1.0,263.75,28.015,83.0
max,70.0,1.0,464.0,56.8,143.0


In [13]:
X_test.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,848.0,848.0,838.0,845.0,848.0
mean,49.470519,0.517689,236.522673,25.764438,75.720519
std,8.441986,0.499982,46.609144,3.963258,11.974824
min,33.0,0.0,133.0,16.59,45.0
25%,42.0,0.0,205.0,23.06,68.0
50%,49.0,1.0,233.5,25.41,75.0
75%,56.0,1.0,262.0,28.13,82.0
max,69.0,1.0,696.0,43.3,122.0


In [14]:
## Define Imputer ##
imputer = KNNImputer(n_neighbors = 5, weights = 'distance').fit(X_train)

## Impute Values ##
X_train_imp = pd.DataFrame(imputer.fit_transform(X_train), columns = X_train.columns)
X_test_imp = pd.DataFrame(imputer.fit_transform(X_test), columns = X_test.columns)

In [15]:
X_train_imp.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,3390.0,3390.0,3390.0,3390.0,3390.0
mean,49.613569,0.488201,236.786441,25.81116,75.918174
std,8.605408,0.499934,43.891504,4.105606,12.039196
min,32.0,0.0,107.0,15.54,44.0
25%,42.0,0.0,206.0,23.08,68.0
50%,49.0,0.0,234.0,25.38,75.0
75%,56.0,1.0,263.0,27.9975,83.0
max,70.0,1.0,464.0,56.8,143.0


In [16]:
X_test_imp.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,848.0,848.0,848.0,848.0,848.0
mean,49.470519,0.517689,236.454233,25.765871,75.720519
std,8.441986,0.499982,46.418678,3.957528,11.974824
min,33.0,0.0,133.0,16.59,45.0
25%,42.0,0.0,205.0,23.0675,68.0
50%,49.0,1.0,233.5,25.415,75.0
75%,56.0,1.0,262.0,28.1325,82.0
max,69.0,1.0,696.0,43.3,122.0


# Models

### Random Forest

In [17]:
## Model ##
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 5).fit(X_train_imp, Y_train)

## Prediction ##
rf_pred = rf_md.predict_proba(X_test_imp)[:, 1]

## Label ##
rf_label = np.where(rf_pred < .1, 0, 1)

print(classification_report(Y_test, rf_label))

              precision    recall  f1-score   support

           0       0.92      0.39      0.55       719
           1       0.19      0.82      0.31       129

    accuracy                           0.45       848
   macro avg       0.56      0.60      0.43       848
weighted avg       0.81      0.45      0.51       848



### Ada Boost

In [19]:
## Model ##
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 5), n_estimators = 500, learning_rate = .01).fit(X_train_imp, Y_train)

## Prediction ##
ada_pred = ada_md.predict_proba(X_test_imp)[:, 1]

## Label ##
ada_label = np.where(ada_pred < .1, 0, 1)

print(classification_report(Y_test, ada_label))

              precision    recall  f1-score   support

           0       0.84      0.08      0.15       719
           1       0.15      0.91      0.26       129

    accuracy                           0.21       848
   macro avg       0.49      0.50      0.21       848
weighted avg       0.73      0.21      0.17       848



<dl>
    <dt>Summary</dt>
    <dd>Based on my results, the random forest classifier is best to predict Ten Year CHD</dd>
</dl>