In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np 

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from sklearn.metrics import recall_score
import matplotlib.pyplot as plt


# --- where the data lives ---
bucket_name = "rachaeld-data445"
file_key = 'framingham.csv'

# --- open the S3 bucket and fetch the object ---
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()

# the 'Body' entry of the response is handed to pandas as the CSV source
file_content_stream = file_object.get('Body')

# --- parse the CSV into a DataFrame and preview the first rows ---
heart = pd.read_csv(file_content_stream)
heart.head()

Matplotlib is building the font cache; this may take a moment.


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# keep complete cases only: a row is discarded if any of its columns is missing
heart = heart.dropna(axis = 0, how = 'any')

In [7]:
# Estimate feature importances by refitting a random forest on 100 different
# random 80/20 train/test splits and collecting the importances of every fit.
importance = list()

#defining input and target variables (X = all but target)
# Neither depends on the split, so they are built once instead of on every pass.
X = heart.drop(columns = ['TenYearCHD'])
Y = heart['TenYearCHD']

for i in range(0, 100):
    #splitting the data
    # Seeding with the iteration number still gives a different split on each
    # pass, but makes the whole run reproducible after a kernel restart.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state = i)

    #building the model
    RF_md = RandomForestClassifier(n_estimators = 500, random_state = i).fit(X_train, Y_train)

    #extracting the importances (one value per column of X, in column order)
    importance.append(RF_md.feature_importances_)


In [8]:
# One row per fitted forest, one column per predictor.
# The labels are derived from the data (every column except the target, in the
# order the forests were trained on) rather than typed by hand, so they cannot
# drift out of sync with the features. A flat list is passed on purpose: a
# nested list ([[...]]) would make pandas build a one-level MultiIndex, and
# importances['age'] would then return a DataFrame instead of a Series.
feature_names = list(heart.drop(columns = ['TenYearCHD']).columns)
importances = pd.DataFrame(importance, columns = feature_names)

# average importance of each predictor across all fits
importances.mean(axis = 0)

male               0.021223
age                0.124265
education          0.036751
currentSmoker      0.012547
cigsPerDay         0.050343
BPMeds             0.007008
prevalentStroke    0.003425
prevalentHyp       0.018486
diabetes           0.006779
totChol            0.121414
sysBP              0.134949
diaBP              0.118834
BMI                0.127789
heartRate          0.095906
glucose            0.120280
dtype: float64

In [10]:
# Compare random forests of different maximum depth on the five most important
# predictors, scoring each by recall over repeated random 80/20 splits.

# settings a reader might want to tune
DEPTHS = (3, 5, 7)     # max_depth of model 1, model 2 and model 3 respectively
CUTOFF = .1            # predicted probabilities below this are labelled 0, otherwise 1
N_SPLITS = 100         # number of random train/test splits
N_TREES = 500          # trees per forest

#using top 5- sysBP, BMI, Age, totChol, glucose
X = heart[['sysBP', 'BMI', 'age', 'totChol', 'glucose']]
Y = heart['TenYearCHD']

#lists to store results (one list of recall scores per depth)
recalls = {depth: list() for depth in DEPTHS}

for i in range(0, N_SPLITS):
    #splitting the data
    # Seeded with the iteration number: a different split on each pass, yet the
    # full experiment is reproducible. All depths are scored on the same split.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state = i)

    for depth in DEPTHS:
        #building the model
        RF_model = RandomForestClassifier(max_depth = depth, n_estimators = N_TREES, random_state = i).fit(X_train, Y_train)
        #predicting on test (probability of the positive class)
        RF_pred = RF_model.predict_proba(X_test)[:, 1]
        #changing likelihoods to labels
        RF_labels = np.where(RF_pred < CUTOFF, 0, 1)
        #computing the recall score
        recalls[depth].append(recall_score(Y_test, RF_labels))

# keep the original per-model result lists available under their original names
md1res, md2res, md3res = (recalls[depth] for depth in DEPTHS)

print('The average recall score of the first model is:', np.mean(md1res))
print('The average recall score of the second model is:', np.mean(md2res))
print('The average recall score of the third model is:', np.mean(md3res))

The average recall score of the first model is: 0.8445935832928773
The average recall score of the second model is: 0.8285562975324495
The average recall score of the third model is: 0.8149035558156125


In [None]:
## based on the above results we would want to use the first model, with a maximum depth of 3, to predict cor