In [None]:
# 1. a) In an AdaBoost model, all observations are given the same initial weight. The initial weight is equal to 1/M, where M is the number of observations. In this example, the initial weight is 0.01.
#    b) On the next weak learner, observations that were misclassified will be assigned a greater weight than observations that were classified correctly. Thus the 72nd
#       observation will have a greater weight on the second learner than it did on the first.

# 2. Ensemble learning algorithms create a group of weak learners and combine their predictions for each new observation. This is exactly what AdaBoost.M1 does. This modeling
#    approach uses observation weights to fit each weak learner in sequence, and the final prediction is a weighted vote of all the weak learners.

# 3. See attached file. 

# 4. If the AdaBoost ensemble underfits the data, try using a higher learning rate. 

# 5. a)

# 6. d)

# 7. The main difference between Gradient Boosting and AdaBoost is their loss function. AdaBoost uses an exponential loss function while Gradient Boosting can use any loss function. 

In [1]:
import boto3
import pandas as pd
import numpy as np
import statistics as st

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

## Connect to the S3 bucket that holds the data set
bucket_name = 'ryan-greiner-bucket'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Fetch the CSV object from the bucket and expose its body as a stream
file_key = 'framingham.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Load the CSV and keep only the rows that have no missing values
heart = pd.read_csv(file_content_stream).dropna()

## Predictors and target used by the models
X = heart[['age', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']]
Y = heart['TenYearCHD']

## Empty containers for the recall and accuracy of every train/test split
rf_recall, ada_recall, gb_recall = [], [], []
rf_accuracy, ada_accuracy, gb_accuracy = [], [], []

heart.head()



Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


# Models

In [2]:
## Evaluation settings, named here so the experiment is easy to tune
N_REPEATS = 5   # number of stratified train/test splits to average over
CUTOFF = .1     # predicted probabilities at or above this value are labelled 1
SEED = 42       # base seed; makes the splits and the models reproducible

## Start from empty score lists so that re-running this cell does not keep
## appending to (and averaging over) the results of a previous run
rf_recall, ada_recall, gb_recall = [], [], []
rf_accuracy, ada_accuracy, gb_accuracy = [], [], []

for i in range(N_REPEATS):
    ## A different, but repeatable, seed for every repetition
    split_seed = SEED + i
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y,
                                                        random_state = split_seed)

    ## Random Forest Model ##
    rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3,
                                   random_state = split_seed).fit(X_train, Y_train)
    rf_pred = rf_md.predict_proba(X_test)[:, 1]
    rf_label = np.where(rf_pred < CUTOFF, 0, 1)
    rf_recall.append(recall_score(Y_test, rf_label))
    rf_accuracy.append(accuracy_score(Y_test, rf_label))

    ## Ada Boost Model ##
    ## The weak learner is passed positionally: the keyword was renamed from
    ## `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    ## removed in 1.4, while the first positional slot works on both.
    ## NOTE(review): AdaBoost probabilities are often not well calibrated, so the
    ## shared cutoff may label nearly every observation as 1 -- confirm before
    ## comparing this model with the other two.
    ada_md = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3), n_estimators = 500,
                                learning_rate = .01, random_state = split_seed).fit(X_train, Y_train)
    ada_pred = ada_md.predict_proba(X_test)[:, 1]
    ada_label = np.where(ada_pred < CUTOFF, 0, 1)
    ada_recall.append(recall_score(Y_test, ada_label))
    ada_accuracy.append(accuracy_score(Y_test, ada_label))

    ## Gradient Boost Model ##
    gb_md = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = .01,
                                       random_state = split_seed).fit(X_train, Y_train)
    gb_pred = gb_md.predict_proba(X_test)[:, 1]
    gb_label = np.where(gb_pred < CUTOFF, 0, 1)
    gb_recall.append(recall_score(Y_test, gb_label))
    gb_accuracy.append(accuracy_score(Y_test, gb_label))

## Average recall and accuracy of each model over all repetitions
pd.DataFrame({'Model': ['Random Forest', 'Ada Boost', 'Gradient Boost'],
              'Average Recall': [np.mean(rf_recall),
                                 np.mean(ada_recall),
                                 np.mean(gb_recall)],
              'Average Accuracy': [np.mean(rf_accuracy),
                                   np.mean(ada_accuracy),
                                   np.mean(gb_accuracy)]})

Unnamed: 0,Model,Average Recall,Average Accuracy
0,Random Forest,0.85,0.453825
1,Ada Boost,0.978571,0.154645
2,Gradient Boost,0.803571,0.522678


In [None]:
# My classifiers did not meet the requirement. To address this, I would first adjust the learning rate.