In [None]:
# 1. c) LOOCV
# 2. c) high bias low variance
# 3. b) low bias high variance
# 4. Regularization reduces variance by shrinking some of the parameters to almost 0


In [1]:
import boto3
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'framingham.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
heart = heart.dropna()

X1 = heart[['age', 'currentSmoker', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
X2 = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate', 'glucose']]
Y = heart['TenYearCHD']
F1 = list()
F2 = list()
kfold = KFold(n_splits = 5, shuffle = True)

for train_idx, test_idx in kfold.split(X1):
    X1_train, X1_val = X1.iloc[train_idx], X1.iloc[test_idx]
    X2_train, X2_val = X2.iloc[train_idx], X2.iloc[test_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[test_idx]
    
    scaler = MinMaxScaler()
    X1_train = scaler.fit_transform(X1_train)
    X1_val = scaler.fit_transform(X1_val)
    X2_train = scaler.fit_transform(X2_train)
    X2_val = scaler.fit_transform(X2_val)
    
    logit_md1 = LogisticRegression().fit(X1_train, Y_train)
    logit_md2 = LogisticRegression().fit(X2_train, Y_train)
    
    logit_pred1 = logit_md1.predict_proba(X1_val)[:, 1]
    logit_pred2 = logit_md2.predict_proba(X2_val)[:, 1]
    
    logit_label1 = np.where(logit_pred1 < .25, 0, 1)
    logit_label2 = np.where(logit_pred2 < .25, 0, 1)
     
    F1.append(f1_score(Y_val, logit_label1, average='weighted'))
    F2.append(f1_score(Y_val, logit_label2, average='weighted'))

F_score1 = np.mean(F1)
F_score2 = np.mean(F2)

print('The F1-score of model 1 is', F_score1)
print('The F1-score of model 2 is', F_score2)

The F1-score of model 1 is 0.7626333183261893
The F1-score of model 2 is 0.7676995082072852


In [None]:
# The first model gave a higher F-score so it is the better model