In [1]:
import pandas as pd
import numpy as np
import boto3
from bs4 import BeautifulSoup
import requests

In [2]:
# load data from S3 bucket
def load_data_from_s3(bucket_name, file_key):
    s3 = boto3.client('s3',
                  aws_access_key_id='...',
                  aws_secret_access_key='...',
                  aws_session_token='...')
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    data = pd.read_csv(obj['Body'])
    return data

In [3]:
bucket_name = 'de300spring2024'
file_key = 'rachel_yao/heart_disease(in).csv'
data = load_data_from_s3(bucket_name, file_key)

In [4]:
# clean and impute data
def clean_and_impute_data(data):
    # 1
    retain = ['age', 'sex', 'painloc', 'painexer', 'cp', 'trestbps', 'smoke', 
                         'fbs', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thalach', 
                         'exang', 'oldpeak', 'slope', 'target']
    data = data[retain].copy()

    # 2
    # a. painloc and painexer: replace missing vals with most frequent val
    data['painloc'] = data['painloc'].fillna(data['painloc'].mode()[0])
    data['painexer'] = data['painexer'].fillna(data['painexer'].mode()[0])

    # b. trestbps: vals < 100 mm Hg with median
    data.loc[data['trestbps'] < 100, 'trestbps'] = data['trestbps'].median()
    data['trestbps'] = data['trestbps'].fillna(data['trestbps'].median())

    # c. oldpeak: vals < 0 and > 4 with median
    data.loc[(data['oldpeak'] < 0) | (data['oldpeak'] > 4), 'oldpeak'] = data['oldpeak'].median()
    data['oldpeak'] = data['oldpeak'].fillna(data['oldpeak'].median())

    # d. thaldur and thalach: replace missing vals with median
    data['thaldur'] = data['thaldur'].fillna(data['thaldur'].median())
    data['thalach'] = data['thalach'].fillna(data['thalach'].median())

    # e. fbs, prop, nitr, pro, diuretic: replace missing vals and vals > 1 with 0
    clean = ['fbs', 'prop', 'nitr', 'pro', 'diuretic']
    data[clean] = data[clean].fillna(0)
    data[clean] = data[clean].map(lambda x: 0 if x > 1 else x)

    # f. exang and slope: replace missing vals with mode
    data['exang'] = data['exang'].fillna(data['exang'].mode()[0])
    data['slope'] = data['slope'].fillna(data['slope'].mode()[0])

    data['sex'] = data['sex'].fillna(data['sex'].mode()[0])
    data['cp'] = data['cp'].fillna(data['cp'].mode()[0])  

    return data

In [5]:
data = clean_and_impute_data(data)

In [6]:
# scrape smoking rates by age from source 1
def scrape_smoking_rates_by_age(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table')
    smoking_data = {}
    for table in tables[1]:
        rows = table.find_all('tr')
        for row in rows[1:]:
            ths = row.find_all('th')
            tds = row.find_all('td')
            age_range = ths[0].text.strip()
            smoking_rate = float(tds[9].text.strip())
            if 'and over' in age_range:
                min_age = int(age_range.split()[0])
                max_age = 120  # assuming 120 as an upper limit for age
            else:
                min_age, max_age = map(int, age_range.split('–'))
            for age in range(min_age, max_age + 1):
                smoking_data[age] = smoking_rate
    return smoking_data

In [7]:
# scrape smoking rates by age and sex from source 2
def scrape_smoking_rates_by_age_and_sex(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # sex
    cards = soup.find_all("div", class_="card-body")
    gender_data = {}
    for card in cards[2:3]:
        rows = card.find_all('li', class_='main')
        for row in rows:
            text = row.text.strip()
            gender = text.split()[6]
            rate = float(text.split()[7].strip('()%'))
            gender_data[gender] = rate
    # age
    age_data = {}
    for card in cards[3:4]:
        rows = card.find_all('li')
        for row in rows:
            text = row.text.strip()
            age_range = text.split()[7]
            if 'and older' in text:
                min_age = int(age_range.split()[0])
                max_age = 120  # assuming 120 as an upper limit for age
            else:
                min_age, max_age = map(int, age_range.split('–'))
                rate = float(text.split()[9].strip('()%'))
            for age in range(min_age, max_age + 1):
                age_data[age] = rate
    return gender_data, age_data

In [8]:
# source 1
source1 = 'https://www.abs.gov.au/statistics/health/health-conditions-and-risks/smoking/latest-release'
smoking_source1 = scrape_smoking_rates_by_age(source1)

# source 2
source2 = 'https://www.cdc.gov/tobacco/data_statistics/fact_sheets/adult_data/cig_smoking/index.htm'
gender_data, age_data = scrape_smoking_rates_by_age_and_sex(source2)

In [9]:
# impute missing values in smoke column
def impute_smoke(data, smoking_source1, gender_data, age_data):
    # create separate columns for each source
    data['smoke_source1'] = np.nan
    data['smoke_source2'] = np.nan
    
    # impute missing values for source 1
    for i, row in data.iterrows():
        try:
            age = int(row['age']) 
            if age in smoking_source1: 
                data.at[i, 'smoke_source1'] = smoking_source1[age]
        except (ValueError, TypeError):
            continue
    data.dropna(subset=['smoke_source1'], inplace=True)
    
    # impute missing values for source 2
    for i, row in data.iterrows():
        age = int(row['age'])
        sex = int(row['sex'])
        if sex == 0:  # female
            data.at[i, 'smoke_source2'] = age_data[age]
        else:  # male
            data.at[i, 'smoke_source2'] = age_data[age] * (gender_data['men'] / gender_data['women'])
    data.dropna(subset=['smoke_source2'], inplace=True)

    data.drop(columns=['smoke'], inplace=True)
    return data

In [10]:
data = impute_smoke(data, smoking_source1, gender_data, age_data)
data.dropna(inplace=True)

In [19]:
# 3
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X = data.drop(columns=['target'])
y = data['target']

# splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [24]:
# 4
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

# define scoring metrics
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)}

# define classifiers and hyperparameters to tune
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=2000),
    'SVM': SVC(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

In [25]:
# 5
# evaluate models using 5-fold cross-validation and report performance metrics
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"Model: {name}")
    print(f"Mean Accuracy: {scores.mean():.4f}")
    print(f"Standard Deviation: {scores.std():.4f}")
    print()

print("Gradient boosting seems to perform the best out of the models tested, with a high mean accuracy of 0.8048 and low standard deviation of 0.033.")

Model: Random Forest
Mean Accuracy: 0.8022
Standard Deviation: 0.0149

Model: Logistic Regression
Mean Accuracy: 0.7961
Standard Deviation: 0.0238

Model: SVM
Mean Accuracy: 0.7009
Standard Deviation: 0.0305

Model: Gradient Boosting
Mean Accuracy: 0.8048
Standard Deviation: 0.0330

Gradient boosting seems to perform the best out of the models tested, with a high mean accuracy of 0.8 and low standard deviation of 0.033.
