# Modeling

In [None]:
!pip install -U pandas pandas-profiling scikit-learn sagemaker

## Load the train data

In [None]:
import pandas as pd

# Read only the first 100 rows of the training data: a small sample is enough
# to smoke-test the pipeline locally before training on the full dataset.
train_df = pd.read_csv(
    "...", # TODO: Paste the S3 path to your train.csv
    nrows=100
)

# The test data is read in full (no nrows limit).
test_df = pd.read_csv("...") # TODO: Paste the S3 path to your test.csv


# Show the (rows, columns) shape and preview the first rows of the training sample
print(train_df.shape)
train_df.head()

## Exploratory Data Analysis

In [None]:
from pandas_profiling import ProfileReport

In [None]:
# Build a profiling report for the training sample and save it as an HTML file
profile = ProfileReport(train_df)
profile.to_file('profile_report.html')

## Split Features and Response

In [None]:
# Feature columns, grouped by the preprocessing they receive later:
# categorical columns are one-hot encoded, continuous columns are scaled.
cat_cols = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "thal",
]
cont_cols = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]

In [None]:
# Separate the response column ("target") from the feature columns
# for both the training and the test data.
y_train = train_df["target"]
X_train = train_df.drop(columns="target")

y_test = test_df["target"]
X_test = test_df.drop(columns="target")

## Data Transformations

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# One hot encode the categorical columns.
# handle_unknown="ignore" keeps transform() from raising when the data being
# scored contains a category level that never appeared in the data used for
# fitting. That is likely here: only 100 training rows are loaded, while the
# full test set is scored. Unknown levels are encoded as all zeros (with a
# warning). Combining handle_unknown="ignore" with drop= requires
# scikit-learn >= 1.0.
ohe = OneHotEncoder(drop="first", handle_unknown="ignore")

# Scale the continuous columns
sc = StandardScaler()

# Column transformer to apply transformations on both categorical and continuous columns
ct = ColumnTransformer([
    ("One Hot Encoding", ohe, cat_cols),
    ("Scaling", sc, cont_cols)
])

## ML Model
- Random Forest documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Random Forest Model, created with the library's default hyperparameters
# (no arguments are passed to the constructor).
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

## Pipeline to combine feature engineering and ML model

In [None]:
# Sklearn pipeline: the column transformations run first, and their output
# is fed to the random forest classifier.
from sklearn.pipeline import Pipeline

pipeline_rfc_model = Pipeline([
    ("Data Transformations", ct),
    ("Random Forest Model", rfc)
])

## Fit the Pipeline Model locally
- We run it locally to ensure there are no bugs in the code!
- For this "test" purpose we can just run it on a smaller subset of the data

In [None]:
# To view the Pipeline model as a diagram when it is displayed in the notebook
from sklearn import set_config
set_config(display="diagram")

In [None]:
# Fit the model locally on a smaller subset of data
# (train_df was read with nrows=100, so X_train holds at most 100 rows).
pipeline_rfc_model.fit(X_train, y_train)

In [None]:
# Check the accuracy on training data
# (this is the same data the pipeline was fitted on).
train_accuracy = pipeline_rfc_model.score(X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Check the accuracy on test data, which was not used for fitting
test_accuracy = pipeline_rfc_model.score(X_test, y_test)
print(f"Testing Accuracy: {test_accuracy:.4f}")

## Fit the Pipeline Model on Sagemaker!
- Now that the pipeline runs locally without errors, we can train it on the full dataset.
- SageMaker training jobs let us scale training to large datasets.
- First, we need to put all the code into a `.py` script.
- Sagemaker API documentation: https://sagemaker.readthedocs.io/en/stable/api/index.html

In [None]:
%%writefile train.py

import argparse
import os
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

model_file_name = "pipeline_model.joblib"

# Main function
def main():
    """Train the preprocessing + random forest pipeline and save it to disk.

    Parses hyperparameters from the command line, reads the train and test
    CSVs, fits the pipeline on the training data, prints the training and
    testing accuracy, and writes the fitted pipeline to
    ``<model_dir>/<model_file_name>`` with joblib.

    Command-line arguments (all optional; unrecognised arguments are ignored):
        --model_dir: Output directory. Defaults to the SM_MODEL_DIR
            environment variable, or the current directory if it is unset.
        --n_estimators (int): Defaults to 100.
        --min_samples_split (float): Defaults to 0.05.
        --criterion (str): Defaults to "gini".
    """
    # Arguments
    parser = argparse.ArgumentParser()

    # Inbuilt Arguments: https://github.com/aws/sagemaker-containers#id11
    # Fall back to the current directory when SM_MODEL_DIR is not set (e.g.
    # when the script is run outside a SageMaker container); otherwise the
    # default would be None and saving the model would fail only after
    # training has already finished.
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR", "."))

    # Custom Arguments
    parser.add_argument("--n_estimators", type=int, default=100)
    # Parsed as a float, which scikit-learn interprets as a fraction of the
    # training samples rather than an absolute count.
    parser.add_argument("--min_samples_split", type=float, default=0.05)
    parser.add_argument("--criterion", type=str, default="gini")

    # parse_known_args tolerates any extra arguments passed to the script
    args, _ = parser.parse_known_args()

    # Load data
    train_df = pd.read_csv("...") # TODO: Paste the S3 path to your train.csv
    test_df = pd.read_csv("...") # TODO: Paste the S3 path to your test.csv

    # Define the columns
    cat_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
    cont_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]

    # Split X(features) and y(response)
    X_train = train_df.drop("target", axis=1)
    y_train = train_df["target"]

    X_test = test_df.drop("target", axis=1)
    y_test = test_df["target"]

    # One hot encode the categorical columns.
    # handle_unknown="ignore" keeps scoring from raising if the test data
    # contains a category level that is absent from the training data; such
    # levels are encoded as all zeros (with a warning). Combining it with
    # drop= requires scikit-learn >= 1.0.
    ohe = OneHotEncoder(drop="first", handle_unknown="ignore")

    # Scale the continuous columns
    sc = StandardScaler()

    # Column transformer to apply transformations on both categorical and continuous columns
    ct = ColumnTransformer([
        ("One Hot Encoding", ohe, cat_cols),
        ("Scaling", sc, cont_cols)
    ])

    # Random Forest Model, configured from the parsed hyperparameters
    rfc = RandomForestClassifier(n_estimators=args.n_estimators,
                                 min_samples_split=args.min_samples_split,
                                 criterion=args.criterion)

    # Sklearn pipeline
    pipeline_rfc_model = Pipeline([
        ("Data Transformations", ct),
        ("Random Forest Model", rfc)
    ])

    # Fit the pipeline on the training data
    pipeline_rfc_model.fit(X_train, y_train)

    # Check the accuracy on training data.
    # NOTE: keep the "Training Accuracy: " / "Testing Accuracy: " prefixes
    # unchanged; the tuner's metric regexes match these log lines.
    train_accuracy = pipeline_rfc_model.score(X_train, y_train)
    print(f"Training Accuracy: {train_accuracy:.4f}")

    # Check the accuracy on test data
    test_accuracy = pipeline_rfc_model.score(X_test, y_test)
    print(f"Testing Accuracy: {test_accuracy:.4f}")

    # Save the model (create the output directory first if it does not exist)
    os.makedirs(args.model_dir, exist_ok=True)
    model_save_path = os.path.join(args.model_dir, model_file_name)
    joblib.dump(pipeline_rfc_model, model_save_path)
    print(f"Model saved at {model_save_path}")

# Run the main function when the script runs
if __name__ == "__main__":
    main()

In [None]:
%%writefile requirements.txt
pandas
scikit-learn
fsspec
s3fs

In [None]:
# Train!
# Choose instance_type: https://aws.amazon.com/sagemaker/pricing/
# Choose framework_version: https://docs.aws.amazon.com/sagemaker/latest/dg/sklearn.html
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

# Configure a SageMaker scikit-learn training job that runs train.py.
sklearn_estimator = SKLearn(
    base_job_name="rfc-pipeline-run",
    framework_version="1.0-1",
    entry_point="train.py",
    # Ship requirements.txt alongside the entry point
    dependencies=["requirements.txt"],
    # These keys match the custom arguments parsed by train.py
    # (--n_estimators, --min_samples_split, --criterion)
    hyperparameters={
        "n_estimators": 50,
        "min_samples_split": 0.05,
        "criterion": "gini"
    },
    instance_count=1,
    instance_type="ml.m5.large",
    # Spot training; per the SageMaker SDK docs max_run and max_wait are
    # timeouts in seconds, and max_wait must not be smaller than max_run
    use_spot_instances=True,
    max_wait=600,
    max_run=600,
    role=get_execution_role(),
)

# Launch Training job
# No input channels are passed: train.py reads the CSV files itself.
sklearn_estimator.fit()

## Check the training job name

In [None]:
import boto3
sm_client = boto3.client("sagemaker")

# Name of the most recent training job launched by the estimator
training_job_name = sklearn_estimator.latest_training_job.name

# Obtain the location of the model stored on S3 - Optional
# You can directly copy the location of the artifact from S3 also!
# The location is read from the job description returned by the SageMaker API.
model_artifact = sm_client.describe_training_job(
    TrainingJobName=training_job_name
)["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Training job name: {training_job_name}")
print(f"Model storage location: {model_artifact}")

## Hyperparameter Tuning
- There are three types of parameters we can tune: https://sagemaker.readthedocs.io/en/stable/api/training/tuner.html
    - IntegerParameter
    - ContinuousParameter
    - CategoricalParameter

In [None]:
# Sanity-check the metric regex against a sample log line: the single
# capture group should pull out just the numeric accuracy value.
import re

metric_string = "Testing Accuracy: 0.8667"
metric_pattern = re.compile(r"Testing Accuracy: ([0-9.]+).*$")
metric_pattern.findall(metric_string)

In [None]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter, CategoricalParameter

# Define exploration boundaries
# The keys match the custom arguments parsed by train.py.
hyperparameter_ranges = {
    "n_estimators": IntegerParameter(1, 20),
    "min_samples_split": ContinuousParameter(0.01, 0.5),
    "criterion": CategoricalParameter(["gini", "entropy"])
}

# Create optimizer
# The tuner maximizes "test-accuracy". Both metrics are extracted from the
# training logs with the regexes below, which match the
# "Training Accuracy: ..." / "Testing Accuracy: ..." lines printed by train.py.
optimizer = HyperparameterTuner(
    base_tuning_job_name="rfc-pipeline-tuner",
    estimator=sklearn_estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type="Maximize",
    objective_metric_name="test-accuracy",
    metric_definitions=[
        {"Name": "train-accuracy", "Regex": "Training Accuracy: ([0-9.]+).*$"},
        {"Name": "test-accuracy", "Regex": "Testing Accuracy: ([0-9.]+).*$"}
    ],
    # At most 10 training jobs in total, at most 2 running at the same time
    max_jobs=10,
    max_parallel_jobs=2,
)

# Launch Optimizer job
optimizer.fit()

In [None]:
# Analyse tuning results
# Load the tuning job's results into a DataFrame.
results = optimizer.analytics().dataframe()

# The objective (test-accuracy) is maximized, so sort in descending order
# to put the best-scoring jobs at the top.
results.sort_values("FinalObjectiveValue", ascending=False).head()