# Modeling

In [1]:
!pip install -U pandas pandas-profiling scikit-learn sagemaker

You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

## Load the train data

In [2]:
import pandas as pd

train_df = pd.read_csv(
    "s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/train.csv", 
    nrows=100
)

test_df = pd.read_csv("s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/test.csv")

print(train_df.shape)
train_df.head()

(100, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52.0,1.0,1.0,118.0,186.0,0.0,2.0,190.0,0.0,0.0,2.0,0.0,6.0,0
1,39.0,0.0,3.0,94.0,199.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0
2,60.0,1.0,4.0,130.0,206.0,0.0,2.0,132.0,1.0,2.4,2.0,2.0,7.0,1
3,39.0,1.0,3.0,140.0,321.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0
4,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0


## Exploratory Data Analysis

In [3]:
from pandas_profiling import ProfileReport

In [4]:
profile = ProfileReport(train_df)
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

## Split Features and Response

In [5]:
# Define the columns
cat_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
cont_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]

In [6]:
# Split X(features) and y(response)
X_train = train_df.drop("target", axis=1)
y_train = train_df["target"]

X_test = test_df.drop("target", axis=1)
y_test = test_df["target"]

## Data Transformations

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [8]:
# One hot encode the categorical columns
ohe = OneHotEncoder(drop="first")

# Scale the continuous columns
sc = StandardScaler()

# Column transformer to apply transformations on both categorical and continuous columns
ct = ColumnTransformer([
    ("One Hot Encoding", ohe, cat_cols),
    ("Scaling", sc, cont_cols)
])

## ML Model

In [9]:
# KNN Model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

## Pipeline to combine feature engineering and ML model

In [10]:
# Sklearn pipeline
from sklearn.pipeline import Pipeline

pipeline_knn_model = Pipeline([
    ("Data Transformations", ct),
    ("KNN Model", knn)
])

## Fit the Pipeline Model locally
- We run it locally to ensure there are no bugs in the code!
- For this "test" purpose we can just run it on a smaller subset of the data

In [11]:
# To view tha Pipeline model as a diagram
from sklearn import set_config
set_config(display="diagram")

In [12]:
# Fit the model locally on a smaller subset of data
pipeline_knn_model.fit(X_train, y_train)

In [13]:
# Check the accuracy on training data
train_accuracy = pipeline_knn_model.score(X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Check the accuracy on test data
test_accuracy = pipeline_knn_model.score(X_test, y_test)
print(f"Testing Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.8300
Testing Accuracy: 0.8333


## Fit the Pipeline Model on Sagemaker!
- Since the model is free from bugs, we can train it on the full dataset.
- Sagemaker training allows us to scale training to large datasets.
- First we need to put all the code into a .py script
- Sagemaker API documentation: https://sagemaker.readthedocs.io/en/stable/api/index.html

In [14]:
%%writefile train.py

import argparse
import os
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

model_file_name = "pipeline_model.joblib"

# Main function
def main():
    # Arguments
    parser = argparse.ArgumentParser()
    
    # Inbuilt Arguments: https://github.com/aws/sagemaker-containers#id11
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    
    # Custom Arguments
    parser.add_argument("--n_neighbors", type=int, default=5)
    
    args, _ = parser.parse_known_args()
    
    # Load data
    train_df = pd.read_csv("s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/train.csv")
    test_df = pd.read_csv("s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/test.csv")

    # Define the columns
    cat_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
    cont_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]

    # Split X(features) and y(response)
    X_train = train_df.drop("target", axis=1)
    y_train = train_df["target"]

    X_test = test_df.drop("target", axis=1)
    y_test = test_df["target"]

    # One hot encode the categorical columns
    ohe = OneHotEncoder(drop="first")

    # Scale the continuous columns
    sc = StandardScaler()

    # Column transformer to apply transformations on both categorical and continuous columns
    ct = ColumnTransformer([
        ("One Hot Encoding", ohe, cat_cols),
        ("Scaling", sc, cont_cols)
    ])
    
    # KNN Model
    knn = KNeighborsClassifier(n_neighbors=args.n_neighbors)

    # Sklearn pipeline
    pipeline_knn_model = Pipeline([
        ("Data Transformations", ct),
        ("KNN Model", knn)
    ])

    # Fit the model locally on a smaller subset of data
    pipeline_knn_model.fit(X_train, y_train)

    # Check the accuracy on training data
    train_accuracy = pipeline_knn_model.score(X_train, y_train)
    print(f"Training Accuracy: {train_accuracy:.4f}")

    # Check the accuracy on test data
    test_accuracy = pipeline_knn_model.score(X_test, y_test)
    print(f"Testing Accuracy: {test_accuracy:.4f}")

    # Save the model
    model_save_path = os.path.join(args.model_dir, model_file_name)
    joblib.dump(pipeline_knn_model, model_save_path)
    print(f"Model saved at {model_save_path}")

# Run the main function when the script runs
if __name__ == "__main__":
    main()

Overwriting train.py


In [15]:
%%writefile requirements.txt
pandas
scikit-learn
fsspec
s3fs

Overwriting requirements.txt


In [17]:
# Train!
# Choose instance_type: https://aws.amazon.com/sagemaker/pricing/
# Choose framework_version: https://docs.aws.amazon.com/sagemaker/latest/dg/sklearn.html
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

sklearn_estimator = SKLearn(
    base_job_name="knn-pipeline-run",
    framework_version="1.0-1",
    entry_point="train.py",
    dependencies=["requirements.txt"],
    hyperparameters={
        "n_neighbors": 5
    },
    metric_definitions=[        
        {"Name": "train-accuracy", "Regex": "Training Accuracy: ([0-9.]+).*$"},
        {"Name": "test-accuracy", "Regex": "Testing Accuracy: ([0-9.]+).*$"}
    ],
    instance_count=1,
    instance_type="ml.m5.large",
    use_spot_instances=True,
    max_wait=600,
    max_run=600,
    role=get_execution_role(),
)

# Launch Training job
sklearn_estimator.fit()

2022-06-11 09:09:15 Starting - Starting the training job...
2022-06-11 09:09:17 Starting - Launching requested ML instancesProfilerReport-1654938555: InProgress
.........
2022-06-11 09:11:05 Starting - Preparing the instances for training.........
2022-06-11 09:12:46 Downloading - Downloading input data
2022-06-11 09:12:46 Training - Downloading the training image...
2022-06-11 09:13:06 Training - Training image download completed. Training in progress.[34m2022-06-11 09:13:00,534 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-06-11 09:13:00,536 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-11 09:13:00,545 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-06-11 09:13:32,470 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting

## Check the location of the saved model

In [18]:
import boto3
sm_client = boto3.client("sagemaker")

# Obtain the location of the model stored on S3 - Optional
# You can directly copy the location of the artifact from S3 also!
training_job_name = sklearn_estimator.latest_training_job.name
model_artifact = sm_client.describe_training_job(
    TrainingJobName=training_job_name
)["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Training job name: {training_job_name}")

Training job name: knn-pipeline-run-2022-06-11-09-09-15-420
Model artifact is stored at:
s3://sagemaker-us-east-1-298138509966/knn-pipeline-run-2022-06-11-09-09-15-420/output/model.tar.gz


## Hyperparameter Tuning
- There are three types of parameters we can tune: https://sagemaker.readthedocs.io/en/stable/api/training/tuner.html
    - IntegerParameter
    - ContinuousParameter
    - CategoricalParameter

In [20]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter

# Define exploration boundaries
hyperparameter_ranges = {
    "n_neighbors": IntegerParameter(1, 20),
}

# Create optimizer
optimizer = HyperparameterTuner(
    base_tuning_job_name="knn-pipeline-tuner",
    estimator=sklearn_estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type="Maximize",
    objective_metric_name="train-accuracy",
    metric_definitions=[
        {"Name": "train-accuracy", "Regex": "Training Accuracy: ([0-9.]+).*$"},
        {"Name": "test-accuracy", "Regex": "Testing Accuracy: ([0-9.]+).*$"}
    ],
    max_jobs=10,
    max_parallel_jobs=2,
)

# Launch Optimizer job
optimizer.fit()

............................................................................................................................................................................................................................................................................................!


In [24]:
# Analyse tuning results
results = optimizer.analytics().dataframe()

results.sort_values("FinalObjectiveValue", ascending=False).head()

Unnamed: 0,n_neighbors,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
9,1.0,knn-pipeline-tuner-220611-0916-001-00a849dc,Completed,1.0,2022-06-11 09:19:17+00:00,2022-06-11 09:21:03+00:00,106.0
4,3.0,knn-pipeline-tuner-220611-0916-006-77833ec6,Completed,0.8764,2022-06-11 09:28:23+00:00,2022-06-11 09:29:44+00:00,81.0
0,5.0,knn-pipeline-tuner-220611-0916-010-b19d6018,Completed,0.8577,2022-06-11 09:38:45+00:00,2022-06-11 09:39:44+00:00,59.0
1,4.0,knn-pipeline-tuner-220611-0916-009-73d379b9,Completed,0.8464,2022-06-11 09:37:30+00:00,2022-06-11 09:38:44+00:00,74.0
5,2.0,knn-pipeline-tuner-220611-0916-005-dd8c8ae7,Completed,0.8464,2022-06-11 09:28:12+00:00,2022-06-11 09:29:50+00:00,98.0
