# Multiple Models by Versions

## Training models

Previous activity: preparing and splitting data

In [None]:
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer

In [None]:
# S3 bucket and key prefix that hold the prepared data
bucket_name = "sagemaker-loan-classification"
prefix = "xgboost"

# Destination for the artifacts produced by the training jobs
output_path = f"s3://{bucket_name}/{prefix}/saved_model"

# CSV input channels handed to the estimator when fitting
input_test = sagemaker.TrainingInput(
    content_type="csv",
    s3_data=f"s3://{bucket_name}/{prefix}/test",
)
input_train = sagemaker.TrainingInput(
    content_type="csv",
    s3_data=f"s3://{bucket_name}/{prefix}/train",
)

In [None]:
# Build the estimator
def xgboost_fit_ver(version, use_spot_instance=False, max_run=3200, max_wait=7600):
    """Train the loan-classification XGBoost model with one container version.

    Reads the notebook-level ``bucket_name``, ``prefix``, ``output_path``,
    ``input_train`` and ``input_test`` settings.

    Args:
        version: Version tag of the built-in XGBoost container to train with
            (for example ``"0.90-1"`` or ``"1.3-1"``).
        use_spot_instance: When true, request managed spot training and
            configure an S3 checkpoint location for the job.
        max_run: Maximum training run time, in seconds.
        max_wait: Maximum time, in seconds, to wait for a spot training job.
            Only forwarded when ``use_spot_instance`` is true.
    """
    name = f"xgboost-loan-{version}"
    print(f"____Running {name}")

    # Checkpoints and max_wait only apply to spot training; for on-demand
    # training both are left unset.
    checkpoint = None
    if use_spot_instance:
        checkpoint = f"s3://{bucket_name}/{prefix}/checkpoints/{name}"
        print(f"___The checkpoint is saved to: {checkpoint}")
    else:
        max_wait = None

    # One session is shared by the image lookup and the estimator
    session = sagemaker.Session()

    # Set the container based on the version
    container = sagemaker.image_uris.retrieve(
        "xgboost",
        session.boto_region_name,
        version=version,
    )

    xgb = sagemaker.estimator.Estimator(
        image_uri=container,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type="ml.m5.xlarge",
        output_path=output_path,
        sagemaker_session=session,
        checkpoint_s3_uri=checkpoint,
        use_spot_instances=use_spot_instance,
        max_run=max_run,
        max_wait=max_wait,
        # Training job names allow only alphanumerics and hyphens, so the dot
        # in the version tag is replaced.
        base_job_name=name.replace(".", "-"),
    )

    # Set the hyperparameters
    xgb.set_hyperparameters(
        colsample_bytree=0.478,
        gamma=2.387,
        eta=0.175,
        max_depth=15,
        min_child_weight=7,
        num_round=86,
        subsample=0.80,
        num_class=2,
        objective="multi:softmax",
    )

    # Fit the model; the test split is supplied as the validation channel
    xgb.fit(
        {
            "train": input_train,
            "validation": input_test,
        }
    )

    print(f"____Finish running {xgb.latest_training_job.name}")

In [None]:
# Train with the XGBoost 0.90-1 container (spot instances disabled)
xgboost_fit_ver(version="0.90-1", use_spot_instance=False)

In [None]:
# Train with the XGBoost 1.3-1 container (spot instances disabled)
xgboost_fit_ver(version="1.3-1", use_spot_instance=False)

## Deploy the models in AWS

## Model predictions

In [None]:
# Read the local test split and keep only the feature columns as an array
test = pd.read_csv("test.csv").drop(columns=["Loan_Status"]).values

In [None]:
# Attach to the deployed endpoint; request payloads are serialized as CSV
xgb_model = sagemaker.predictor.Predictor(
    endpoint_name="xgboost-version",
    serializer=CSVSerializer(),
)

In [None]:
# target_variant=None: no variant is pinned, so the endpoint routes the
# request itself according to the variant weights.
pred1 = xgb_model.predict(test, target_variant=None)
pred1 = pred1.decode('utf-8')

# Peek at the start of the decoded response
print(pred1[:10])

In [None]:
# Skip the first character of the decoded response and parse the remainder as
# comma-separated numbers into a NumPy array.
# NOTE(review): presumably the response starts with a non-numeric character
# that must be dropped; confirm against the printed output above.
pred1 = np.fromstring(pred1[1:], sep=',')

In [None]:
# Pin the request to the "version-0.90-1" variant
pred2 = xgb_model.predict(test, target_variant="version-0.90-1")
# Decode the bytes, skip the first character, and parse the comma-separated values
pred2 = np.fromstring(pred2.decode('utf-8')[1:], sep=',')

In [None]:
# Pin the request to the "version-1.3-1" variant
pred3 = xgb_model.predict(test, target_variant="version-1.3-1")
# Decode the bytes, skip the first character, and parse the comma-separated values
pred3 = np.fromstring(pred3.decode('utf-8')[1:], sep=',')

In [None]:
# Count, element by element, how the three prediction arrays agree or differ
print(f"""
The number of predictions when:
None = 0.90-1 = 1.3-1: {((pred1 == pred2) & (pred2 == pred3)).sum()}
None = 0.90-1 != 1.3-1: {((pred1 == pred2) & (pred2 != pred3)).sum()}
None != 0.90-1 = 1.3-1: {((pred1 != pred2) & (pred2 == pred3)).sum()}
None = 1.3-1 != 0.90-1: {((pred1 == pred3) & (pred2 != pred3)).sum()}
""")

In [None]:
# Delete the endpoint once it is no longer needed.
# xgb_model.delete_endpoint()