## Sagemaker - Update Recommendation Model

In [1]:
import sklearn # Check Sklearn version
sklearn.__version__

'1.4.1.post1'

## 1. Initialize Boto3 SDK and create S3 bucket. 

In [2]:
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd
from io import BytesIO

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name

s3_bucket = "exercises-dataset"
bucket = "ml-data-repository-pp2959"
s3_client = boto3.client('s3')
s3_key_meta = 'last_trained.txt'


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
def get_last_train_timestamp(s3_bucket, s3_key):
    try:
        print("FETCHING LAST TRAIN TIMESTAMP .... ")
        response = s3_client.get_object(Bucket=s3_bucket, Key=s3_key)
        last_modified = response['Body'].read()
        return last_modified.decode("utf-8")
    except Exception as e:
        print(f"Error retrieving last dump timestamp: {e}")
        return '2024-05-12 17:40:40'  # Default timestamp if not found

In [4]:
def read_csv_from_s3(bucket_name, file_key):
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    csv_content = response['Body'].read()
    df = pd.read_csv(BytesIO(csv_content))
    return df

## 3. Data Exploration and Understanding.

In [5]:
get_last_train_timestamp(s3_bucket, "last_modified.txt")

FETCHING LAST TRAIN TIMESTAMP .... 


'2024-05-13 03:31:17'

In [6]:
s3_key = "last_modified.txt"
s3_key = f"exercise_data.csv"
print(s3_key)
df = read_csv_from_s3(s3_bucket, s3_key)

exercise_data.csv


In [7]:
df.head(10)

Unnamed: 0,user_id,age,gender,height,weight,heartrate,steps,exercise_id,timestamp
0,7,29,0,160,54,136.120604,15564,4,2023-05-14 20:03:28
1,7,74,0,161,93,117.660263,15993,458,2023-05-17 22:34:08
2,7,28,1,189,72,174.285228,5222,817,2023-05-19 04:33:20
3,7,59,1,168,95,114.096413,3970,339,2023-05-19 23:02:24
4,7,72,0,156,99,142.019195,18807,356,2023-05-21 05:03:12
5,7,46,0,158,51,74.003411,8090,750,2023-05-21 14:44:13
6,7,24,0,186,50,195.679663,17839,5,2023-05-24 17:31:46
7,7,43,1,165,59,97.875524,12809,791,2023-05-25 05:51:59
8,7,30,0,162,100,78.109635,5262,379,2023-05-30 13:40:07
9,7,26,1,182,85,114.581288,4339,715,2023-06-01 11:03:48


In [8]:
df.shape

(273, 9)

In [9]:
df.columns

Index(['user_id', 'age', 'gender', 'height', 'weight', 'heartrate', 'steps',
       'exercise_id', 'timestamp'],
      dtype='object')

In [10]:
# ['Low_Risk','High_Risk'],[0,1]
df = df.drop(['user_id', 'timestamp'], axis=1)
df = pd.get_dummies(df)

In [11]:
df.columns

Index(['age', 'gender', 'height', 'weight', 'heartrate', 'steps',
       'exercise_id'],
      dtype='object')

In [12]:
df.shape

(273, 7)

In [13]:
# Find the Percentage of Values are missing
df.isnull().mean() * 100

age            0.0
gender         0.0
height         0.0
weight         0.0
heartrate      0.0
steps          0.0
exercise_id    0.0
dtype: float64

In [14]:
features = list(df.columns)
features

['age', 'gender', 'height', 'weight', 'heartrate', 'steps', 'exercise_id']

In [15]:
label = 'exercise_id'
features.remove(label)
label

'exercise_id'

In [16]:
x = df[features]
y = df[label]

In [17]:
x.head()

Unnamed: 0,age,gender,height,weight,heartrate,steps
0,29,0,160,54,136.120604,15564
1,74,0,161,93,117.660263,15993
2,28,1,189,72,174.285228,5222
3,59,1,168,95,114.096413,3970
4,72,0,156,99,142.019195,18807


In [18]:
# {0: 'Low_Risk',1: 'High_Risk'}
y.head()

0      4
1    458
2    817
3    339
4    356
Name: exercise_id, dtype: int64

In [19]:
x.shape

(273, 6)

In [20]:
y.value_counts()

exercise_id
4      10
5       9
2       5
3       4
161     3
       ..
291     1
755     1
333     1
67      1
297     1
Name: count, Length: 217, dtype: int64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=42, shuffle = True)

In [22]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(204, 6)
(69, 6)
(204,)
(69,)


## 4. Split the data into Train/Test CSV File. 

In [23]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

testX = pd.DataFrame(X_test)
testX[label] = y_test

In [24]:
print(trainX.shape)
print(testX.shape)

(204, 7)
(69, 7)


In [25]:
trainX.head()

Unnamed: 0,age,gender,height,weight,heartrate,steps,exercise_id
73,20,1,154,89,103.704838,17557,572
18,25,1,158,62,83.94152,15335,835
101,66,1,168,70,65.150717,4451,561
197,47,1,187,99,101.066757,7811,189
112,27,1,170,92,192.317727,4870,253


In [26]:
trainX.isnull().sum()

age            0
gender         0
height         0
weight         0
heartrate      0
steps          0
exercise_id    0
dtype: int64

In [27]:
testX.isnull().sum()

age            0
gender         0
height         0
weight         0
heartrate      0
steps          0
exercise_id    0
dtype: int64

## 5. Upload data into the S3 Bucket.

In [28]:
trainX.to_csv("train-V-1.csv",index = False)
testX.to_csv("test-V-1.csv", index = False)

In [29]:
# send data to S3. SageMaker will take training data from s3
sk_prefix = "sagemaker/exercise-data/sklearncontainer"
trainpath = sess.upload_data(
    path="train-V-1.csv", bucket = bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path="test-V-1.csv", bucket = bucket, key_prefix=sk_prefix
)

In [30]:
testpath

's3://ml-data-repository-pp2959/sagemaker/exercise-data/sklearncontainer/test-V-1.csv'

In [31]:
trainpath

's3://ml-data-repository-pp2959/sagemaker/exercise-data/sklearncontainer/train-V-1.csv'

## 6. Create Training Script

In [46]:
%%writefile exercise_script.py


from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import validation_curve, train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
import boto3

s3 = boto3.client("s3")
train_file = 'train-V-1.csv'
train_path = os.path.join("./", train_file)
s3.download_file("ml-data-repository-pp2959", "sagemaker/exercise-data/sklearncontainer/" + train_file, train_path)
train_df = pd.read_csv(train_path)
y_train = train_df["exercise_id"]

# inference functions ---------------

def input_fn(request_body, request_content_type):
    print(request_body)
    print(request_content_type)
    if request_content_type == "text/csv":
        request_body = request_body.strip()
        try:
            df = pd.read_csv(StringIO(request_body), header=None)
            return df
        
        except Exception as e:
            print(e)
    else:
        return """Please use Content-Type = 'text/csv' and, send the request!!""" 
 
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

def predict_fn(input_data, model):
    if type(input_data) != str:
        distances, indices = model.kneighbors(input_data)
        y_pred_test = []
        for n_idx in indices:
            y_pred_test.append(y_train[n_idx])
        print(y_pred_test)
        return y_pred_test
    else:
        return input_data
        
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")
    parser.add_argument("--s3-bucket", type=str, default="ml-data-repository-pp2959")
    parser.add_argument("--s3-data-key", type=str, default="sagemaker/exercise-data/sklearncontainer/")
    parser.add_argument("--s3-model-key", type=str, default="sagemaker/exercise-models/")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    
    train_path = os.path.join(args.train, args.train_file)
    test_path = os.path.join(args.test, args.test_file)
    model_path = os.path.join(args.model_dir, "model.joblib")
    s3.download_file(args.s3_bucket, args.s3_data_key + args.train_file, train_path)
    s3.download_file(args.s3_bucket, args.s3_data_key + args.test_file, test_path)
    s3.download_file(args.s3_bucket, args.s3_model_key + "model.joblib", model_path)
    
    print("MODEL PATH : ", model_path)
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    print(train_df.head())
    print(test_df.head())
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
    
    max_k = 4
    n_fold = 10
    target = 0.1

    model = KNeighborsRegressor(max_k)
#     model = joblib.load(model_path)
    model.fit(X_train, y_train)
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    s3.upload_file(model_path, args.s3_bucket, args.s3_model_key + "model.joblib")
    print("Model persisted at " + args.s3_bucket + args.s3_model_key + "model.joblib")
    print()

    
    distances, indices = model.kneighbors(X_test)
    y_pred_test = []
    for n_idx in indices:
        y_pred_test.append(y_train[n_idx])
    

#     print()
#     print("---- METRICS RESULTS FOR TESTING DATA ----")
#     print()
#     print("Total Rows are: ", X_test.shape[0])
#     print('[TESTING] Model Accuracy is: ', test_acc)
#     print('[TESTING] Testing Report: ')
#     print(test_rep)


Overwriting exercise_script.py


In [47]:
! python exercise_script.py --n_estimators 100 \
                   --random_state 0 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \

[INFO] Extracting arguments
SKLearn Version:  1.4.1.post1
Joblib Version:  1.3.2
[INFO] Reading data

MODEL PATH :  ./model.joblib
   age  gender  height  weight   heartrate  steps  exercise_id
0   20       1     154      89  103.704838  17557          572
1   25       1     158      62   83.941520  15335          835
2   66       1     168      70   65.150717   4451          561
3   47       1     187      99  101.066757   7811          189
4   27       1     170      92  192.317727   4870          253
   age  gender  height  weight   heartrate  steps  exercise_id
0   44       1     166      71  116.765969  18190          761
1   75       0     181      89  101.177200   9326          190
2   78       1     189      70   69.639234   8372          293
3   79       1     162      89  166.047275  16158            5
4   71       0     182      50   79.183069   6565          356
Building training and testing datasets

Column order: 
['age', 'gender', 'height', 'weight', 'heartrate', 'steps'

## 7. Train script in-side Sagemaker container.

In [48]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "1.2-1"

sklearn_estimator = SKLearn(
    entry_point="exercise_script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="Exercise-Model",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [49]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)

INFO:sagemaker:Creating training-job with name: Exercise-Model-2024-05-13-15-15-54-768


2024-05-13 15:15:55 Starting - Starting the training job...
2024-05-13 15:16:13 Starting - Preparing the instances for training...
2024-05-13 15:16:47 Downloading - Downloading input data...
2024-05-13 15:17:17 Downloading - Downloading the training image......
2024-05-13 15:18:18 Training - Training image download completed. Training in progress.
2024-05-13 15:18:18 Uploading - Uploading generated training model[34m2024-05-13 15:18:12,610 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-05-13 15:18:12,614 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-05-13 15:18:12,617 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-05-13 15:18:12,634 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-05-13 15:18:12,880 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m


## 8. Store Model Artifacts(model.tar.gz) into the S3 Bucket. 

In [56]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-05-13 15:18:34 Starting - Preparing the instances for training
2024-05-13 15:18:34 Downloading - Downloading the training image
2024-05-13 15:18:34 Training - Training image download completed. Training in progress.
2024-05-13 15:18:34 Uploading - Uploading generated training model
2024-05-13 15:18:34 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-891376963426/Exercise-Model-2024-05-13-15-15-54-768/output/model.tar.gz


In [57]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

In [52]:
model_name = "Recommendation-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

In [53]:
model_name

'Recommendation-model-2024-05-13-15-19-20'

In [61]:
import boto3

# Initialize SageMaker client
sagemaker_client = boto3.client('sagemaker')

# Specify the S3 location of the new model artifact
new_model_url = artifact

# Specify the name of the existing SageMaker endpoint configuration
endpoint_config_name = 'Recommend-Exercise-Model-3'

# Specify the name of the existing SageMaker endpoint
endpoint_name = 'Recommend-Exercise-Model'

response = sagemaker_client.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'VariantName': 'variant-1',
        'ModelName': model_name,
        'InitialInstanceCount': 1,
        'InstanceType': 'ml.m5.large',
        'InitialVariantWeight': 1
    }]
)

print(response)


{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:891376963426:endpoint-config/Recommend-Exercise-Model-3', 'ResponseMetadata': {'RequestId': '1705fd9a-6bcc-42d2-a1de-b956a6a02b04', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '1705fd9a-6bcc-42d2-a1de-b956a6a02b04', 'content-type': 'application/x-amz-json-1.1', 'content-length': '107', 'date': 'Mon, 13 May 2024 15:27:36 GMT'}, 'RetryAttempts': 0}}


In [62]:
# Update the SageMaker endpoint configuration to use the updated model
response = sagemaker_client.update_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
    RetainAllVariantProperties=True,
)

# Print the response
print(response)


{'EndpointArn': 'arn:aws:sagemaker:us-east-1:891376963426:endpoint/Recommend-Exercise-Model', 'ResponseMetadata': {'RequestId': '00dd7af7-c77a-41a6-9591-3fcfa9b1ed13', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '00dd7af7-c77a-41a6-9591-3fcfa9b1ed13', 'content-type': 'application/x-amz-json-1.1', 'content-length': '92', 'date': 'Mon, 13 May 2024 15:27:41 GMT'}, 'RetryAttempts': 0}}


## 9. Deploy Sagemaker Endpoint(API) for trained model, and test it. 

In [None]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(artifact)
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role=get_execution_role(),
    entry_point="exercise_script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
endpoint_name = "Recommend-Exercise-Model"
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
    update_endpoint = True
)

In [None]:
testX.iloc[:,:-1]

In [None]:
text_csv = testX.iloc[:, :-1].to_csv(index = False, header = False)
print(text_csv)

In [None]:
import requests
import json

In [None]:
sagemaker_runtime = boto3.client('runtime.sagemaker')

In [None]:
response = sagemaker_runtime.invoke_endpoint(EndpointName = endpoint_name,
                                            ContentType = 'text/csv',
                                            Body = text_csv)

In [None]:
print(response)

In [None]:
result = json.loads(response['Body'].read().decode())
print(result)

In [None]:
test_features = np.array(testX[features][0:2].values.tolist()[:-1])
test_features

In [None]:
print(predictor.predict(testX[features][0:2].values.tolist()))

## Don't forget to delete the endpoint !

In [None]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)