### Mobile Price Classification using SKLearn Custom Script in Sagemaker

In [1]:
!pip install sagemaker scikit-learn pandas numpy ipykernel

Collecting sagemaker
  Downloading sagemaker-2.232.1-py3-none-any.whl.metadata (16 kB)
Collecting pathos (from sagemaker)
  Downloading pathos-0.3.2-py3-none-any.whl.metadata (11 kB)
Collecting sagemaker-core<2.0.0,>=1.0.0 (from sagemaker)
  Downloading sagemaker_core-1.0.6-py3-none-any.whl.metadata (4.9 kB)
Collecting schema (from sagemaker)
  Downloading schema-0.7.7-py2.py3-none-any.whl.metadata (34 kB)
Collecting smdebug-rulesconfig==1.0.1 (from sagemaker)
  Downloading smdebug_rulesconfig-1.0.1-py2.py3-none-any.whl.metadata (943 bytes)
Collecting tblib<4,>=1.7.0 (from sagemaker)
  Downloading tblib-3.0.0-py3-none-any.whl.metadata (25 kB)
Collecting platformdirs (from sagemaker)
  Downloading platformdirs-4.3.6-py3-none-any.whl.metadata (11 kB)
Collecting mock<5.0,>4.0 (from sagemaker-core<2.0.0,>=1.0.0->sagemaker)
  Downloading mock-4.0.3-py3-none-any.whl.metadata (2.8 kB)
Collecting ppft>=1.7.6.8 (from pathos->sagemaker)
  Downloading ppft-1.7.6.8-py3-none-any.whl.metadata (12 kB

In [32]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'phonepriceclassification' 
print("Using bucket " + bucket)

Using bucket phonepriceclassification


In [3]:
df = pd.read_csv("mob_price_classification_train.csv")

In [4]:
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
df.shape

(2000, 21)

In [6]:
# ['Low_Risk','High_Risk'],[0,1]
df['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [7]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [8]:
df.shape

(2000, 21)

In [9]:
# Find the Percentage of Values are missing
df.isnull().mean() * 100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [10]:
features = list(df.columns)
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [11]:
label = features.pop(-1) #Remove just the last feature
label

'price_range'

In [12]:
x = df[features]
y = df[label]

In [13]:
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [14]:
# {0: 'Low_Risk',1: 'High_Risk'}
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [15]:
x.shape

(2000, 20)

In [16]:
y.value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.15, random_state=0)

In [18]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1700, 20)
(300, 20)
(1700,)
(300,)


In [19]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

testX = pd.DataFrame(X_test)
testX[label] = y_test

In [20]:
print(trainX.shape)
print(testX.shape)

(1700, 21)
(300, 21)


In [21]:
trainX.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1452,1450,0,2.1,0,1,0,31,0.6,114,5,...,1573,1639,794,11,5,9,0,1,1,1
1044,1218,1,2.8,1,3,0,39,0.8,150,7,...,1122,1746,1667,10,0,12,0,0,0,1
1279,1602,0,0.6,0,12,0,58,0.4,170,1,...,1259,1746,3622,17,2,17,0,1,1,3
674,1034,0,2.6,1,2,1,45,0.3,190,3,...,182,1293,969,15,1,7,1,0,0,0
1200,530,0,2.4,0,1,0,32,0.3,88,6,...,48,1012,959,17,7,6,0,1,0,0


In [21]:
trainX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [22]:
testX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [23]:
trainX.to_csv("train-V-1.csv",index = False)
testX.to_csv("test-V-1.csv", index = False)

In [24]:
bucket

'phone-price-classification'

In [33]:
# send data to S3. SageMaker will take training data from s3
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"
trainpath = sess.upload_data(
    path="train-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path="test-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)
print(trainpath)
print(testpath)

s3://phonepriceclassification/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv
s3://phonepriceclassification/sagemaker/mobile_price_classification/sklearncontainer/test-V-1.csv


In [26]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
  
    print("Training RandomForest Model.....")
    print()
    model =  RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose = 3,n_jobs=-1)
    model.fit(X_train, y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Writing script.py


In [29]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::495599735524:role/sagemaker",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
)

In [34]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-09-22-18-30-49-498


2024-09-22 18:30:51 Starting - Starting the training job...
2024-09-22 18:31:05 Starting - Preparing the instances for training...
2024-09-22 18:31:31 Downloading - Downloading input data...
2024-09-22 18:32:02 Downloading - Downloading the training image...
2024-09-22 18:32:53 Training - Training image download completed. Training in progress.
2024-09-22 18:32:53 Uploading - Uploading generated training model2024-09-22 18:32:44,974 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-09-22 18:32:44,977 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-22 18:32:45,016 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-09-22 18:32:45,160 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-22 18:32:45,172 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-22 18:32:45,183 sagemaker-training-toolkit INFO

In [35]:
sklearn_estimator.latest_training_job.wait(
    logs="None"
)  
"""
# This accesses the most recently created training job associated with the sklearn_estimator.
.wait(logs="None") This method pauses the execution of the script until the training job is complete. 
The logs="None" argument specifies that no logs will be printed to the console 
during the wait. You could also set it to "All" to see all logs or "Tail" to see 
logs in real-time.
"""

artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]
"""
sm_boto3.describe_training_job(...): This calls the SageMaker Boto3 client to describe the specified training job.
TrainingJobName=sklearn_estimator.latest_training_job.name: This passes the name of the latest training job to the describe_training_job method, allowing it to fetch details about that job.
["ModelArtifacts"]["S3ModelArtifacts"]: This accesses the specific key in the returned dictionary that contains the S3 URI of the model artifacts. This is where the trained model is saved after the training job completes.
"""

print("Model artifact persisted at " + artifact)


2024-09-22 18:33:06 Starting - Preparing the instances for training
2024-09-22 18:33:06 Downloading - Downloading the training image
2024-09-22 18:33:06 Training - Training image download completed. Training in progress.
2024-09-22 18:33:06 Uploading - Uploading generated training model
2024-09-22 18:33:06 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-495599735524/RF-custom-sklearn-2024-09-22-18-30-49-498/output/model.tar.gz


In [36]:
artifact

's3://sagemaker-us-east-1-495599735524/RF-custom-sklearn-2024-09-22-18-30-49-498/output/model.tar.gz'

In [40]:
from sagemaker.sklearn.model import SKLearnModel
from time import (
    gmtime,
    strftime,
)  # These functions are used to generate a timestamp for the model name.


# This line creates a unique model name by appending the current time (in UTC) to the string "Custom-sklearn-model-". The format ensures that each model name is distinct, which is helpful for versioning.
model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) 


# SKLearnModel class is used to create a model object for deploying Scikit-learn models in SageMaker.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::495599735524:role/sagemaker",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)
"""
name=model_name: This sets the name of the model to the generated unique name.
model_data=artifact: This specifies the S3 path to the model artifacts that were produced by the training job (as obtained earlier). This is where the trained model is stored.
role="arn:aws:iam::566373416292:role/service-role/AmazonSageMaker-ExecutionRole-20230120T164209": This IAM role provides the necessary permissions for SageMaker to access the S3 bucket and other resources. Ensure this role has appropriate permissions for deployment and inference.
entry_point="script.py": This specifies the entry point script that SageMaker will use for inference. This script typically contains the logic for loading the model and processing input data for predictions.
framework_version=FRAMEWORK_VERSION: This sets the version of the Scikit-learn framework used during training, ensuring compatibility.
"""

'\nname=model_name: This sets the name of the model to the generated unique name.\nmodel_data=artifact: This specifies the S3 path to the model artifacts that were produced by the training job (as obtained earlier). This is where the trained model is stored.\nrole="arn:aws:iam::566373416292:role/service-role/AmazonSageMaker-ExecutionRole-20230120T164209": This IAM role provides the necessary permissions for SageMaker to access the S3 bucket and other resources. Ensure this role has appropriate permissions for deployment and inference.\nentry_point="script.py": This specifies the entry point script that SageMaker will use for inference. This script typically contains the logic for loading the model and processing input data for predictions.\nframework_version=FRAMEWORK_VERSION: This sets the version of the Scikit-learn framework used during training, ensuring compatibility.\n'

In [41]:
model_name

'Custom-sklearn-model-2024-09-22-18-45-29'

In [42]:
##Endpoints deployment


# This line creates a unique name for the endpoint by appending the current timestamp (in UTC) to the string "Custom-sklearn-model-". This helps in managing multiple endpoints by ensuring each one has a distinct name.
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

"""
model.deploy(...): This method deploys the model to a SageMaker endpoint.
initial_instance_count=1: This specifies that one instance should be created for the endpoint. This instance will handle incoming requests for predictions.
instance_type="ml.m4.xlarge": This specifies the type of EC2 instance to use for the endpoint. The ml.m4.xlarge instance type is a good choice for many workloads, providing a balance of CPU and memory resources.
endpoint_name=endpoint_name: This assigns the previously generated unique endpoint name to the new SageMaker endpoint.
"""

EndpointName=Custom-sklearn-model-2024-09-22-18-45-34


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-09-22-18-45-29
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-09-22-18-45-34
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-09-22-18-45-34


------!

'\nmodel.deploy(...): This method deploys the model to a SageMaker endpoint.\ninitial_instance_count=1: This specifies that one instance should be created for the endpoint. This instance will handle incoming requests for predictions.\ninstance_type="ml.m4.xlarge": This specifies the type of EC2 instance to use for the endpoint. The ml.m4.xlarge instance type is a good choice for many workloads, providing a balance of CPU and memory resources.\nendpoint_name=endpoint_name: This assigns the previously generated unique endpoint name to the new SageMaker endpoint.\n'

In [43]:
endpoint_name

'Custom-sklearn-model-2024-09-22-18-45-34'


extract a subset of data from a DataFrame in pandas and convert it into a list format. 
[features]:

This indexing operation selects a subset of columns from the DataFrame. features is likely a list of column names that you are interested in. The resulting DataFrame will only contain these specified columns.
[0:2]:

This slice operation selects the first two rows of the resulting DataFrame from the previous step. In pandas, the slicing notation [start:end] is inclusive of the start index and exclusive of the end index.
.values:

This attribute retrieves the underlying numpy array from the DataFrame, which contains the data in a format that can be easily manipulated.
.tolist():

This method converts the numpy array into a Python list. The final result will be a list of lists, where each inner list corresponds to a row of the selected features.


In [46]:
testX[features][
    0:2
].values.tolist()  


[[1454.0,
  1.0,
  0.5,
  1.0,
  1.0,
  0.0,
  34.0,
  0.7,
  83.0,
  4.0,
  3.0,
  250.0,
  1033.0,
  3419.0,
  7.0,
  5.0,
  5.0,
  1.0,
  1.0,
  0.0],
 [1092.0,
  1.0,
  0.5,
  1.0,
  10.0,
  0.0,
  11.0,
  0.5,
  167.0,
  3.0,
  14.0,
  468.0,
  571.0,
  737.0,
  14.0,
  4.0,
  11.0,
  0.0,
  1.0,
  0.0]]

In [45]:
print(predictor.predict(testX[features][0:2].values.tolist()))

[3 0]


In [47]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '50c1b2e6-84a7-457e-8269-575c35cb454c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '50c1b2e6-84a7-457e-8269-575c35cb454c',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Sun, 22 Sep 2024 18:53:13 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}