In [2]:
import pandas as pd
# sagemaker
import sagemaker

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\rushikesh.darge\AppData\Local\sagemaker\sagemaker\config.yaml


In [None]:
!pip install sagemaker pandas boto3 numpy scikit-learn

In [19]:
# Read the raw dataset from disk and report its (rows, columns) shape.
raw_csv_path = r'..\data\data_files\dataset_file.csv'
df = pd.read_csv(raw_csv_path)
print(df.shape)

(2000, 21)


In [21]:
# Preview three randomly drawn rows of the dataset.
df.sample(n=3)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1623,1862,0,1.5,0,1,0,62,1.0,182,1,...,386,1046,1017,11,3,9,1,0,0,1
1610,1793,0,2.7,0,12,1,44,0.7,175,5,...,655,1459,2803,19,9,6,1,1,1,3
849,1286,1,2.3,0,10,1,16,0.1,98,2,...,18,662,424,10,5,4,1,1,0,0


In [22]:
# Separate the features from the "price_range" target, then hold out 15% of
# the rows as a test set (fixed random_state so the split is repeatable).
from sklearn.model_selection import train_test_split

y = df["price_range"]
X = df.drop(columns="price_range")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=101
)

In [23]:
# Show the shape of every split: train/test features first, then train/test targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1700, 20)
(300, 20)
(1700,)
(300,)


In [24]:
# Build the model-ready frames: the features of each split plus the target as
# the LAST column (training_script.py treats the last column as the label).
# `assign` always returns a new frame, so X_train / X_test are never modified.
# Wrapping them in `pd.DataFrame(...)` and then setting a column can write
# through to the original frame on pandas versions where the constructor
# shares the underlying data instead of copying it.
trainX = X_train.assign(price_range=y_train)
testX = X_test.assign(price_range=y_test)

In [25]:
# Each frame should now carry one more column than its feature split (the target).
for frame in (trainX, testX):
    print(frame.shape)

(1700, 21)
(300, 21)


In [27]:
# Persist both model-ready frames as CSV files, leaving out the index column.
model_ready_outputs = (
    (trainX, r'..\data\model_ready_data\train_V-1.csv'),
    (testX, r'..\data\model_ready_data\test_V-1.csv'),
)
for frame, csv_path in model_ready_outputs:
    frame.to_csv(csv_path, index=False)

# Uploading data

In [28]:
import sagemaker
import boto3

# Set up the AWS handles used by the rest of the notebook: a low-level
# SageMaker client, a SageMaker SDK session, that session's region, and the
# name of the S3 bucket that will receive the data.
sm_boto3 = boto3.client('sagemaker')
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'mobbucket-sagemaker123'
print(f'Using bucket {bucket}')

Using bucket mobbucket-sagemaker123


In [33]:
# Upload the train/test CSVs to S3 so SageMaker can read them during training.
# The local paths are the same relative paths this notebook wrote the CSVs to,
# so the upload always ships the files that were just saved instead of relying
# on an absolute path that only exists on one machine.
sk_prefix = 'sagemaker/classification_model/container'

train_path = sess.upload_data(r'..\data\model_ready_data\train_V-1.csv', bucket=bucket, key_prefix=sk_prefix)
test_path = sess.upload_data(r'..\data\model_ready_data\test_V-1.csv', bucket=bucket, key_prefix=sk_prefix)

# Training on SageMaker

In [52]:
%%writefile training_script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
import sklearn 
import joblib 
import boto3 
import pathlib
from io import StringIO
import argparse 
import joblib 
import os
import numpy as np 
import pandas as pd

def model_fn(model_dir):
    """Load the trained classifier from ``model_dir``.

    Reads the ``model.joblib`` file inside ``model_dir`` with ``joblib.load``
    and returns the deserialized object.

    NOTE(review): presumably this is the model-loading hook the SageMaker
    scikit-learn serving container looks for in the entry-point script.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

if __name__ == "__main__":
    print("[INFO] Extracting arguments")
    arg_parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client arrive as command-line arguments to
    # this script; the defaults below apply when a flag is omitted.
    arg_parser.add_argument("--n_estimators", type=int, default=10)
    arg_parser.add_argument("--random_state", type=int, default=0)

    # Model output directory and input data directories default to the SM_*
    # environment variables; the CSV file names inside the train/test
    # directories can be overridden as well.
    arg_parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    arg_parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    arg_parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    arg_parser.add_argument("--train_data", type=str, default="train_V-1.csv")
    arg_parser.add_argument("--test_data", type=str, default="test_V-1.csv")

    # parse_known_args tolerates extra flags this script does not declare.
    args, _ = arg_parser.parse_known_args()

    train_csv = os.path.join(args.train, args.train_data)
    test_csv = os.path.join(args.test, args.test_data)
    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    print(f"[INFO] train data shape: {train_df.shape}")
    print(f"[INFO] test data shape: {test_df.shape}")

    # The last column of the training CSV is the label; every column before it
    # is a feature. The same column names are used to slice the test frame.
    column_names = list(train_df.columns)
    label = column_names[-1]
    features = column_names[:-1]

    X_train, y_train = train_df[features].values, train_df[label].values
    X_test, y_test = test_df[features].values, test_df[label].values

    print("[INFO] training model")
    clf = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=1,
    )
    clf.fit(X_train, y_train)

    # Persist the fitted model under the file name that model_fn loads.
    print("[INFO] saving model")
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(clf, model_path)

    # Evaluate on the held-out test set and print the metrics.
    y_pred_test = clf.predict(X_test)
    print("[INFO] classification report")
    print(classification_report(y_test, y_pred_test))

    print("[INFO] confusion matrix")
    print(confusion_matrix(y_test, y_pred_test))

    print("[INFO] accuracy score")
    print(accuracy_score(y_test, y_pred_test))

Overwriting training_script.py


# Running script

In [53]:
# Display the S3 URI that the training-data upload returned.
train_path

's3://mobbucket-sagemaker123/sagemaker/classification_model/container/train_V-1.csv'

In [54]:
from sagemaker.sklearn.estimator import SKLearn

# Configure a SageMaker scikit-learn training job that runs training_script.py
# on a single ml.m5.large instance with spot instances enabled, using a
# max_run of 3600 and a larger max_wait of 7200.
FRAMEWORK_VERSION = "0.23-1"
sklearn_estimator = SKLearn(
    entry_point="training_script.py",
    role='arn:aws:iam::891377385044:role/service-role/AmazonSageMaker-ExecutionRole-20240209T115222',
    framework_version=FRAMEWORK_VERSION,
    instance_count=1,
    instance_type="ml.m5.large",
    base_job_name="custom-training-sagemaker",
    sagemaker_session=sess,
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

# Map each input channel name to the S3 URI uploaded earlier, then launch the
# job and block until it finishes (wait=True).
training_channels = {"train": train_path, "test": test_path}
sklearn_estimator.fit(training_channels, wait=True)

INFO:sagemaker:Creating training-job with name: custom-training-sagemaker-2024-07-08-14-24-56-687


2024-07-08 14:24:58 Starting - Starting the training job...
2024-07-08 14:25:14 Starting - Preparing the instances for training...
2024-07-08 14:25:45 Downloading - Downloading input data...
2024-07-08 14:26:10 Downloading - Downloading the training image...
2024-07-08 14:27:06 Training - Training image download completed. Training in progress.
2024-07-08 14:27:06 Uploading - Uploading generated training model2024-07-08 14:27:00,163 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-07-08 14:27:00,167 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-08 14:27:00,215 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-07-08 14:27:00,411 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-08 14:27:00,424 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-08 14:27:00,437 sagemaker-training-toolkit INFO

# Creating endpoint

In [56]:
# sklearn_estimator.latest_training_job.wait(logs="None")
# artifact = sm_boto3.describe_training_job(
#     TrainingJobName=sklearn_estimator.latest_training_job.name
# )['ModelArtifacts']['S3ModelArtifacts']

# print("Model Artifacts: {}".format(artifact))

In [61]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = 'custom-sklearn-model-V1'

# Wrap the artifact produced by the training job above in a deployable model.
# `sklearn_estimator.model_data` is the S3 location of the model.tar.gz written
# by the estimator's latest training job, so a re-run of the notebook deploys
# the freshly trained model instead of an artifact path copied from one
# specific past job. training_script.py is passed again as the entry point
# because it also defines model_fn, which loads model.joblib.
model = SKLearnModel(
    name=model_name,
    model_data=sklearn_estimator.model_data,
    role='arn:aws:iam::891377385044:role/service-role/AmazonSageMaker-ExecutionRole-20240209T115222',
    entry_point="training_script.py",
    framework_version=FRAMEWORK_VERSION
)

In [62]:
# Display the SKLearnModel object constructed in the previous cell.
model

<sagemaker.sklearn.model.SKLearnModel at 0x1c3a7a0a3e0>

In [64]:
# Deploy the model to an endpoint that reuses the model's name, hosted on a
# single ml.m4.xlarge instance; `deploy` returns the predictor used below.
endpoint_name = model_name

predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: custom-sklearn-model-V1
INFO:sagemaker:Creating endpoint-config with name custom-sklearn-model-V1
INFO:sagemaker:Creating endpoint with name custom-sklearn-model-V1


------!

In [65]:
# Display the predictor object returned by model.deploy.
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x1c3a4cb70a0>

In [74]:
# Show the two test rows (positions 2 and 3) that will be sent to the endpoint,
# as plain nested Python lists.
X_test.iloc[2:4].values.tolist()

[[586.0,
  0.0,
  1.4,
  1.0,
  10.0,
  1.0,
  8.0,
  0.5,
  142.0,
  8.0,
  14.0,
  116.0,
  598.0,
  3178.0,
  9.0,
  7.0,
  10.0,
  1.0,
  0.0,
  1.0],
 [1180.0,
  1.0,
  1.2,
  0.0,
  5.0,
  1.0,
  50.0,
  1.0,
  144.0,
  4.0,
  20.0,
  925.0,
  1252.0,
  1464.0,
  7.0,
  3.0,
  20.0,
  1.0,
  1.0,
  1.0]]

In [75]:
# Send test rows 2 and 3 to the endpoint and print the predicted classes.
payload = X_test.values[2:4].tolist()
predictions = predictor.predict(payload)
print(predictions)

[2 0]


In [76]:
# Tear down everything the deployment created, not just the endpoint.
# The deploy logs show the endpoint config was created under the endpoint's
# name and the model under `model_name`. Leaving them behind clutters the
# account and lets a later deploy with the same fixed names pick up the old
# definitions instead of the newly trained model.
sm_boto3.delete_endpoint(EndpointName=endpoint_name)
sm_boto3.delete_endpoint_config(EndpointConfigName=endpoint_name)
sm_boto3.delete_model(ModelName=model_name)

{'ResponseMetadata': {'RequestId': '60b19250-fcb2-4314-b117-fcaf9b09a9c1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '60b19250-fcb2-4314-b117-fcaf9b09a9c1',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 08 Jul 2024 14:55:07 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}