In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\com\AppData\Local\sagemaker\sagemaker\config.yaml


In [2]:
# Target S3 bucket for the training data (and, later, the uploaded CSVs).
bucket = "mobpredictionbucket"

# AWS handles: a low-level SageMaker client plus the SDK session the uploads go through.
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name

print(f"Using bucket {bucket}")

Using bucket mobpredictionbucket


In [3]:
# Load the dataset from a CSV in the notebook's working directory and preview the first rows.
df = pd.read_csv("mobile_prediction.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
df.shape

(2000, 21)

In [5]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [6]:
# Share of rows per price_range class (normalize=True returns proportions rather than counts).
df['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [7]:
# Number of missing values in each column.
df.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [8]:
# Every column name in df (predictors and target alike), shown for inspection.
features = list(df.columns)
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [9]:
# Derive both names straight from df so this cell is idempotent: popping from the
# existing `features` list would strip one more column every time the cell is re-run.
# The target is the last column; everything before it is a predictor.
label = df.columns[-1]
features = list(df.columns[:-1])

In [10]:
label

'price_range'

In [11]:
# Split the frame into the predictor columns (x) and the target column (y).
x = df[features]
y = df[label]

In [12]:
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [13]:
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [14]:
y.value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [15]:
# Hold out 15% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider stratify=y if per-class balance matters.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

In [16]:
(X_train.shape,X_test.shape, y_train.shape, y_test.shape)

((1700, 20), (300, 20), (1700,), (300,))

In [17]:
# Re-attach the target as the last column of each split.
# `assign` returns a new DataFrame, so X_train / X_test are left untouched. Wrapping them
# in pd.DataFrame(...) and then assigning a column can alias the original frame on older
# pandas versions and may trigger SettingWithCopyWarning.
trainX = X_train.assign(**{label: y_train})

testX = X_test.assign(**{label: y_test})

In [18]:
(trainX.shape, testX.shape)

((1700, 21), (300, 21))

In [19]:
# Write both splits to local CSV files. index=False leaves the DataFrame index out of the
# files, so the columns on disk are exactly the features followed by the label.
trainX.to_csv("train-V-1.csv", index=False)
testX.to_csv("test-V-1.csv", index=False)

In [20]:
# Both CSVs go to the same bucket and key prefix; only the local file differs.
sk_prefix = "sagemaker/mobile_price_prediction/sklearncontainer"
upload_kwargs = {"bucket": bucket, "key_prefix": sk_prefix}

# upload_data returns the uploaded object's location (printed as an s3:// URI below).
trainpath = sess.upload_data(path="train-V-1.csv", **upload_kwargs)
testpath = sess.upload_data(path="test-V-1.csv", **upload_kwargs)

In [21]:
print(trainpath, testpath)

s3://mobpredictionbucket/sagemaker/mobile_price_prediction/sklearncontainer/train-V-1.csv s3://mobpredictionbucket/sagemaker/mobile_price_prediction/sklearncontainer/test-V-1.csv


In [39]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load the classifier that the training entry point saved as ``model.joblib``.

    Args:
        model_dir: Directory containing ``model.joblib``.

    Returns:
        The object deserialized by ``joblib.load``.

    NOTE(review): presumably the SageMaker scikit-learn serving container looks this
    function up by name — confirm against the SDK documentation before renaming it.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

if __name__ == "__main__":
    # Training entry point: parse hyperparameters and data locations, fit a random
    # forest, persist it to the model directory, and print held-out metrics.
    parser = argparse.ArgumentParser()

    # Model hyperparameters.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Directory arguments default to the SM_* environment variables; os.environ.get
    # yields None when a variable is not set.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    # parse_known_args ignores unrecognised flags instead of aborting on them.
    args, _ = parser.parse_known_args()

    # Fail fast with a readable message when a directory is missing (e.g. the script is
    # run without the SM_* variables), rather than hitting a TypeError in os.path.join.
    required_dirs = {
        "--model-dir (SM_MODEL_DIR)": args.model_dir,
        "--train (SM_CHANNEL_TRAIN)": args.train,
        "--test (SM_CHANNEL_TEST)": args.test,
    }
    missing = [name for name, value in required_dirs.items() if value is None]
    if missing:
        parser.error("missing required location(s): " + ", ".join(missing))

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is the last column of the training CSV; every other column is a feature.
    features = list(train_df.columns)
    label = features.pop(-1)

    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print(f"Independent features: {features}")
    print(f"Dependent features: {label}")

    print("Training model")
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(X_train, y_train)

    # Persist under the same file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print(f"Model persisted at {model_path}")

    # Evaluate on the held-out test channel.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_report = classification_report(y_test, y_pred_test)

    print(f"Test accuracy: {test_acc}")
    print(f"Test report:\n {test_report}")



Overwriting script.py


In [40]:
from sagemaker.sklearn.estimator import SKLearn

# Framework version string shared by the training estimator and the model built later.
FRAMEWORK_VERSION = "0.23-1"

# Training-job definition: runs script.py on a single ml.m5.large instance.
# NOTE(review): the IAM role ARN (including the account id) is hardcoded here and again
# where the SKLearnModel is created — consider a single shared constant or configuration.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::448049826131:role/mobile_prediction",
    instance_count=1,
    instance_type = "ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    # These keys match the --n_estimators / --random_state arguments that script.py parses.
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    # Spot training is disabled.
    # NOTE(review): the SDK parameter is believed to be spelled `use_spot_instances`
    # (plural) — confirm against the SageMaker Estimator docs before enabling.
    # use_spot_instance = True,
    # max_wait = 7200,
    max_run = 3600
)

In [41]:
# Start the training job with the uploaded CSVs as the "train" and "test" input channels
# (presumably surfaced to script.py as SM_CHANNEL_TRAIN / SM_CHANNEL_TEST — confirm).
# wait=True blocks the cell until the job finishes.
sklearn_estimator.fit({"train":trainpath, "test":testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-08-25-19-42-19-640


2024-08-25 19:42:28 Starting - Starting the training job...
2024-08-25 19:43:04 Downloading - Downloading input data...
2024-08-25 19:43:29 Downloading - Downloading the training image...
2024-08-25 19:44:10 Training - Training image download completed. Training in progress..2024-08-25 19:44:13,964 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-08-25 19:44:13,968 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-25 19:44:14,008 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-08-25 19:44:14,173 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-25 19:44:14,185 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-25 19:44:14,198 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-25 19:44:14,207 sagemaker-training-toolkit INFO     Invoking user script
Trai

In [42]:
# Wait for the most recent training job, then look up where its model artifact was stored.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs=None)

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Model artifact persisted at {artifact}")

2024-08-25 19:44:33 Starting - Preparing the instances for training
2024-08-25 19:44:33 Downloading - Downloading the training image
2024-08-25 19:44:33 Training - Training image download completed. Training in progress.
2024-08-25 19:44:33 Uploading - Uploading generated training model
2024-08-25 19:44:33 Completed - Training job completed2024-08-25 19:44:13,964 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-08-25 19:44:13,968 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-25 19:44:14,008 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-08-25 19:44:14,173 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-25 19:44:14,185 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-25 19:44:14,198 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-25 19:44:14

In [43]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# A UTC timestamp suffix gives each model a distinct name from run to run.
model_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f"Custom-sklearn-model-{model_timestamp}"

# Wrap the trained artifact for deployment; script.py is passed as the entry point again.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::448049826131:role/mobile_prediction",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [44]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x1e4aeb80640>

In [47]:
# The endpoint gets its own UTC-timestamped name, generated separately from the model name.
endpoint_name = "Custom-sklearn-model-"+strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Deploy the model to a single ml.m5.xlarge instance; the returned predictor is used
# below to send inference requests.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name=endpoint_name
)

INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-08-25-19-47-45
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-08-25-19-51-34
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-08-25-19-51-34


-----!

In [48]:
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x1e4accabb80>

In [49]:
endpoint_name

'Custom-sklearn-model-2024-08-25-19-51-34'

In [50]:
# First two held-out rows (feature columns only, by position) as plain nested lists,
# used as the inference request payload.
sample = testX[features].iloc[:2].to_numpy().tolist()
sample

[[1646.0,
  0.0,
  2.5,
  0.0,
  3.0,
  1.0,
  25.0,
  0.6,
  200.0,
  2.0,
  5.0,
  211.0,
  1608.0,
  686.0,
  8.0,
  6.0,
  11.0,
  1.0,
  1.0,
  0.0],
 [1182.0,
  0.0,
  0.5,
  0.0,
  7.0,
  1.0,
  8.0,
  0.5,
  138.0,
  8.0,
  16.0,
  275.0,
  986.0,
  2563.0,
  19.0,
  17.0,
  19.0,
  1.0,
  0.0,
  0.0]]

In [52]:
# Send the sample rows to the deployed endpoint and display the returned predictions.
sample_preds = predictor.predict(sample)
sample_preds

array([0, 2], dtype=int64)

In [53]:
# Tear down the endpoint created above.
# NOTE(review): this call names only the endpoint; the endpoint configuration and the
# model created during deploy presumably remain — confirm and delete them if unwanted.
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'ac1aa3a1-13e7-448f-b237-2478eb3e19bb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ac1aa3a1-13e7-448f-b237-2478eb3e19bb',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Sun, 25 Aug 2024 20:00:21 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}