# Mobile price classification

In [4]:
import sagemaker
import pandas as pd
import boto3
from sklearn.model_selection import train_test_split

# SageMaker SDK session; the region name is read from its underlying boto session.
session = sagemaker.Session()
region = session.boto_session.region_name

# Low-level boto3 client for the SageMaker service.
Sagemaker = boto3.client("sagemaker")

# Name of the S3 bucket used by this notebook.
bucket = "mlmobilepriceclassificationbucket"

In [9]:
# Load the dataset from a CSV file addressed relative to the notebook.
csv_path = "../data/mobile_price_classification_data.csv"
df = pd.read_csv(csv_path)

In [10]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [13]:
# Share of rows per price_range class (normalized fractions rather than raw counts).
class_share = df["price_range"].value_counts(normalize=True)
class_share

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [15]:
# List the column labels of df.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [20]:
# Percentage of missing values in each column.
df.isna().mean().mul(100)

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [23]:
# Column labels as a plain Python list; shown as the cell output.
features = df.columns.tolist()
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [24]:
# Read the label from the frame itself instead of popping it off `features`:
# `features.pop(-1)` removes one more column each time the cell is re-run,
# which silently changes both the label and the feature list.
price_label = df.columns[-1]
# Rebuild the feature list (every column except the label) so that re-running
# this cell always yields the same result.
features = [column for column in df.columns if column != price_label]
price_label

'price_range'

In [27]:
# Feature matrix and target vector, both selected from df by column label.
x, y = df[features], df[price_label]

In [28]:
# Preview the first rows of the feature matrix.
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [29]:
# Preview the first values of the target vector.
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [41]:
# Hold out 15% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.15, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [42]:
# Shapes of the four split pieces, in the order x_train, y_train, x_test, y_test.
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((1700, 20), (1700,), (300, 20), (300,))

In [63]:
# Show the training labels as the cell output.
y_train

1452    1
1044    1
1279    3
674     0
1200    0
       ..
835     3
1216    1
1653    3
559     0
684     1
Name: price_range, Length: 1700, dtype: int64

In [50]:
# Build the training table as a new frame. `pd.DataFrame(x_train)` does not copy
# its input, so adding the label column to it could also alter `x_train` (or emit
# a SettingWithCopyWarning, depending on the pandas version). `assign` returns a
# fresh frame with the label appended as the last column, aligned on the row index.
trainData = x_train.assign(**{price_label: y_train})
trainData.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1452,1450,0,2.1,0,1,0,31,0.6,114,5,...,1573,1639,794,11,5,9,0,1,1,1
1044,1218,1,2.8,1,3,0,39,0.8,150,7,...,1122,1746,1667,10,0,12,0,0,0,1
1279,1602,0,0.6,0,12,0,58,0.4,170,1,...,1259,1746,3622,17,2,17,0,1,1,3
674,1034,0,2.6,1,2,1,45,0.3,190,3,...,182,1293,969,15,1,7,1,0,0,0
1200,530,0,2.4,0,1,0,32,0.3,88,6,...,48,1012,959,17,7,6,0,1,0,0


In [49]:
# Build the test table as a new frame. `pd.DataFrame(x_test)` does not copy its
# input, so adding the label column to it could also alter `x_test` (or emit a
# SettingWithCopyWarning, depending on the pandas version). `assign` returns a
# fresh frame with the label appended as the last column, aligned on the row index.
testData = x_test.assign(**{price_label: y_test})
testData.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
405,1454,1,0.5,1,1,0,34,0.7,83,4,...,250,1033,3419,7,5,5,1,1,0,3
1190,1092,1,0.5,1,10,0,11,0.5,167,3,...,468,571,737,14,4,11,0,1,0,0
1132,1524,1,1.8,1,0,0,10,0.6,174,4,...,154,550,2678,16,5,13,1,0,1,2
731,1807,1,2.1,0,2,0,49,0.8,125,1,...,337,1384,1906,17,13,13,0,1,1,2
1754,1086,1,1.7,1,0,1,43,0.2,111,6,...,56,1150,3285,11,5,17,1,1,0,2


In [52]:
# Number of missing values per column in the training table.
trainData.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [53]:
# Number of missing values per column in the test table.
testData.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [56]:
# Write both splits to CSV files in the working directory, leaving out the row index.
for frame, csv_name in ((trainData, "train-data-V-1.csv"), (testData, "test-data-V-1.csv")):
    frame.to_csv(csv_name, index=False)

In [57]:
# Show the bucket name that the upload cell will use.
bucket

'mlmobilepriceclassificationbucket'

In [58]:
# Upload both CSV files to the S3 bucket under one shared key prefix.
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"

upload_kwargs = {"bucket": bucket, "key_prefix": sk_prefix}
trainpath = session.upload_data(path="train-data-V-1.csv", **upload_kwargs)
testpath = session.upload_data(path="test-data-V-1.csv", **upload_kwargs)

In [95]:
# Show the value returned by the test-file upload.
testpath

's3://mlmobilepriceclassificationbucket/sagemaker/mobile_price_classification/sklearncontainer/test-data-V-1.csv'

In [99]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    """Load the estimator stored as ``model.joblib`` inside ``model_dir`` and return it."""
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)
    
if __name__ == "__main__":
    # Training entry point: parse arguments, load the CSV splits, fit a random
    # forest, persist it with joblib, then print metrics for the test split.

    print("[INFO] Extracting arguments")
    arg_parser = argparse.ArgumentParser()

    # Hyperparameters arrive as command-line arguments.
    arg_parser.add_argument("--n_estimators", type=int, default=100)
    arg_parser.add_argument("--random_state", type=int, default=0)

    # Directory arguments default to the values of the SM_* environment variables.
    for flag, env_name in (
        ("--output-data-dir", "SM_OUTPUT_DATA_DIR"),
        ("--model-dir", "SM_MODEL_DIR"),
        ("--train", "SM_CHANNEL_TRAIN"),
        ("--test", "SM_CHANNEL_TEST"),
    ):
        arg_parser.add_argument(flag, type=str, default=os.environ.get(env_name))

    # File names looked up inside the train / test directories.
    arg_parser.add_argument("--train-file", type=str, default="train-data-V-1.csv")
    arg_parser.add_argument("--test-file", type=str, default="test-data-V-1.csv")

    # Arguments the parser does not recognise are collected separately and ignored.
    args, _ = arg_parser.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_csv = os.path.join(args.train, args.train_file)
    train_df = pd.read_csv(train_csv)
    test_csv = os.path.join(args.test, args.test_file)
    test_df = pd.read_csv(test_csv)

    # The last column of the training file is the label; all others are features.
    features = train_df.columns.tolist()
    label = features.pop()

    print("Building training and testing datasets")
    print()
    X_train, X_test = train_df[features], test_df[features]
    y_train, y_test = train_df[label], test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ", label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    forest = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    print()

    # Persist the fitted model under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(forest, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate on the test split.
    test_predictions = forest.predict(X_test)
    test_acc = accuracy_score(y_test, test_predictions)
    test_rep = classification_report(y_test, test_predictions)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Overwriting script.py


In [100]:
from sagemaker.sklearn.estimator import SKLearn

# Framework version string handed to the SKLearn estimator.
FRAMEWORK_VERSION = "1.0-1"

# Hyperparameters; script.py reads them as command-line arguments.
rf_hyperparameters = {"n_estimators": 100, "random_state": 0}

# Estimator that runs script.py on one ml.m5.large instance with spot instances
# enabled (max_run=3600, max_wait=7200).
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::512195023911:role/sagemaker-role-mobproject",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters=rf_hyperparameters,
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
)

In [101]:
# Launch the training job with the uploaded train/test channels; wait=True makes
# the call block until the job is done.
channels = {"train": trainpath, "test": testpath}
sklearn_estimator.fit(channels, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-04-14-18-26-57-928


2024-04-14 18:27:07 Starting - Starting the training job...
2024-04-14 18:27:24 Starting - Preparing the instances for training...
2024-04-14 18:28:21 Downloading - Downloading the training image.....2024-04-14 18:29:21,753 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-04-14 18:29:21,756 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-14 18:29:21,758 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-04-14 18:29:21,775 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-04-14 18:29:21,985 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-14 18:29:21,989 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-04-14 18:29:22,006 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-14 18:29:22,009 sagemaker-training

In [102]:
# Wait on the most recent training job, passing logs="None".
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")


2024-04-14 18:29:43 Starting - Preparing the instances for training
2024-04-14 18:29:43 Downloading - Downloading the training image
2024-04-14 18:29:43 Training - Training image download completed. Training in progress.
2024-04-14 18:29:43 Uploading - Uploading generated training model
2024-04-14 18:29:43 Completed - Training job completed


In [103]:
# Describe the latest training job and pull out the S3 location of its model artifact.
job_name = sklearn_estimator.latest_training_job.name
job_description = Sagemaker.describe_training_job(TrainingJobName=job_name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)

Model artifact persisted at s3://sagemaker-us-east-1-512195023911/RF-custom-sklearn-2024-04-14-18-26-57-928/output/model.tar.gz


In [104]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Model name with a UTC timestamp suffix.
created_at = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = "Custom-sklearn-model-" + created_at

# Wrap the trained artifact as a model object, with script.py as its entry point.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::512195023911:role/sagemaker-role-mobproject",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [106]:
# Show the generated model name.
model_name

'Custom-sklearn-model-2024-04-14-18-46-31'

In [107]:

# Endpoint deployment: the endpoint name carries a UTC timestamp suffix.
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "Custom-sklearn-model-" + deploy_stamp
print("EndpointName={}".format(endpoint_name))

# Deploy the model to one ml.m4.xlarge instance under that endpoint name.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

EndpointName=Custom-sklearn-model-2024-04-14-18-47-15


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-04-14-18-46-31
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-04-14-18-47-15
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-04-14-18-47-15


------!

In [113]:
# Feature values of the test rows at positions 3 and 4, as nested Python lists.
testData[features].iloc[3:5].to_numpy().tolist()

[[1807.0,
  1.0,
  2.1,
  0.0,
  2.0,
  0.0,
  49.0,
  0.8,
  125.0,
  1.0,
  10.0,
  337.0,
  1384.0,
  1906.0,
  17.0,
  13.0,
  13.0,
  0.0,
  1.0,
  1.0],
 [1086.0,
  1.0,
  1.7,
  1.0,
  0.0,
  1.0,
  43.0,
  0.2,
  111.0,
  6.0,
  1.0,
  56.0,
  1150.0,
  3285.0,
  11.0,
  5.0,
  17.0,
  1.0,
  1.0,
  0.0]]

In [114]:
# Send two test rows to the deployed endpoint and print what it returns.
sample_rows = testData[features][3:5].values.tolist()
predictions = predictor.predict(sample_rows)
print(predictions)

[1 3]


In [116]:
# Delete the endpoint by name; the API response is shown as the cell output.
# NOTE(review): only delete_endpoint is called here — the endpoint config and the
# model created during deploy are presumably left in place; confirm and remove
# them if they are no longer wanted.
delete_response = Sagemaker.delete_endpoint(EndpointName=endpoint_name)
delete_response

{'ResponseMetadata': {'RequestId': '6044b8f2-68db-4bb3-8810-8fd72c9cf762',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6044b8f2-68db-4bb3-8810-8fd72c9cf762',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Sun, 14 Apr 2024 18:58:00 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}