In [1]:
import os

import boto3
import pandas as pd
import sagemaker
from sklearn.model_selection import train_test_split

# boto3 client for the 'sagemaker' service and a SageMaker SDK session.
sm_boto3 = boto3.client('sagemaker')
sess = sagemaker.Session()

# Region is taken from the session's underlying boto session.
region = sess.boto_session.region_name

# S3 bucket that receives the train/test CSV files.
bucket = "mymobpricebucket"
print(f'Using bucket {bucket}')

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\Sayantan Datta\AppData\Local\sagemaker\sagemaker\config.yaml
Using bucket mymobpricebucket


In [2]:
# Location of the raw training CSV. Set the MOBILE_PRICE_TRAIN_CSV environment
# variable to point at the file on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
train_csv_path = os.environ.get(
    "MOBILE_PRICE_TRAIN_CSV",
    r"C:\Users\Sayantan Datta\Desktop\Personal Projects\AWS Sagemaker\train.csv",
)
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(2000, 21)

In [4]:
# Number of rows per price_range value.
df['price_range'].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [7]:
# All column names of the frame, in their original order.
features = list(df.columns)
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [8]:
# The last column is the target; every other column is a feature.
# Both names are derived from df.columns rather than popping from the existing
# `features` list, so re-running this cell always gives the same result
# (an in-place pop would remove one more column on every re-run).
label = df.columns[-1]
features = df.columns[:-1].tolist()
label

'price_range'

In [9]:
# Feature matrix and target column selected from the full frame.
x, y = df[features], df[label]

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Re-attach the label as the last column of each split so every CSV is
# self-contained. `assign` returns a new frame, so x_train / x_test are never
# modified. Wrapping them with pd.DataFrame(...) and then setting a column does
# not guarantee an independent copy and may raise SettingWithCopyWarning.
trainX = x_train.assign(**{label: y_train})

testX = x_test.assign(**{label: y_test})

In [13]:
# Missing-value count per column in the training frame.
trainX.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [14]:
# Missing-value count per column in the test frame.
testX.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [15]:
# Write both frames to the working directory without the index column.
trainX.to_csv(path_or_buf='trainV1.csv', index=False)
testX.to_csv(path_or_buf='testV1.csv', index=False)

In [16]:
# Upload the train and test CSVs to the bucket under a shared key prefix.
sk_prefix = 'sagemaker/mobile_price_classification/sklearncontainer'

trainpath = sess.upload_data(path='trainV1.csv', bucket=bucket, key_prefix=sk_prefix)

testpath = sess.upload_data(path='testV1.csv', bucket=bucket, key_prefix=sk_prefix)

In [17]:
# Show the values returned by the two uploads, one per line.
print(trainpath, testpath, sep='\n')

s3://mymobpricebucket/sagemaker/mobile_price_classification/sklearncontainer/trainV1.csv
s3://mymobpricebucket/sagemaker/mobile_price_classification/sklearncontainer/testV1.csv


In [18]:
#File to be used by sagemaker
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import pandas as pd
import os
import argparse
import numpy as np

def model_fn(model_dir):
    """Load the object stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        Whatever ``joblib.load`` deserializes from that file.
    """
    model_file = os.path.join(model_dir, 'model.joblib')
    return joblib.load(model_file)

if __name__ == '__main__':
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # RandomForestClassifier, save it as model.joblib and print test metrics.
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters forwarded to RandomForestClassifier.
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # Directories default to the SM_* environment variables
    # (presumably set by the SageMaker training container - confirm).
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train_file', type=str, default='trainV1.csv')
    parser.add_argument('--test_file', type=str, default='testV1.csv')

    # parse_known_args ignores any extra arguments instead of failing on them.
    args, _ = parser.parse_known_args()

    print("[INFO] Reading data")
    print()

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training CSV is the label; the rest are features.
    features = train_df.columns.tolist()
    label = features.pop(-1)

    print("Building training and testing datasets")
    print()

    x_train = train_df[features]
    x_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print("column order: ")
    print(features)
    print()

    print("Label column: ")
    print(label)
    print()

    print("Data Shape:")
    print()
    print("---TRAINING DATA 80%---")
    print(x_train.shape)
    print(y_train.shape)
    print()
    print("---TEST DATA 20%---")
    print(x_test.shape)
    print(y_test.shape)

    print("Training Random Forest Classifier")
    print()

    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(x_train, y_train)
    print()

    # Saved under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, 'model.joblib')
    joblib.dump(model, model_path)
    # Fixed: the placeholder is now actually formatted with the path.
    print("Model saved at: {}".format(model_path))

    y_pred_test = model.predict(x_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---METRICS RESULTS FOR TEST DATA---")
    print()
    print("Total rows are: ", x_test.shape[0])
    print("Accuracy: ", test_acc)
    print("Testing Report: ")
    print(test_rep)

Writing script.py
