In [2]:
import sagemaker
import boto3
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'sagunprojectbucket'
print('Using bucket ',bucket)

Using bucket  sagunprojectbucket


In [5]:
df = pd.read_csv('data_to_train/calorie_data_for_model.csv')

In [11]:
df

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.50,8.50,0.0,1.88,0.55,6.06,0.00,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.00,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.40,3.91,0.00,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.00,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.00,36,10,221,773,1863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
864,8877689391,5/5/2016,14055,10.67,10.67,0.0,5.46,0.82,4.37,0.00,67,15,188,1170,3052
865,8877689391,5/6/2016,21727,19.34,19.34,0.0,12.79,0.29,6.16,0.00,96,17,232,1095,4015
866,8877689391,5/7/2016,12332,8.13,8.13,0.0,0.08,0.96,6.99,0.00,105,28,271,1036,4142
867,8877689391,5/8/2016,10686,8.11,8.11,0.0,1.08,0.20,6.80,0.00,17,4,245,1174,2847


In [12]:
df.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863


In [13]:
df.shape

(869, 15)

In [14]:
df.columns

Index(['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance',
       'LoggedActivitiesDistance', 'VeryActiveDistance',
       'ModeratelyActiveDistance', 'LightActiveDistance',
       'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
       'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories'],
      dtype='object')

In [15]:
df.shape

(869, 15)

In [6]:
features = list(df.columns)

In [7]:
label = features.pop(-1)

In [8]:
x = df[features]
y = df[label]

In [9]:
y.shape, x.shape

((869,), (869, 14))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.15,random_state=0)

In [11]:
X_train.shape, y_train.shape, X_test.shape

((738, 14), (738,), (131, 14))

In [12]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

In [13]:
testX = pd.DataFrame(X_test)
testX[label] = y_test

In [22]:
trainX.to_csv("traindata.csv",index=False)
testX.to_csv("testdata.csv",index=False)

In [32]:
sk_prefix = 'sagemaker/calorie_prediction/sklearncontainer'
trainpath = sess.upload_data(path="traindata.csv",bucket=bucket,key_prefix=sk_prefix)
testpath = sess.upload_data(path="testdata.csv",bucket=bucket,key_prefix=sk_prefix)

In [33]:
%%writefile script.py

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="traindata.csv")
    parser.add_argument("--test-file", type=str, default="testdata.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
  
    print("Training RandomForest Model.....")
    print()
    model =  RandomForestRegressor(n_estimators=args.n_estimators, random_state=args.random_state, verbose = 3,n_jobs=-1)
    model.fit(X_train, y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Overwriting script.py


In [39]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::339713046634:role/sagemaker_role",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [40]:
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)


INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-04-17-02-52-32-389


2024-04-17 02:52:38 Starting - Starting the training job...
2024-04-17 02:52:53 Starting - Preparing the instances for training...
2024-04-17 02:53:26 Downloading - Downloading input data...
2024-04-17 02:53:52 Downloading - Downloading the training image...
2024-04-17 02:54:53 Training - Training image download completed. Training in progress.
2024-04-17 02:54:53 Uploading - Uploading generated training model
2024-04-17 02:54:53 Failed - Training job failed
2024-04-17 02:54:36,323 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-04-17 02:54:36,326 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-17 02:54:36,362 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-04-17 02:54:36,523 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-17 02:54:36,535 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-

UnexpectedStatusException: Error for Training job RF-custom-sklearn-2024-04-17-02-52-32-389: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 291, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 208, in check_error
    info=extra_info,
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/miniconda3/bin/python script.py --n_estimators 100 --r