In [81]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'mlprojects-raju'
print("Using bucket " + bucket)

Using bucket mlprojects-raju


In [None]:
df = pd.read_csv("s3://mlprojects-raju/airlinesdelay/Airlines.csv", nrows = 1000)

In [83]:
df.columns

Index(['id', 'Airline', 'Flight', 'AirportFrom', 'AirportTo', 'DayOfWeek',
       'Time', 'Length', 'Delay', 'filght_time'],
      dtype='object')

In [84]:
df.shape

(539383, 10)

In [85]:
df.describe()

Unnamed: 0,id,Flight,DayOfWeek,Time,Length,Delay
count,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0
mean,269692.0,2427.92863,3.929668,802.728963,132.202007,0.445442
std,155706.604461,2067.429837,1.914664,278.045911,70.117016,0.497015
min,1.0,1.0,1.0,10.0,0.0,0.0
25%,134846.5,712.0,2.0,565.0,81.0,0.0
50%,269692.0,1809.0,4.0,795.0,115.0,0.0
75%,404537.5,3745.0,5.0,1035.0,162.0,1.0
max,539383.0,7814.0,7.0,1439.0,655.0,1.0


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539383 entries, 0 to 539382
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           539383 non-null  int64 
 1   Airline      539383 non-null  object
 2   Flight       539383 non-null  int64 
 3   AirportFrom  539383 non-null  object
 4   AirportTo    539383 non-null  object
 5   DayOfWeek    539383 non-null  int64 
 6   Time         539383 non-null  int64 
 7   Length       539383 non-null  int64 
 8   Delay        539383 non-null  int64 
 9   filght_time  539383 non-null  object
dtypes: int64(6), object(4)
memory usage: 41.2+ MB


In [87]:
df.drop(columns=['id', 'filght_time'], inplace=True)

In [88]:
df.columns

Index(['Airline', 'Flight', 'AirportFrom', 'AirportTo', 'DayOfWeek', 'Time',
       'Length', 'Delay'],
      dtype='object')

In [99]:
cat_cols = ['Airline', 'AirportFrom', 'AirportTo']
for col in cat_cols:
    df[col + '_encoded'], _ = pd.factorize(df[col])
df.drop(columns=cat_cols, inplace=True)

In [102]:
df.head()

Unnamed: 0,Flight,DayOfWeek,Time,Length,Delay,Airline_encoded,AirportFrom_encoded,AirportTo_encoded
0,269,3,15,205,1,0,0,0
1,1558,3,15,222,1,1,1,1
2,2400,3,20,165,1,2,2,2
3,2466,3,20,195,1,2,0,2
4,108,3,30,202,0,3,3,3


In [103]:
features = list(df.columns)

In [104]:
features

['Flight',
 'DayOfWeek',
 'Time',
 'Length',
 'Delay',
 'Airline_encoded',
 'AirportFrom_encoded',
 'AirportTo_encoded']

In [105]:
label = features.pop(-1)

In [106]:
label 

'AirportTo_encoded'

In [107]:
x = df[features]
y = df[label]

In [108]:
x

Unnamed: 0,Flight,DayOfWeek,Time,Length,Delay,Airline_encoded,AirportFrom_encoded
0,269,3,15,205,1,0,0
1,1558,3,15,222,1,1,1
2,2400,3,20,165,1,2,2
3,2466,3,20,195,1,2,0
4,108,3,30,202,0,3,3
...,...,...,...,...,...,...,...
539378,178,5,1439,326,0,0,193
539379,398,5,1439,305,0,15,21
539380,609,5,1439,255,0,15,0
539381,78,5,1439,313,1,13,11


In [109]:
y

0          0
1          1
2          2
3          2
4          3
          ..
539378    63
539379     7
539380    30
539381    18
539382    22
Name: AirportTo_encoded, Length: 539383, dtype: int64

In [111]:
x.shape

(539383, 7)

In [112]:
y.shape

(539383,)

In [113]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


(431506, 7)
(107877, 7)
(431506,)
(107877,)


In [114]:
trainX = pd.DataFrame(x_train)
trainX[label] = y_train
testX = pd.DataFrame(x_test)
testX[label] = y_test

In [115]:
print(trainX.shape)

(431506, 8)


In [116]:
print(trainX)

        Flight  DayOfWeek  Time  Length  Delay  Airline_encoded  \
498081     796          3   900     143      0               16   
203828    3586          7  1185      80      1               17   
156549     705          5   365     250      1                2   
66053     2825          6  1090     207      0                4   
165376    1102          5   818     150      1               15   
...        ...        ...   ...     ...    ...              ...   
110268      20          2   715     403      0                1   
259178    6401          4   375     125      1                7   
365838    6594          3   360     160      0                9   
131932     938          3   955     144      0               13   
121958    2952          3   420     159      1                4   

        AirportFrom_encoded  AirportTo_encoded  
498081                   55                 24  
203828                  205                 64  
156549                   83                  2  

In [117]:
trainX.to_csv("train-v-1.csv", index=False)
testX.to_csv("test-v-1.csv", index=False)


In [118]:
sk_prefix = "airlinesdelay"
trainpath = sess.upload_data(
    path="train-v-1.csv",bucket=bucket, key_prefix=sk_prefix       
)
testpath = sess.upload_data(
    path="test-v-1.csv",bucket=bucket, key_prefix=sk_prefix   
)

In [119]:
print(trainpath)


s3://mlprojects-raju/airlinesdelay/train-v-1.csv


In [125]:
%%writefile script.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import pandas as pd
import numpy as np

def model_fn(model_dir):
    # Correctly join the path and load the model.
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__ == "__main__":
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()
    
    # Hyperparameters sent as command line arguments
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data directories and file names
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "./model"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "."))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST", "."))
    parser.add_argument("--train-file", type=str, default="train-v-1.csv")
    parser.add_argument("--test-file", type=str, default="test-v-1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKlearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)
    
    print("[INFO] Reading data")
    
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print("Column order: ")
    print(features)
    print("Label column is: ", label)
    
    print("Data Shape: ")
    print("------- SHAPE OF TRAINING DATA -------")
    print(train_df[features].shape, train_df[label].shape)
    print("------- SHAPE OF TESTING DATA -------")
    print(test_df[features].shape, test_df[label].shape)
    
    print("Training RandomForest Model ...")
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(train_df[features], train_df[label])
    
    # Ensure the model directory exists before saving the model
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir, exist_ok=True)
        
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model saved at: " + model_path)
    
    y_pred_test = model.predict(test_df[features])
    test_acc = accuracy_score(test_df[label], y_pred_test)
    test_rep = classification_report(test_df[label], y_pred_test)
    
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print("Total Rows:", test_df.shape[0])
    print("[TESTING] Model Accuracy:", test_acc)
    print("[TESTING] Testing Report:")
    print(test_rep)

Overwriting script.py


In [130]:
from sagemaker.sklearn import SKLearn

FRAMEWORK_VERSION = "0.23-1"
role = "arn:aws:iam::225989361602:role/service-role/AmazonSageMaker-ExecutionRole-20250406T095913"
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=role,
    instance_count=1,
    instance_type="ml.m5.4xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0
    },
    use_spot_instances=True,
    max_wait=3600,
    max_run=3600
)

In [131]:
# launch training job, with asynchnous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

2025-04-15 15:01:33 Starting - Starting the training job...
2025-04-15 15:01:40 Starting - Insufficient capacity error from EC2 while launching instances, retrying!.........................................................
2025-04-15 15:11:52 Starting - Preparing the instances for training...
2025-04-15 15:12:31 Downloading - Downloading the training image...
2025-04-15 15:12:56 Training - Training image download completed. Training in progress....2025-04-15 15:13:29,303 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-04-15 15:13:29,306 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 15:13:29,345 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-04-15 15:13:29,540 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 15:13:29,553 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 15:13:

In [132]:
sklearn_estimator.latest_training_job.wait(logs=True)
artifacts = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifacts persisted at : " + artifacts)

2025-04-15 15:20:31 Starting - Preparing the instances for training
2025-04-15 15:20:31 Downloading - Downloading the training image
2025-04-15 15:20:31 Training - Training image download completed. Training in progress.
2025-04-15 15:20:31 Uploading - Uploading generated training model
2025-04-15 15:20:31 Completed - Training job completed2025-04-15 15:13:29,303 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-04-15 15:13:29,306 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 15:13:29,345 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-04-15 15:13:29,540 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 15:13:29,553 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 15:13:29,564 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 15:13:29

In [133]:
artifacts

's3://sagemaker-us-east-1-225989361602/RF-custom-sklearn-2025-04-15-15-01-28-413/output/model.tar.gz'

In [134]:
from sagemaker.sklearn import SKLearnModel
from time import gmtime, strftime

model_name = "Custome-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name = model_name,
    model_data = artifacts,
    role = "arn:aws:iam::225989361602:role/service-role/AmazonSageMaker-ExecutionRole-20250406T095913",
    entry_point = "script.py",
    framework_version = FRAMEWORK_VERSION,
)

In [135]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x17f3e9d80>

In [137]:
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    endpoint_name=endpoint_name,
    
)

EndpointName=Custom-sklearn-model-2025-04-15-15-33-04


----------------------------------------------------------*

In [None]:
endpoint_name

In [None]:
testX[features][0:2].values.tolist()

In [None]:
print(predictor.predict(testX[features][0:2].values.tolist()))

In [138]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)