In [140]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'mlprojects-raju'
print("Using bucket " + bucket)

Using bucket mlprojects-raju


In [141]:
df = pd.read_csv("s3://mlprojects-raju/airlinesdelay/Airlines.csv", nrows = 1000)

In [142]:
df.columns

Index(['id', 'Airline', 'Flight', 'AirportFrom', 'AirportTo', 'DayOfWeek',
       'Time', 'Length', 'Delay', 'filght_time'],
      dtype='object')

In [143]:
df.shape

(1000, 10)

In [144]:
df.describe()

Unnamed: 0,id,Flight,DayOfWeek,Time,Length,Delay
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,2775.792,3.0,355.555,122.701,0.21
std,288.819436,2210.544811,0.0,51.89283,55.749885,0.407512
min,1.0,3.0,3.0,15.0,32.0,0.0
25%,250.75,765.75,3.0,360.0,82.0,0.0
50%,500.5,2204.0,3.0,360.0,113.0,0.0
75%,750.25,4383.5,3.0,375.0,150.0,0.0
max,1000.0,7799.0,3.0,390.0,410.0,1.0


In [145]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           1000 non-null   int64 
 1   Airline      1000 non-null   object
 2   Flight       1000 non-null   int64 
 3   AirportFrom  1000 non-null   object
 4   AirportTo    1000 non-null   object
 5   DayOfWeek    1000 non-null   int64 
 6   Time         1000 non-null   int64 
 7   Length       1000 non-null   int64 
 8   Delay        1000 non-null   int64 
 9   filght_time  1000 non-null   object
dtypes: int64(6), object(4)
memory usage: 78.2+ KB


In [146]:
df.drop(columns=['id', 'filght_time'], inplace=True)

In [147]:
df.columns

Index(['Airline', 'Flight', 'AirportFrom', 'AirportTo', 'DayOfWeek', 'Time',
       'Length', 'Delay'],
      dtype='object')

In [148]:
cat_cols = ['Airline', 'AirportFrom', 'AirportTo']
for col in cat_cols:
    df[col + '_encoded'], _ = pd.factorize(df[col])
df.drop(columns=cat_cols, inplace=True)

In [149]:
df.head()

Unnamed: 0,Flight,DayOfWeek,Time,Length,Delay,Airline_encoded,AirportFrom_encoded,AirportTo_encoded
0,269,3,15,205,1,0,0,0
1,1558,3,15,222,1,1,1,1
2,2400,3,20,165,1,2,2,2
3,2466,3,20,195,1,2,0,2
4,108,3,30,202,0,3,3,3


In [150]:
features = list(df.columns)

In [152]:
features

['Flight',
 'DayOfWeek',
 'Time',
 'Length',
 'Delay',
 'Airline_encoded',
 'AirportFrom_encoded',
 'AirportTo_encoded']

In [153]:
label = features.pop(-1)

In [154]:
label 

'AirportTo_encoded'

In [155]:
x = df[features]
y = df[label]

In [156]:
x

Unnamed: 0,Flight,DayOfWeek,Time,Length,Delay,Airline_encoded,AirportFrom_encoded
0,269,3,15,205,1,0,0
1,1558,3,15,222,1,1,1
2,2400,3,20,165,1,2,2
3,2466,3,20,195,1,2,0
4,108,3,30,202,0,3,3
...,...,...,...,...,...,...,...
995,1182,3,390,175,0,2,2
996,1637,3,390,185,0,2,22
997,201,3,390,385,0,2,82
998,2401,3,390,205,0,2,71


In [157]:
y

0       0
1       1
2       2
3       2
4       3
       ..
995    62
996     6
997    15
998    15
999     6
Name: AirportTo_encoded, Length: 1000, dtype: int64

In [158]:
x.shape

(1000, 7)

In [159]:
y.shape

(1000,)

In [160]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


(800, 7)
(200, 7)
(800,)
(200,)


In [161]:
trainX = pd.DataFrame(x_train)
trainX[label] = y_train
testX = pd.DataFrame(x_test)
testX[label] = y_test

In [162]:
print(trainX.shape)

(800, 8)


In [163]:
print(trainX)

     Flight  DayOfWeek  Time  Length  Delay  Airline_encoded  \
29     4746          3   300      85      0                7   
535     675          3   362     113      0               13   
695    2323          3   370     102      0               11   
557     705          3   365     250      0                2   
836    1321          3   380      89      0                5   
..      ...        ...   ...     ...    ...              ...   
106     684          3   340     134      0                4   
270    1297          3   360     168      0                4   
860    4554          3   380      88      0                7   
435     337          3   360     137      0               13   
102     214          3   340      59      1                0   

     AirportFrom_encoded  AirportTo_encoded  
29                    12                  4  
535                  101                  6  
695                   34                 51  
557                   83                  2  
8

In [164]:
trainX.to_csv("train-v-1.csv", index=False)
testX.to_csv("test-v-1.csv", index=False)


In [165]:
sk_prefix = "airlinesdelay"
trainpath = sess.upload_data(
    path="train-v-1.csv",bucket=bucket, key_prefix=sk_prefix       
)
testpath = sess.upload_data(
    path="test-v-1.csv",bucket=bucket, key_prefix=sk_prefix   
)

In [166]:
print(trainpath)


s3://mlprojects-raju/airlinesdelay/train-v-1.csv


In [168]:
%%writefile script.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import pandas as pd
import numpy as np

def model_fn(model_dir):
    # Correctly join the path and load the model.
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__ == "__main__":
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()
    
    # Hyperparameters sent as command line arguments
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data directories and file names
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "./model"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "."))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST", "."))
    parser.add_argument("--train-file", type=str, default="train-v-1.csv")
    parser.add_argument("--test-file", type=str, default="test-v-1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKlearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)
    
    print("[INFO] Reading data")
    
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print("Column order: ")
    print(features)
    print("Label column is: ", label)
    
    print("Data Shape: ")
    print("------- SHAPE OF TRAINING DATA -------")
    print(train_df[features].shape, train_df[label].shape)
    print("------- SHAPE OF TESTING DATA -------")
    print(test_df[features].shape, test_df[label].shape)
    
    print("Training RandomForest Model ...")
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(train_df[features], train_df[label])
    
    # Ensure the model directory exists before saving the model
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir, exist_ok=True)
        
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model saved at: " + model_path)
    
    y_pred_test = model.predict(test_df[features])
    test_acc = accuracy_score(test_df[label], y_pred_test)
    test_rep = classification_report(test_df[label], y_pred_test)
    
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print("Total Rows:", test_df.shape[0])
    print("[TESTING] Model Accuracy:", test_acc)
    print("[TESTING] Testing Report:")
    print(test_rep)

Overwriting script.py


In [169]:
from sagemaker.sklearn import SKLearn

FRAMEWORK_VERSION = "0.23-1"
role = "arn:aws:iam::225989361602:role/service-role/AmazonSageMaker-ExecutionRole-20250406T095913"
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0
    },
    use_spot_instances=True,
    max_wait=3600,
    max_run=3600
)

In [170]:
# launch training job, with asynchnous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

2025-04-15 17:49:26 Starting - Starting the training job...
2025-04-15 17:49:39 Starting - Preparing the instances for training...
2025-04-15 17:50:32 Downloading - Downloading the training image......
2025-04-15 17:51:23 Training - Training image download completed. Training in progress.
2025-04-15 17:51:23 Uploading - Uploading generated training model2025-04-15 17:51:16,788 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-04-15 17:51:16,792 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 17:51:16,837 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-04-15 17:51:17,009 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 17:51:17,022 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 17:51:17,034 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-

In [171]:
sklearn_estimator.latest_training_job.wait(logs=True)
artifacts = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifacts persisted at : " + artifacts)

2025-04-15 17:51:35 Starting - Preparing the instances for training
2025-04-15 17:51:35 Downloading - Downloading the training image
2025-04-15 17:51:35 Training - Training image download completed. Training in progress.
2025-04-15 17:51:35 Uploading - Uploading generated training model
2025-04-15 17:51:35 Completed - Training job completed2025-04-15 17:51:16,788 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-04-15 17:51:16,792 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 17:51:16,837 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-04-15 17:51:17,009 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 17:51:17,022 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 17:51:17,034 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-15 17:51:17

In [172]:
artifacts

's3://sagemaker-us-east-1-225989361602/RF-custom-sklearn-2025-04-15-17-49-23-858/output/model.tar.gz'

In [173]:
from sagemaker.sklearn import SKLearnModel
from time import gmtime, strftime

model_name = "Custome-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name = model_name,
    model_data = artifacts,
    role = "arn:aws:iam::225989361602:role/service-role/AmazonSageMaker-ExecutionRole-20250406T095913",
    entry_point = "script.py",
    framework_version = FRAMEWORK_VERSION,
)

In [174]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x3001810c0>

In [175]:
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    endpoint_name=endpoint_name,
    
)

EndpointName=Custom-sklearn-model-2025-04-15-17-53-16


-----!

In [176]:
endpoint_name

'Custom-sklearn-model-2025-04-15-17-53-16'

In [177]:
testX[features][0:2].values.tolist()

[[7218, 3, 360, 100, 0, 12, 54], [2578, 3, 375, 120, 0, 4, 205]]

In [178]:
print(predictor.predict(testX[features][0:2].values.tolist()))

[ 6 10]


In [179]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'de0ffa38-2b18-4cd0-9b54-bbc72cc13f88',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'de0ffa38-2b18-4cd0-9b54-bbc72cc13f88',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Tue, 15 Apr 2025 17:57:46 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}