#### What We Will Learn

1. S3 Buckets with Boto3
2. IAM Roles and Users
3. Complete AWS SageMaker Infrastructure: Training and Endpoints

In [None]:
# Setup imports: sagemaker for training/deployment, boto3 for low-level AWS, pandas for data, os for env
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd
import os

# Low-level SageMaker API client (used later to describe training jobs and delete endpoints)
sm_boto3 = boto3.client("sagemaker")

# Build the high-level SageMaker session. If `sagemaker.Session()` cannot be
# created, fall back to the Session class from the `sagemaker.core` module layout.
# NOTE(review): which SDK releases need this fallback is not visible here — confirm.
try:
    sess = sagemaker.Session()
except Exception as session_error:
    try:
        from sagemaker.core.helper.session_helper import Session as SageMakerSession
    except ImportError:
        # No alternative Session class is installed, so the first failure is the
        # real problem (for example credentials or region configuration).
        # Surface it instead of a misleading ImportError for the fallback module.
        raise session_error from None
    sess = SageMakerSession()

# Region is read from the session's boto3 configuration; the bucket name is fixed for this walkthrough
region = sess.boto_session.region_name
bucket = "mobbucketsagemakerv1"

# `SAGEMAKER_INSTANCE_TYPE` selects where training runs; it defaults to "local"
instance_type = os.environ.get("SAGEMAKER_INSTANCE_TYPE", "local")
print("Using bucket " + bucket)

In [None]:
# Print the AWS region determined from the SageMaker session
print(region)

In [None]:
# Load the raw dataset into a DataFrame and show the first rows for inspection
df = pd.read_csv("mob_price_classification_train.csv")
df.head()

In [None]:
# Display the shape of the dataset (rows, columns)
df.shape

In [None]:
# Check for missing values in each column to detect nulls or data issues
df.isnull().sum()

In [None]:
# Inspect class distribution for the target column 'price_range'
df['price_range'].value_counts()

In [None]:
# Collect every column name of the DataFrame. At this point the list still
# contains the target column; it is popped off in the following cell.
features = list(df.columns)
features

In [None]:
# Assume the last column is the label/target and remove it from the features list
label = features.pop(-1)
label

In [None]:
# Verify remaining feature column names
features

In [None]:
# Split dataset into feature matrix X and target vector y
x = df[features]
y = df[label]

In [None]:
# Create train/test split (85% train, 15% test) for local experimentation
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=0)

In [None]:
# Print shapes of the train and test splits to confirm sizes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Build the frames that are written to CSV: the features of each split with the
# target appended as the last column (the training script treats the last
# column as the label).
# An explicit copy is taken first because `pd.DataFrame(existing_frame)` may
# share its underlying data with the frame it wraps; without the copy, adding
# the label column could leak back into `X_train` / `X_test`.
trainX = pd.DataFrame(X_train).copy()
trainX[label] = y_train

testX = pd.DataFrame(X_test).copy()
testX[label] = y_test

In [None]:
# Display an example of the training DataFrame (features + label)
trainX

In [None]:
# Save the training and testing CSVs used by the training job or local runs
trainX.to_csv("train-V-1.csv", index=False)
testX.to_csv("test-V-1.csv", index=False)

In [None]:
# Show the bucket variable defined earlier (for verification)
bucket

In [None]:
# Decide where the estimator reads its CSVs from: file:// URIs in local mode,
# otherwise the URIs obtained by uploading both files under `sk_prefix`.
sk_prefix = "sagemaker/sklearn-mob-price-classification/sklearncontainer"

if str(instance_type).startswith("local"):
    # Local mode: nothing is uploaded, the CSVs are referenced in place
    trainpath, testpath = "file://train-V-1.csv", "file://test-V-1.csv"
    print("Local mode: using", trainpath, testpath)
else:
    # AWS mode: push both CSVs to the bucket and keep the URIs that upload_data returns
    trainpath = sess.upload_data(path="train-V-1.csv", bucket=bucket, key_prefix=sk_prefix)
    testpath = sess.upload_data(path="test-V-1.csv", bucket=bucket, key_prefix=sk_prefix)
    print(trainpath)
    print(testpath)

#### Script Used by AWS SageMaker to Train Models

In [None]:
%%writefile script.py
"""
Training script for a RandomForest classifier used with SageMaker/local runs.

This module trains a RandomForest model using CSV train/test files, saves
the trained model to `model.joblib` under the provided model directory, and
prints basic evaluation metrics on the test set.

The script expects the following (can be provided via SageMaker env vars):
- `SM_MODEL_DIR` -> model output directory
- `SM_CHANNEL_TRAIN` -> path to training data channel
- `SM_CHANNEL_TEST` -> path to testing data channel

Run as a script for local testing, or used by SageMaker during training.
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import argparse
import os
import numpy as np
import pandas as pd


def model_fn(model_dir):
    """Deserialize the trained estimator for inference.

    Implements the SageMaker inference hook of the same name: the serving
    runtime calls ``model_fn`` to obtain the model object.

    Args:
        model_dir (str): Directory that contains ``model.joblib``.

    Returns:
        sklearn estimator: The object loaded from ``model.joblib``.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)


if __name__ == "__main__":
    # Collect hyperparameters and data/model locations from the command line
    print("[Info] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters for the RandomForest model
    parser.add_argument("--n_estimators", type=int, default=100, help="Number of trees in the forest")
    parser.add_argument("--random_state", type=int, default=0, help="Random seed for reproducibility")

    # Locations: each defaults to the matching SageMaker-style environment variable
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"), help="Model output directory")
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"), help="Training data channel path")
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"), help="Testing data channel path")
    parser.add_argument("--train-file", type=str, default="train-V-1.csv", help="Training CSV file name")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv", help="Testing CSV file name")

    # parse_known_args tolerates extra, unrecognised arguments instead of exiting
    args, _ = parser.parse_known_args()

    # Fail fast with a readable message when a location is missing. If the SM_*
    # variable is unset and the flag is not given, the argument is None, which
    # would otherwise surface later as an opaque TypeError inside os.path.join.
    missing = [
        flag
        for flag, value in (
            ("--model-dir", args.model_dir),
            ("--train", args.train),
            ("--test", args.test),
        )
        if not value
    ]
    if missing:
        parser.error(
            "missing required location(s): " + ", ".join(missing)
            + " (pass them explicitly or set SM_MODEL_DIR / SM_CHANNEL_TRAIN / SM_CHANNEL_TEST)"
        )

    # Report versions for reproducibility/debugging
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    # Read CSV data from the provided channel directories
    print("[INFO] Reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training CSV is taken as the label; all preceding
    # columns are features. The same column names are then selected from the test CSV.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    # Print dataset summary information
    print('Column order: ')
    print(features)
    print()
    print("Label column is: ", label)
    print()

    print("Data Shape: ")
    print("---- SHAPE OF TRAINING DATA (rows, cols) ----")
    print(X_train.shape)
    print(y_train.shape)
    print("---- SHAPE OF TESTING DATA (rows, cols) ----")
    print(X_test.shape)
    print(y_test.shape)

    # Initialize and train the RandomForest model
    print("Training RandomForest Model ....")
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state,
                                   verbose=2, n_jobs=1)

    model.fit(X_train, y_train)

    # Save the trained model under the file name that model_fn loads.
    # The directory is created if needed so that joblib.dump cannot fail on a missing path.
    os.makedirs(args.model_dir, exist_ok=True)
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model saved at " + model_path)

    # Evaluate on the test set and print metrics
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

### AWS SageMaker Entry Point to Execute the Training Script

In [None]:
# Configure an SKLearn estimator that will use `script.py` as the entry point
from sagemaker.sklearn.estimator import SKLearn
import os

FRAMEWORK_VERSION = "0.23-1"

# Local mode is the default for development/testing; set the env var
# SAGEMAKER_INSTANCE_TYPE to run the training job on AWS instead.
instance_type = os.environ.get("SAGEMAKER_INSTANCE_TYPE", "local")

# Managed spot training is only requested for AWS runs; it is not supported in local mode
use_spot = not instance_type.startswith("local")

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::411715192815:role/sagemakeraccess",
    instance_count=1,
    instance_type=instance_type,
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    # The estimator keyword is `use_spot_instances` (plural); the singular
    # spelling is not a recognised parameter, so spot training was never enabled.
    use_spot_instances=use_spot,
    max_run=3600,
    # Spot jobs additionally need `max_wait` (at least `max_run`); it is left
    # unset for non-spot runs.
    max_wait=(7200 if use_spot else None),
)


In [None]:
# Pick the channel inputs from the estimator's own instance type: literal
# file:// URIs for local mode, the prepared `trainpath` / `testpath` otherwise.
fit_inputs = (
    {"train": "file://train-V-1.csv", "test": "file://test-V-1.csv"}
    if sklearn_estimator.instance_type and str(sklearn_estimator.instance_type).startswith("local")
    else {"train": trainpath, "test": testpath}
)

# Run the training job; wait=True blocks until it has finished
sklearn_estimator.fit(fit_inputs, wait=True)


### To get the model from S3

In [None]:
# Resolve `artifact`, the S3 URI of the trained model. For AWS runs it is read
# from the training job description; for local training the model file is
# located on disk, packaged as model.tar.gz and uploaded to the bucket.
sklearn_estimator.latest_training_job.wait(logs="None")

if str(instance_type).startswith("local"):
    import glob
    import os
    import tarfile
    import tempfile

    import boto3

    # Look for the model written during local training: first below the
    # working directory, then below the system temp directory
    matches = glob.glob("**/model.joblib", recursive=True)
    if not matches:
        matches = glob.glob(os.path.join(tempfile.gettempdir(), "**/model.joblib"), recursive=True)
    if not matches:
        raise FileNotFoundError("model.joblib not found after local training")
    # Earlier runs can leave other model.joblib files behind and glob order is
    # arbitrary, so take the most recently modified file rather than the first hit
    model_path = max(matches, key=os.path.getmtime)

    # Package the model into a tar.gz with model.joblib at the archive root
    tar_path = os.path.join(os.getcwd(), "model.tar.gz")
    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(model_path, arcname="model.joblib")

    # Upload to S3 so the archive can be used as model data for deployment
    s3 = boto3.client("s3")
    # `sk_prefix` is normally set by the data-preparation cell; recreate it if that cell was not run
    try:
        sk_prefix
    except NameError:
        sk_prefix = "sagemaker/sklearn-mob-price-classification/sklearncontainer"
    s3_key = f"{sk_prefix}/model.tar.gz"
    s3.upload_file(tar_path, bucket, s3_key)
    artifact = f"s3://{bucket}/{s3_key}"
    print("Uploaded local model to", artifact)
else:
    # AWS run: the training job description carries the artifact location
    artifact = sm_boto3.describe_training_job(
        TrainingJobName=sklearn_estimator.latest_training_job.name
    )["ModelArtifacts"]["S3ModelArtifacts"]

# The variable `artifact` contains the S3 URI for the trained model
artifact


In [None]:
# Display the artifact S3 URI pointing to the trained model
artifact

### Deploy the Model to an Endpoint

In [None]:
# Create an SKLearnModel object for deployment using the trained artifact
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Model name carries a UTC timestamp suffix (YYYY-MM-DD-HH-MM-SS)
model_name = f"Custom-sklearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Wrap the trained artifact; script.py is the entry point that defines model_fn
model = SKLearnModel(
    model_data=artifact,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::411715192815:role/sagemakeraccess",
    name=model_name,
)


In [None]:
# Inspect the model object prepared for deployment
model

In [None]:
# Deploy the model behind a real-time endpoint with a timestamped name.
# The instance type is fixed to ml.m4.xlarge here, independent of SAGEMAKER_INSTANCE_TYPE.
endpoint_name = f"Custom-sklearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
print(f"EndpointName={endpoint_name}")

predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)


In [None]:
# Show the predictor object which can be used to make real-time inferences
predictor

In [None]:
# Show first two rows of features from test set to prepare sample payload for prediction
testX[features][0:2]

In [None]:
# Make predictions using the deployed endpoint for two sample rows from the test set
print(predictor.predict(testX[features][:2].values.tolist()))

In [None]:
# Delete the deployed endpoint to avoid incurring charges in AWS
sm_boto3.delete_endpoint(EndpointName=endpoint_name)