In [12]:
!pip install sagemaker
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd
# A single boto3 session backs both the SageMaker SDK session and the low-level client.
session = boto3.Session()
sess = sagemaker.Session(boto_session=session)
sm_boto3 = session.client("sagemaker")
region = sess.boto_session.region_name

# Name of the S3 bucket that holds the training data for this notebook.
bucket = 'ec2-utilization-sagemaker-model'
print("Using bucket " + bucket)

# Resolve the session's credentials and take a frozen snapshot of them.
credentials = sess.boto_session.get_credentials()
frozen_credentials = credentials.get_frozen_credentials()





Using bucket ec2-utilization-sagemaker-model


In [2]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Read the generated records and hold them in a DataFrame.
data = pd.read_json('generated_records.json')
df = pd.DataFrame(data)

# Expand every 'Metrics' entry into its own columns, append those columns to the
# frame, and drop the original nested 'Metrics' column in the same step.
metrics_df = pd.DataFrame(df['Metrics'].tolist())
df = pd.concat([df, metrics_df], axis=1).drop('Metrics', axis=1)

# Fill missing values in the selected metric columns with a 2-nearest-neighbour imputer.
imputed_columns = ['CPUUtilization', 'NetworkIn', 'NetworkOut']
imputer = KNNImputer(n_neighbors=2)
df[imputed_columns] = imputer.fit_transform(df[imputed_columns])

# No feature scaling or label encoding is applied in this cell; the features are
# written out unscaled and the labels as normalised strings.
feature_columns = ['CPUUtilization', 'DiskReadOps', 'DiskWriteOps', 'NetworkIn', 'NetworkOut']
X = df[feature_columns]

# Normalise the target labels: cast to str, trim whitespace, lower-case.
y = [str(label).strip().lower() for label in df['InstanceType']]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("✅ y_train unique labels:", set(y_train))
print("✅ y_test unique labels:", set(y_test))

# Persist each split as a local CSV (labels are plain lists, so wrap them in a DataFrame).
for frame, filename in (
    (X_train, "X_train-V-1.csv"),
    (X_test, "X_test-V-1.csv"),
    (pd.DataFrame(y_train), "y_train-V-1.csv"),
    (pd.DataFrame(y_test), "y_test-V-1.csv"),
):
    frame.to_csv(filename, index=False)

# Upload the CSV splits to S3; the SageMaker training job reads its channels from there.
train_sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer/train"
test_sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer/test"

# Uploads run in this order: X_train, X_test, y_train, y_test.
X_trainpath, X_testpath, y_trainpath, y_testpath = (
    sess.upload_data(path=local_file, bucket=bucket, key_prefix=key_prefix)
    for local_file, key_prefix in (
        ("X_train-V-1.csv", train_sk_prefix),
        ("X_test-V-1.csv", test_sk_prefix),
        ("y_train-V-1.csv", train_sk_prefix),
        ("y_test-V-1.csv", test_sk_prefix),
    )
)

# Show the S3 URI returned for each uploaded file.
for s3_uri in (X_trainpath, X_testpath, y_trainpath, y_testpath):
    print(s3_uri)

✅ y_train unique labels: {'t3.medium', 't3.large', 'm5.4xlarge', 't3a.micro', 't3.small'}
✅ y_test unique labels: {'t3.medium', 't3.large', 'm5.4xlarge', 't3a.micro', 't3.small'}
s3://ec2-utilization-sagemaker-model/sagemaker/mobile_price_classification/sklearncontainer/train/X_train-V-1.csv
s3://ec2-utilization-sagemaker-model/sagemaker/mobile_price_classification/sklearncontainer/test/X_test-V-1.csv
s3://ec2-utilization-sagemaker-model/sagemaker/mobile_price_classification/sklearncontainer/train/y_train-V-1.csv
s3://ec2-utilization-sagemaker-model/sagemaker/mobile_price_classification/sklearncontainer/test/y_test-V-1.csv


In [4]:
%%writefile script.py
import pandas as pd
import pickle
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import joblib
import logging
import argparse
import os
import ast
import boto3
import numpy as np
from botocore.exceptions import NoCredentialsError, ClientError

# Logging setup for better tracking on SageMaker
# Root logger is configured at INFO so the logger.info(...) progress messages
# emitted throughout this script are not filtered out.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Module-level S3 client used by check_s3_file_exists; it is created when the
# module is loaded, using boto3's default credential and region resolution.
s3_client = boto3.client('s3')
# Helper function to safely parse lists from strings
def safe_eval(param_str):
    """Parse a Python literal (list, dict, number, ...) from its string form.

    Args:
        param_str: Text containing a Python literal expression.

    Returns:
        The value produced by ``ast.literal_eval``.

    Raises:
        ValueError: If the text cannot be parsed as a literal.
    """
    try:
        parsed = ast.literal_eval(param_str)
    except (ValueError, SyntaxError):
        raise ValueError(f"Invalid parameter format: {param_str}")
    return parsed
        
# Function to check if a file exists in S3
def check_s3_file_exists(bucket, file_key):
    """Report whether a HEAD request for ``s3://bucket/file_key`` succeeds.

    Args:
        bucket: Name of the S3 bucket.
        file_key: Object key inside the bucket.

    Returns:
        True when ``head_object`` succeeds; False when it raises ``ClientError``
        (the error is logged, not re-raised).
    """
    try:
        s3_client.head_object(Bucket=bucket, Key=file_key)
    except ClientError as e:
        # Any ClientError is treated as "missing" and reported via the log.
        logger.error(f"File not found: s3://{bucket}/{file_key} - {e}")
        return False
    else:
        logger.info(f"File exists: s3://{bucket}/{file_key}")
        return True
    
# Fail fast when the training features are missing from S3.
# The key includes the "train/" segment because the notebook uploads the file
# under the ".../sklearncontainer/train" prefix. The error message is built from
# the bucket/key checked here: `args` does not exist yet at module level, so
# referencing it would turn a missing file into a NameError.
_TRAIN_CHECK_BUCKET = "ec2-utilization-sagemaker-model"
_TRAIN_CHECK_KEY = "sagemaker/mobile_price_classification/sklearncontainer/train/X_train-V-1.csv"

train_file_check = check_s3_file_exists(_TRAIN_CHECK_BUCKET, _TRAIN_CHECK_KEY)
if not train_file_check:
    raise FileNotFoundError(
        f"Training file X_train-V-1.csv not found in S3 path "
        f"s3://{_TRAIN_CHECK_BUCKET}/{_TRAIN_CHECK_KEY}"
    )



# Model loading function for SageMaker
def model_fn(model_dir):
    """Load and return the estimator stored as ``model.joblib`` inside ``model_dir``."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))
# Prediction function for SageMaker
def predict_fn(input_data, model):
    """Scale the incoming features and return the model's predictions.

    Args:
        input_data: Deserialized request payload. A dict is wrapped into a
            one-row DataFrame; any other value is handed to the scaler as-is.
        model: The estimator returned by ``model_fn``.

    Returns:
        The output of ``model.predict`` on the scaled input. Labels are returned
        exactly as the model emits them (no inverse label encoding is applied).
    """
    # Resolve the model directory once, with a fallback, and use it for every
    # artifact. Reading SM_MODEL_DIR without a default would pass None to
    # os.path.join (TypeError) whenever the variable is not set.
    model_dir = os.getenv("SM_MODEL_DIR", "/opt/ml/model")
    logger.info(f"Checking files in: {model_dir}")
    logger.info(f"Files in model directory: {os.listdir(model_dir)}")

    # The artifacts are reloaded on every call; the encoder is only used for logging.
    scaler = joblib.load(os.path.join(model_dir, "scaler.joblib"))
    encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
    logger.info(f"Loaded encoder classes: {encoder.classes_}")

    # Preprocess the input data: scale it
    if isinstance(input_data, dict):
        input_data = pd.DataFrame([input_data])
    input_data_scaled = scaler.transform(input_data)

    # Make prediction using the trained model
    prediction = model.predict(input_data_scaled)
    logger.info(f"Raw prediction output: {prediction}")

    return prediction

# Main script execution
if __name__ == "__main__":
    # Training entry point: parse hyperparameters and data locations, fit a
    # StandardScaler and a RandomForest on the training split, save the model,
    # scaler and label encoder to the model directory, and log test metrics.

    logger.info("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters passed via command-line arguments (for Random Forest)
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=20)
    parser.add_argument("--min_samples_split", type=int, default=2)
    parser.add_argument("--min_samples_leaf", type=int, default=1)

    # Directories for model, train, test, etc. (defaults come from the SM_* environment variables)
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--X-train-file", type=str, default="X_train-V-1.csv")
    parser.add_argument("--X-test-file", type=str, default="X_test-V-1.csv")
    parser.add_argument("--y-train-file", type=str, default="y_train-V-1.csv")
    parser.add_argument("--y-test-file", type=str, default="y_test-V-1.csv")

    args = parser.parse_args()

    # argparse already converted these to int (type=int above)
    n_estimators = args.n_estimators
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split
    min_samples_leaf = args.min_samples_leaf

    # Check versions for logging
    logger.info("SKLearn Version: %s", sklearn.__version__)
    logger.info("Joblib Version: %s", joblib.__version__)

    logger.info("[INFO] Reading data")
    # Load the four CSV files; log and re-raise on any failure so the job fails visibly
    try:
        X_train = pd.read_csv(os.path.join(args.train, args.X_train_file))
        y_train = pd.read_csv(os.path.join(args.train, args.y_train_file))
        X_test = pd.read_csv(os.path.join(args.test, args.X_test_file))
        y_test = pd.read_csv(os.path.join(args.test, args.y_test_file))
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        raise

    # Validate shapes of datasets
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError("Mismatch: X_train and y_train row counts are different!")
    if X_test.shape[0] != y_test.shape[0]:
        raise ValueError("Mismatch: X_test and y_test row counts are different!")

    # Single-point grid: GridSearchCV is used here for its 5-fold cross-validation
    # of the one hyperparameter combination that was passed in.
    param_grid = {
        'n_estimators': [n_estimators],
        'max_depth': [max_depth],
        'min_samples_split': [min_samples_split],
        'min_samples_leaf': [min_samples_leaf]
    }

    # The split ratio is decided by whoever produced the CSVs, so only the
    # observed shapes are reported (no hard-coded percentages).
    logger.info("Data Shape:")
    logger.info("---- SHAPE OF TRAINING DATA ---- %s", str(X_train.shape))
    logger.info("---- SHAPE OF TESTING DATA ---- %s", str(X_test.shape))

    logger.info("Training RandomForest Model.....")
    # Fit the scaler on the training split only, then apply it to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    logger.info("Scalar completed.......")

    # Labels arrive as single-column DataFrames; flatten them to 1-D arrays
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    # The encoder is fitted on all labels and saved alongside the model, but the
    # classifier itself is trained on the original (unencoded) labels.
    encoder = LabelEncoder()
    encoder.fit(np.concatenate([y_train, y_test]))

    logger.info(f"y_train unique labels: {set(y_train)}")
    logger.info(f"y_test unique labels: {set(y_test)}")
    logger.info(f"Encoder classes: {encoder.classes_}")
    logger.info(f"Test labels not in encoder: {set(y_test) - set(encoder.classes_)}")

    # Train on the SCALED features: predict_fn scales every request with the saved
    # scaler, so the model must see the same representation during training.
    # random_state makes the fitted forest reproducible between runs.
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
    grid_search.fit(X_train_scaled, y_train)

    # Log best parameters found by GridSearchCV
    logger.info("Best Parameters: %s", grid_search.best_params_)

    # Get the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    # Save the model, scaler and encoder to the specified directory
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(best_model, model_path)
    joblib.dump(scaler, os.path.join(args.model_dir, "scaler.joblib"))
    joblib.dump(encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

    logger.info("Model, scaler, and encoder saved.")
    logger.info("Model persisted at %s", model_path)

    # Predictions and evaluation (on the scaled test features, matching training)
    y_pred = best_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log evaluation metrics
    logger.info(f"Accuracy: {accuracy * 100:.2f}%")
    logger.info(f"F1 Score: {f1:.2f}")
    try:
        y_proba = best_model.predict_proba(X_test_scaled)
        if len(best_model.classes_) == 2:
            # Binary case: score with the probability of the second class
            roc_auc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            # Multi-class case: one-vs-rest AUC over the full probability matrix;
            # labels= fixes the column order to the model's class order.
            roc_auc = roc_auc_score(
                y_test,
                y_proba,
                multi_class="ovr",
                average="weighted",
                labels=best_model.classes_,
            )
        logger.info(f"ROC AUC: {roc_auc:.2f}")
    except ValueError as e:
        # Best-effort metric: report why it could not be computed and carry on
        logger.warning("ROC AUC unavailable: %s", e)


Overwriting script.py


In [5]:
# Importing sagemaker's default SKLearn library
from sagemaker.sklearn.estimator import SKLearn
# Version string of the SageMaker scikit-learn framework container
FRAMEWORK_VERSION = "0.23-1"

# Hyperparameters for the RF classifier; script.py declares matching command-line arguments
rf_hyperparameters = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
}

# Network placement for the training job (replace with your own subnet / security group IDs)
training_subnets = ["subnet-0a83a3f68f72be3d6", "subnet-0b8e0d7f84cbc825d", "subnet-025822b6f58988d86"]
training_security_groups = ["sg-03d1cf205a3d1fc89"]

sklearn_estimator = SKLearn(
    # training script written by the %%writefile cell above
    entry_point="script.py",
    # ARN of a SageMaker execution role (a user ARN does not work here)
    role="arn:aws:iam::324037300355:role/service-role/AmazonSageMaker-ExecutionRole-20250320T095424",
    # one training instance of this type
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    # prefix for the generated training job name
    base_job_name="RF-custom-model",
    hyperparameters=rf_hyperparameters,
    # spot training with explicit wait / run limits
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
    subnets=training_subnets,
    security_group_ids=training_security_groups,
)

In [6]:
from sagemaker.inputs import TrainingInput

# Launch the training job: SageMaker creates an instance and runs script.py on it.

# Build the channel locations from the bucket and prefixes used for the upload,
# so the training job always reads exactly what was uploaded above.
# Each channel points at the S3 prefix (directory), not at an individual file.
train_data_dir = f"s3://{bucket}/{train_sk_prefix}/"
test_data_dir = f"s3://{bucket}/{test_sk_prefix}/"

train_data = {
    'train': TrainingInput(train_data_dir, content_type='csv'),
    'test': TrainingInput(test_data_dir, content_type='csv')
}

# Train the model and block until the job finishes
sklearn_estimator.fit(train_data, wait=True)


2025-04-29 15:05:58 Starting - Starting the training job...
2025-04-29 15:06:14 Starting - Preparing the instances for training...
2025-04-29 15:07:02 Downloading - Downloading the training image......
2025-04-29 15:07:48 Training - Training image download completed. Training in progress.[34m2025-04-29 15:07:52,092 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-04-29 15:07:52,096 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-04-29 15:07:52,146 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-04-29 15:07:52,392 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-04-29 15:07:52,406 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-04-29 15:07:52,419 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-04-29 15

In [8]:
# Wait for the most recent training job to finish, without streaming its logs
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

# Ask SageMaker where the trained model archive was written
job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

# Prints the exact location of the model in the S3 bucket
print("Model artifact persisted at artifact:" + artifact)


2025-04-29 15:08:26 Starting - Preparing the instances for training
2025-04-29 15:08:26 Downloading - Downloading the training image
2025-04-29 15:08:26 Training - Training image download completed. Training in progress.
2025-04-29 15:08:26 Uploading - Uploading generated training model
2025-04-29 15:08:26 Completed - Training job completed
Model artifact persisted at artifact:s3://sagemaker-us-east-1-324037300355/RF-custom-model-2025-04-29-15-05-53-604/output/model.tar.gz


In [9]:
# create a copy of the trained model which can be used to deploy
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# A UTC timestamp suffix gives every model created by this cell a distinct name
creation_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = "RF-custom-model-" + creation_stamp

# Wrap the trained artifact so it can be deployed; script.py supplies model_fn / predict_fn
model = SKLearnModel(
    model_data=artifact,
    name=model_name,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::324037300355:role/service-role/AmazonSageMaker-ExecutionRole-20250320T095424",
)

In [10]:
# Display the model name generated in the previous cell
model_name

'RF-custom-model-2025-04-29-15-11-14'

In [11]:
# Timestamped endpoint name, printed so it can be found again later
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "RF-custom-model-" + deploy_stamp
print("EndpointName={}".format(endpoint_name))

# Deploy the model as an endpoint backed by a single instance of this type
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m5.large",
    initial_instance_count=1,
)

EndpointName=RF-custom-model-2025-04-29-15-58-13


--------!

In [239]:
# Display the predictor object returned by model.deploy(...)
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x149b436d0>

In [322]:
import json
import boto3

# Column order the model was trained on; the payload must list values in this order.
FEATURE_ORDER = ["CPUUtilization", "DiskReadOps", "DiskWriteOps", "NetworkIn", "NetworkOut"]

# Define new data (ensure numerical inputs only)
new_data = {
    "CPUUtilization": 90,
    "DiskReadOps": 9000,
    "DiskWriteOps": 90000,
    "NetworkIn": 7000,
    "NetworkOut": 9000
}

# Convert data to a JSON payload: one row of values, ordered explicitly by
# FEATURE_ORDER instead of relying on the dict's insertion order.
payload = json.dumps([[new_data[name] for name in FEATURE_ORDER]])

# Setup the SageMaker runtime client
sagemaker_runtime = boto3.client("sagemaker-runtime")

# `endpoint_name` is the name generated by the deployment cell above, so this
# request goes to the endpoint created in this notebook run. Assign a different
# name here only if you deliberately want to query another endpoint.

try:
    # Send the data to the SageMaker endpoint
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=payload
    )

    # Parse the response
    result = json.loads(response["Body"].read().decode())

    # Extract prediction from response (dict responses are probed for known keys,
    # anything else is used as returned)
    if isinstance(result, dict):
        predicted_type = result.get("predicted_instance_type") or result.get("prediction") or list(result.values())[0]
    else:
        predicted_type = result

    # CPU-based recommendation logic
    cpu_utilization = new_data['CPUUtilization']

    if cpu_utilization < 20:
        print(f"Predicted InstanceType: {predicted_type}")
        print("Recommendation: Scale down to a smaller instance to reduce costs.")
    elif cpu_utilization > 80:
        print(f"Predicted InstanceType: {predicted_type}")
        print("Recommendation: Scale up to a larger instance type for better performance.")
    else:
        print("CPU utilization is within an optimal range. No action needed.")

except Exception as e:
    # Report the failure in the cell output instead of raising
    print(f"Error during prediction: {e}")


Predicted InstanceType: ['t3.medium']
Recommendation: Scale up to a larger instance type for better performance.
