## Installing depencancies

In [1]:
!pip install boto3 pandas



## Importing and Initialization of constants

In [2]:
import boto3
import pandas as pd
import os
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


  import scipy.sparse


In [3]:
# Initialize the S3 client
s3 = boto3.client('s3')

**NOTE**: Change the bucket_name

In [4]:
# S3 bucket, region, session
bucket_name = 'data-bucket-axhq3rp8'
my_region = boto3.session.Session().region_name
sess = boto3.session.Session()
print("Region is " + my_region + " and bucket is " + bucket_name)

Region is eu-west-3 and bucket is data-bucket-axhq3rp8


## Data Import

In [5]:
#key (file path within the bucket)
file_key = 'final_dataset.csv'

# Local path to temporarily save the file
local_file_path = 'local_file.csv'

# Download the file from S3
s3.download_file(bucket_name, file_key, 'local_file.csv')

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('local_file.csv')

# Remove the local file after loading it
os.remove(local_file_path)

# df = df.head(1000) ## TESTING WITH SMALL DATA

In [6]:
##CHECK df.shape 
##CHECK df.info()

## Show the DataSets

In [7]:
df = df.drop(columns = ["BlockId","Features","TimeInterval"] )
df.head(5)

Unnamed: 0,Latency,Label,E1,E2,E3,E4,E5,E6,E7,E8,...,E20,E21,E22,E23,E24,E25,E26,E27,E28,E29
0,21015,Success,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
1,35833,Success,0,2,1,2,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
2,53457,Success,0,0,4,2,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
3,7208,Success,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
4,871,Success,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0


## Data OutPut Path

In [8]:
# Set an output path where the trained model will be saved
prefix = 'pretrained-algo'
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://data-bucket-axhq3rp8/pretrained-algo/output


## Making Data ML Ready

In [9]:
# Determine the number of rows for the 80% split
split_index = int(len(df) * 0.8)

# Extract and shuffle the top 80% of rows
_80_df = df.iloc[:split_index].sample(frac=1, random_state=442).reset_index(drop=True)

# Extract and shuffle the bottom 20% of rows
_20_df = df.iloc[split_index:].sample(frac=1, random_state=352).reset_index(drop=True)

# Concatenate the shuffled DataFrames
final_df = pd.concat([_80_df, _20_df], ignore_index=True)

In [10]:
df = final_df
df.head(5)

Unnamed: 0,Latency,Label,E1,E2,E3,E4,E5,E6,E7,E8,...,E20,E21,E22,E23,E24,E25,E26,E27,E28,E29
0,38,Success,0,0,0,0,3,0,0,0,...,0,0,1,0,0,0,3,0,0,0
1,9919,Success,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
2,53317,Success,0,0,4,2,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
3,3426,Success,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
4,44,Success,0,0,0,0,3,0,0,0,...,0,0,1,0,0,0,3,0,0,0


In [11]:
import numpy as np

# Split the data into 80% for training and 20% for testing
train_data, test_data = np.split(df, [int(0.8 * len(df))])

print(train_data.shape, test_data.shape)


(80000, 31) (20000, 31)


  return bound(*args, **kwds)


### Training Data

In [12]:
label_column = train_data['Label']
features = train_data.drop(columns=['Label'])

# To Ensure 'Label' Column is the First column in the DataFrame before Saving
train_data_final = pd.concat([label_column, features], axis=1)

In [13]:
# Mapping
train_data_final['Label'] = train_data_final['Label'].map({'Success': 1, 'Fail': 0}) 

In [14]:
train_data_final.head(5)

Unnamed: 0,Label,Latency,E1,E2,E3,E4,E5,E6,E7,E8,...,E20,E21,E22,E23,E24,E25,E26,E27,E28,E29
0,1,38,0,0,0,0,3,0,0,0,...,0,0,1,0,0,0,3,0,0,0
1,1,9919,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
2,1,53317,0,0,4,2,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
3,1,3426,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
4,1,44,0,0,0,0,3,0,0,0,...,0,0,1,0,0,0,3,0,0,0


In [15]:
# Save to CSV
train_data_final.to_csv('train.csv', index=False, header=False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
# Create an s3_input for the training data
s3_input_train = s3_input(s3_data='s3://{}/{}/train/train.csv'.format(bucket_name, prefix), content_type='text/csv')

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Testing Data

In [16]:
label_column = test_data['Label']
features = test_data.drop(columns=['Label'])

test_data_final = pd.concat([label_column, features], axis=1)

In [17]:
# Mapping
test_data_final['Label'] = test_data_final['Label'].map({'Success': 1, 'Fail': 0})  # Map your labels to 1 and 0


In [18]:
##CHECK test_data_final.shape
test_data_final.head(5)

Unnamed: 0,Label,Latency,E1,E2,E3,E4,E5,E6,E7,E8,...,E20,E21,E22,E23,E24,E25,E26,E27,E28,E29
80000,1,21455,0,1,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
80001,1,8507,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
80002,1,33681,0,0,8,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
80003,1,5312,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0
80004,1,2216,0,0,0,0,3,0,0,0,...,0,3,1,3,0,0,3,0,0,0


In [19]:
# Save to CSV
test_data_final.to_csv('test.csv', index=False, header=False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
# Create an s3_input for the \testing data
s3_input_test = s3_input(s3_data='s3://{}/{}/test/test.csv'.format(bucket_name, prefix), content_type='text/csv')

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


## SageMaker SDK/ Inbuild Container 

### Calling Inbuilt Image URI

In [20]:
# Looks for the XGBoost image URI and builds an XGBoost container. Specify the repo_version depending on preference.
container = get_image_uri(boto3.Session().region_name,
                          'xgboost', 
                          repo_version='1.0-1')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


This constructs the Docker image URI for XGBoost in the specified AWS region and version.

### Initialize hyperparameters

In [21]:
hyperparameters = {
        "max_depth":"5",                ## Maximum depth of a tree. Higher means more complex models but risk of overfitting.
        "eta":"0.2",                    ## Learning rate. Lower values make the learning process slower but more precise.
        "gamma":"4",                    ## Minimum loss reduction required to make a further partition on a leaf node. Controls the model’s complexity.
        "min_child_weight":"6",         ## Minimum sum of instance weight (hessian) needed in a child. Higher values prevent overfitting.
        "subsample":"0.7",              ## Fraction of training data used. Reduces overfitting by sampling part of the data. 
        "objective":"binary:logistic",  ## Specifies the learning task and corresponding objective. binary:logistic is for binary classification.
        "num_round":50                  ## Number of boosting rounds, essentially how many times the model is trained.
        }

In [22]:
# A SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container,                  # Points to the XGBoost container we previously set up. This tells SageMaker which algorithm container to use.
                                          hyperparameters=hyperparameters,      # Passes the defined hyperparameters to the estimator. These are the settings that guide the training process.
                                          role=sagemaker.get_execution_role(),  # Specifies the IAM role that SageMaker assumes during the training job. This role allows access to AWS resources like S3.
                                          train_instance_count=1,               # Sets the number of training instances. Here, it’s using a single instance.
                                          train_instance_type='ml.m5.large',    # Specifies the type of instance to use for training. ml.m5.2xlarge is a general-purpose instance with a balance of compute, memory, and network resources.
                                          train_volume_size=5, # 5GB            # Sets the size of the storage volume attached to the training instance, in GB. Here, it’s 5 GB.
                                          output_path=output_path,              # Defines where the model artifacts and output of the training job will be saved in S3.
                                          train_use_spot_instances=True,        # Utilizes spot instances for training, which can be significantly cheaper than on-demand instances. Spot instances are spare EC2 capacity offered at a lower price.
                                          train_max_run=300,                    # Specifies the maximum runtime for the training job in seconds. Here, it's 300 seconds (5 minutes).
                                          train_max_wait=600)                   # Sets the maximum time to wait for the job to complete, including the time waiting for spot instances, in seconds. Here, it's 600 seconds (10 minutes).

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


FILE: ~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/estimator.py

### Training The Model

In [23]:
estimator.fit({'train': s3_input_train,'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-10-21-07-40-14-736


2024-10-21 07:40:18 Starting - Starting the training job...
2024-10-21 07:40:32 Starting - Preparing the instances for training...
2024-10-21 07:40:55 Downloading - Downloading input data...
2024-10-21 07:41:36 Downloading - Downloading the training image.....[34m[2024-10-21 07:42:26.164 ip-10-0-141-6.eu-west-3.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Det

## Deployment as Endpoint

In [24]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m5.large')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-10-21-07-43-04-870
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-10-21-07-43-04-870
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-10-21-07-43-04-870


------!

## Validation

In [25]:
from sagemaker.serializers import CSVSerializer
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [26]:
# Drop the label column from the test data
test_data_features = test_data_final.drop(columns=['Label']).values

# Set the content type and serializer
xgb_predictor.serializer = CSVSerializer()
xgb_predictor.content_type = 'text/csv'

# Perform prediction
predictions = xgb_predictor.predict(test_data_features).decode('utf-8')

y_test = test_data_final['Label'].values

# Convert the predictions into a array
predictions_array = np.fromstring(predictions, sep=',')
print(predictions_array.shape)


(20000,)


In [27]:
# Converting predictions them to binary (0 or 1)
threshold = 0.5
binary_predictions = (predictions_array >= threshold).astype(int)


In [28]:
# Accuracy
accuracy = accuracy_score(y_test, binary_predictions)

# Precision
precision = precision_score(y_test, binary_predictions)

# Recall
recall = recall_score(y_test, binary_predictions)

# F1 Score
f1 = f1_score(y_test, binary_predictions)

# Confusion Matrix
cm = confusion_matrix(y_test, binary_predictions)

# False Positive Rate (FPR) using the confusion matrix
tn, fp, fn, tp = cm.ravel()
false_positive_rate = fp / (fp + tn)


In [29]:

# Print the metrics
print(f"Accuracy: {accuracy:.8f}")
print(f"Precision: {precision:.8f}")
print(f"Recall: {recall:.8f}")
print(f"F1 Score: {f1:.8f}")
print(f"False Positive Rate: {false_positive_rate:.8f}")


Accuracy: 0.99910000
Precision: 0.99940600
Recall: 0.99952474
F1 Score: 0.99946537
False Positive Rate: 0.00315756


## Deleting The EndPoint

In [30]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-10-21-07-43-04-870


In [31]:
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'WED578615H3KTESY',
   'HostId': 'fR1TP7LFBjgaLD+qSPb6uuMtNHgPmrSGTuDXErDIij2ZuepNFkHaWBgaSWeYQeoTeak+z2GFS4k=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'fR1TP7LFBjgaLD+qSPb6uuMtNHgPmrSGTuDXErDIij2ZuepNFkHaWBgaSWeYQeoTeak+z2GFS4k=',
    'x-amz-request-id': 'WED578615H3KTESY',
    'date': 'Mon, 21 Oct 2024 07:49:59 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/training_job_end.ts'},
   {'Key': 'pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/collections/000000000/worker_0_collections.json'},
   {'Key': 'pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/profiler-output/framework/training_job_end.ts'},
   {'Key': 'pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/de