## Installing dependencies

In [None]:
!pip install boto3 pandas

## Importing and Initialization of constants

In [None]:
import boto3
import pandas as pd
import os
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

In [3]:
# Low-level S3 client shared by the cells that talk to the bucket
s3 = boto3.client(service_name='s3')

**NOTE**: Change `bucket_name` in the next cell to the name of your own S3 bucket before running the notebook.

In [None]:
# Bucket holding the dataset, the boto3 session, and the region that session resolves to
bucket_name = 'data-bucket-dkmq0cxe'
sess = boto3.session.Session()
my_region = sess.region_name
region_message = "Region is " + my_region
print(region_message + " and bucket is " + bucket_name)

## Data Import

In [5]:
# Key (file path within the bucket) of the dataset
file_key = 'final_dataset.csv'

# Local path used to stage the download temporarily
local_file_path = 'local_file.csv'

try:
    # Download the file from S3 to the staging path
    s3.download_file(bucket_name, file_key, local_file_path)

    # Load the CSV file into a Pandas DataFrame
    df = pd.read_csv(local_file_path)
finally:
    # Always remove the staging file, even when the download or the parse fails
    if os.path.exists(local_file_path):
        os.remove(local_file_path)

# df = df.head(1000) ## TESTING WITH SMALL DATA

In [6]:
##CHECK df.shape 
##CHECK df.info()

## Show the DataSets

In [None]:
# Columns removed from the frame before it is split and written out for training
columns_to_drop = ["BlockId", "Features", "TimeInterval"]

# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")
df.head(5)

## Data Output Path

In [None]:
# S3 location under which the trained model output is saved
prefix = 'pretrained-algo'
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

## Making Data ML Ready

In [None]:
# Row position separating the leading 80% of rows from the trailing 20%
split_index = int(len(df) * 0.8)

# Shuffle each partition on its own (fixed seeds make the order reproducible),
# so no row crosses the 80/20 boundary.
leading_rows = df.iloc[:split_index]
trailing_rows = df.iloc[split_index:]
_80_df = leading_rows.sample(frac=1, random_state=442).reset_index(drop=True)
_20_df = trailing_rows.sample(frac=1, random_state=352).reset_index(drop=True)

# Stack the two shuffled partitions back together under a fresh index
final_df = pd.concat([_80_df, _20_df], ignore_index=True)

In [None]:
# Continue with the partition-shuffled frame under the name `df`
df = final_df
df.head()

In [None]:
# Split the data into 80% for training and 20% for testing.
# Positional slicing replaces np.split(df, ...), which goes through
# DataFrame.swapaxes (deprecated in recent pandas releases).
train_test_boundary = int(0.8 * len(df))
train_data = df.iloc[:train_test_boundary]
test_data = df.iloc[train_test_boundary:]

print(train_data.shape, test_data.shape)


### Training Data

In [None]:
# Separate the feature columns from the target column
features = train_data.drop(columns='Label')
label_column = train_data['Label']

# Re-assemble with 'Label' as the first column, the layout used when saving
train_data_final = pd.concat([label_column, features], axis=1)

In [None]:
# Encode the string labels as integers. The identity entries (1 -> 1, 0 -> 0)
# keep this cell safe to re-run: without them a second run would map the
# already-encoded values to NaN.
label_encoding = {'Success': 1, 'Fail': 0, 1: 1, 0: 0}
train_data_final['Label'] = train_data_final['Label'].map(label_encoding)

# Fail fast on labels outside the mapping instead of writing NaN targets to the CSV
assert train_data_final['Label'].notna().all(), "Unexpected value in 'Label'; expected 'Success' or 'Fail'"

In [None]:
# Preview the first rows of the encoded training frame
train_data_final.head()

In [None]:
# Save to CSV without index or header row
train_data_final.to_csv('train.csv', index=False, header=False)

# S3 object keys always use '/', so the key is built explicitly rather than with
# os.path.join (which would use the local OS separator).
train_key = f'{prefix}/train/train.csv'
s3.upload_file('train.csv', bucket_name, train_key)

# Training channel definition. TrainingInput is the SDK v2 name of the
# deprecated s3_input alias.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket_name}/{train_key}', content_type='text/csv')

### Testing Data

In [None]:
# Separate the feature columns from the target column
features = test_data.drop(columns='Label')
label_column = test_data['Label']

# Re-assemble with 'Label' as the first column
test_data_final = pd.concat([label_column, features], axis=1)

In [None]:
# Encode the string labels as integers. The identity entries (1 -> 1, 0 -> 0)
# keep this cell safe to re-run: without them a second run would map the
# already-encoded values to NaN.
label_encoding = {'Success': 1, 'Fail': 0, 1: 1, 0: 0}
test_data_final['Label'] = test_data_final['Label'].map(label_encoding)

# Fail fast on labels outside the mapping instead of evaluating against NaN targets
assert test_data_final['Label'].notna().all(), "Unexpected value in 'Label'; expected 'Success' or 'Fail'"


In [None]:
##CHECK test_data_final.shape
# Preview the first rows of the encoded test frame
test_data_final.head()

In [None]:
# Save to CSV without index or header row
test_data_final.to_csv('test.csv', index=False, header=False)

# S3 object keys always use '/', so the key is built explicitly rather than with
# os.path.join (which would use the local OS separator).
test_key = f'{prefix}/test/test.csv'
s3.upload_file('test.csv', bucket_name, test_key)

# Channel definition for the testing data. TrainingInput is the SDK v2 name of
# the deprecated s3_input alias.
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket_name}/{test_key}', content_type='text/csv')

## SageMaker SDK / Built-in Container

### Calling the Built-in Image URI

In [None]:
# Resolve the URI of the built-in XGBoost container image for the current region.
# sagemaker.image_uris.retrieve is the SDK v2 replacement for the deprecated
# get_image_uri helper. Change `version` to pick a different XGBoost release.
container = sagemaker.image_uris.retrieve(framework='xgboost',
                                          region=boto3.Session().region_name,
                                          version='1.0-1')

This constructs the Docker image URI for XGBoost in the specified AWS region and version.

### Initialize hyperparameters

In [19]:
# XGBoost training settings handed to the estimator
hyperparameters = dict(
    max_depth="5",                  # Maximum tree depth: deeper trees are more expressive but overfit more easily
    eta="0.2",                      # Learning rate (step-size shrinkage applied after each boosting round)
    gamma="4",                      # Minimum loss reduction needed to split a leaf further
    min_child_weight="6",           # Minimum sum of instance weight (hessian) required in a child
    subsample="0.7",                # Fraction of the training rows sampled for each tree
    objective="binary:logistic",    # Binary classification
    num_round=50,                   # Number of boosting rounds
)

In [None]:
# A SageMaker estimator that runs training in the XGBoost container.
# Uses the SDK v2 keyword names (instance_count, instance_type, volume_size,
# use_spot_instances, max_run, max_wait) instead of the deprecated train_* forms.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,                  # XGBoost container image resolved earlier
    hyperparameters=hyperparameters,      # Training settings defined above
    role=sagemaker.get_execution_role(),  # IAM role SageMaker assumes for the training job
    instance_count=1,                     # Train on a single instance
    instance_type='ml.m5.large',          # General-purpose instance type used for training
    volume_size=5,                        # Storage volume attached to the training instance, in GB
    output_path=output_path,              # S3 location for the model artifacts and job output
    use_spot_instances=True,              # Train on spot capacity
    max_run=300,                          # Maximum training runtime in seconds (5 minutes)
    max_wait=600,                         # Maximum total time in seconds, including waiting for spot capacity (10 minutes)
)

Reference: the `Estimator` class used above is defined in `~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/estimator.py`.

### Training The Model

In [None]:
# Channel name -> input definition; the 'validation' channel is fed from the test split
channels = {'train': s3_input_train, 'validation': s3_input_test}
estimator.fit(channels)

## Deployment as Endpoint

In [None]:
# Deploy the trained model to an endpoint backed by one ml.m5.large instance
xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
)

## Validation

In [23]:
from sagemaker.serializers import CSVSerializer
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Drop the label column from the test data
test_data_features = test_data_final.drop(columns=['Label']).values

# CSVSerializer encodes the payload and supplies the 'text/csv' content type.
# Predictor.content_type is a read-only property in SDK v2, so it is not
# assigned separately (the assignment raises AttributeError).
xgb_predictor.serializer = CSVSerializer()

# Perform prediction; the whole test set is sent in one request
# NOTE(review): a large test set may exceed the endpoint's request size limit —
# consider sending it in batches; TODO confirm against the dataset size.
predictions = xgb_predictor.predict(test_data_features).decode('utf-8')

y_test = test_data_final['Label'].values

# Convert the comma-separated response into an array
predictions_array = np.fromstring(predictions, sep=',')
print(predictions_array.shape)

# One prediction per test row is required for the metrics computed afterwards
assert predictions_array.shape[0] == len(y_test), "Endpoint returned a different number of predictions than test rows"


In [None]:
# Convert the predicted scores to class labels: 1 at or above the cut-off, otherwise 0
threshold = 0.5
binary_predictions = np.greater_equal(predictions_array, threshold).astype(int)


In [None]:
# Accuracy
accuracy = accuracy_score(y_test, binary_predictions)

# Precision
precision = precision_score(y_test, binary_predictions)

# Recall
recall = recall_score(y_test, binary_predictions)

# F1 Score
f1 = f1_score(y_test, binary_predictions)

# Confusion Matrix. labels=[0, 1] forces a 2x2 matrix even when only one class
# occurs in y_test and the predictions, so the four-way unpacking below cannot fail.
cm = confusion_matrix(y_test, binary_predictions, labels=[0, 1])

# False Positive Rate (FPR) using the confusion matrix
tn, fp, fn, tp = cm.ravel()
negatives = fp + tn
# With no negative samples the rate is undefined; report NaN without a division warning
false_positive_rate = fp / negatives if negatives else float('nan')


In [None]:

# Report every metric with eight decimal places, one per line
metric_report = [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("False Positive Rate", false_positive_rate),
]
for metric_name, metric_value in metric_report:
    print(f"{metric_name}: {metric_value:.8f}")


## Deleting the Endpoint

In [None]:
# Predictor.endpoint is a deprecated alias in SDK v2; endpoint_name is the supported attribute
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint_name)

In [None]:
# WARNING: this removes every object in the bucket, including the input dataset
# that the data-import cell downloads, not only the files written by this notebook.
s3_resource = boto3.resource('s3')
bucket_to_delete = s3_resource.Bucket(bucket_name)
bucket_to_delete.objects.all().delete()