## Installing Dependencies

In [1]:
%pip install boto3 pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Fetching AWS Credentials

In [2]:
# Function to load AWS credentials from terraform.tfvars
def load_terraform_vars(filepath):
    """Parse flat ``key = "value"`` assignments from a Terraform tfvars file.

    Blank lines, comment lines (starting with ``#`` or ``//``, even when
    indented) and lines that contain no ``=`` are skipped. Every other line is
    split on its FIRST ``=`` only, so values that themselves contain ``=``
    (for example base64-style padding in a secret) are kept intact. All double
    quotes are removed and surrounding whitespace is stripped from both the
    key and the value.

    Args:
        filepath: Path of the tfvars file to read.

    Returns:
        dict mapping each variable name to its value as a string.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = {}
    with open(filepath, 'r') as f:
        for raw_line in f:
            line = raw_line.replace('"', '').strip()
            # Skip blanks and comments; strip first so indented comments are caught too
            if not line or line.startswith(('#', '//')):
                continue
            key, separator, value = line.partition('=')
            if not separator:
                # Not an assignment (e.g. a closing brace) -- nothing to record
                continue
            data[key.strip()] = value.strip()
    return data

**NOTE**: Change the region as needed

In [3]:
# Load credentials
# The tfvars file is expected one directory above the notebook's working directory.
# NOTE(review): `vars` shadows the Python builtin of the same name for the rest of the session.
vars = load_terraform_vars('../terraform.tfvars')
access_key = vars['access_key']  # KeyError here means the tfvars file has no such entry
secret_key = vars['secret_key']
region = "eu-west-3"  # region passed to boto3 when the S3 client is created

In [4]:
from function import get_bucket_name

In [5]:
# Call the function to get bucket name
# get_bucket_name comes from the `function` module imported above; judging by the
# cell output it prints the Terraform outputs while resolving the name.
bucket_name = get_bucket_name()
#print(f"Bucket Name: {bucket_name}") ##TEST

Attempting to get Terraform output...
Raw Terraform Output: {
  "bucket_name": {
    "sensitive": false,
    "type": "string",
    "value": "data-bucket-axhq3rp8"
  },
  "pretrained_ml_instance_name": {
    "sensitive": false,
    "type": "string",
    "value": "pretrained-ml-instance"
  }
}

Parsed Outputs: {
    "bucket_name": {
        "sensitive": false,
        "type": "string",
        "value": "data-bucket-axhq3rp8"
    },
    "pretrained_ml_instance_name": {
        "sensitive": false,
        "type": "string",
        "value": "pretrained-ml-instance"
    }
}


## Fetching Data

In [6]:
import pandas as pd
import numpy as np
import glob

# Directories that hold the input CSV files
path_eom = "../data/Event_occurance_matrix"
path_et = "../data/Event_traces"

# Collect every CSV file in each directory. glob returns matches in an
# arbitrary, filesystem-dependent order, so the lists are sorted: later cells
# split and sample rows by position, which makes the load order part of the
# result and must therefore be deterministic.
csv_files_eom = sorted(glob.glob(f"{path_eom}/*.csv"))
csv_files_et = sorted(glob.glob(f"{path_et}/*.csv"))

# Lists that will hold one DataFrame per CSV file
dfs_eom = []
dfs_et = []

In [7]:

# Read each .csv file into a DataFrame. Both lists are rebuilt from scratch
# (rather than appended to) so that re-running this cell cannot accumulate
# duplicate frames.
dfs_eom = [pd.read_csv(file) for file in csv_files_eom]
dfs_et = [pd.read_csv(file) for file in csv_files_et]

# Fail early with a clear message: concatenating an empty list later on would
# only report "No objects to concatenate".
if not dfs_eom or not dfs_et:
    raise FileNotFoundError(
        f"No CSV files found in {path_eom!r} and/or {path_et!r}"
    )

In [8]:
# Stack the per-file frames of each dataset row-wise into a single frame,
# renumbering the rows 0..n-1 instead of keeping each file's own index
merged_df_EOM = pd.concat(objs=dfs_eom, axis=0, ignore_index=True)
merged_df_ET = pd.concat(objs=dfs_et, axis=0, ignore_index=True)

```python
merged_df_EOM.shape
merged_df_ET.shape
```

## Data Cleaning

In [9]:
# Keep only the columns used downstream.
# Event-occurrence matrix: BlockId, Type and the columns E1 through E29.
event_columns = [f"E{i}" for i in range(1, 30)]
filtered_df_EOM = merged_df_EOM[["BlockId", "Type", *event_columns]]

# Event traces: identifier, trace attributes and the Label column.
trace_columns = ["BlockId", "Features", "TimeInterval", "Latency", "Label"]
filtered_df_ET = merged_df_ET[trace_columns]


```python
filtered_df_EOM.shape  
filtered_df_ET.shape  
filtered_df_EOM.info()  
filtered_df_ET.info()
```

In [10]:
# Inner-join the trace columns with the event-occurrence columns, matching rows on BlockId
final_df = filtered_df_ET.merge(filtered_df_EOM, how="inner", on="BlockId")
##CHECK final_df.info()
##CHECK final_df.shape

In [11]:
# Drop the "Type" column. Because final_df is overwritten with its own result,
# running this cell a second time would raise KeyError; errors="ignore" makes
# the drop a no-op when the column is already gone.
final_df = final_df.drop(columns="Type", errors="ignore")
##CHECK final_df.head()

In [12]:
# Separate rows based on 'Label'.
# The rows are filtered straight out of final_df instead of being concatenated
# onto empty placeholder frames: the placeholders added nothing, and recent
# pandas versions emit a FutureWarning for concatenation with empty frames
# because the dtype handling of that case is slated to change.
df_fail = final_df[final_df['Label'] == 'Fail'].reset_index(drop=True)
df_success = final_df[final_df['Label'] == 'Success'].reset_index(drop=True)

In [13]:
# Renumber both frames 0..n-1, discarding whatever index they carried before
df_fail = df_fail.reset_index(drop=True)
df_success = df_success.reset_index(drop=True)

``` python
print("FAIL DataFrame:")
df_fail.head(5)
df_fail.shape
print("SUCCESS DataFrame:")
df_success.head(5)
df_success.shape
```

In [14]:
# Draw a fixed-seed (hence repeatable) random sample of 84,165 rows from df_success
success_sample_size = 84165
df_success_sample = df_success.sample(n=success_sample_size, random_state=424)

In [15]:
# Positional 80/20 split of the failure rows: the first 80% (rounded down)
# go to the "80" part, everything after that to the "20" part
split_index_fail = int(0.8 * len(df_fail))
_80_df_fail, _20_df_fail = (
    df_fail.iloc[:split_index_fail],
    df_fail.iloc[split_index_fail:],
)

# Same positional 80/20 split for the sampled success rows
split_index_success = int(0.8 * len(df_success_sample))
_80_df_success, _20_df_success = (
    df_success_sample.iloc[:split_index_success],
    df_success_sample.iloc[split_index_success:],
)

In [16]:
# Build each partition by stacking its failure rows on top of its success rows;
# ignore_index renumbers the combined rows from 0
_80_df = pd.concat((_80_df_fail, _80_df_success), ignore_index=True)   # 80% partition
_20_df = pd.concat((_20_df_fail, _20_df_success), ignore_index=True)   # 20% partition

In [17]:
# Shuffle each partition independently: sample(frac=1) returns every row in a
# seeded random order, after which the index is renumbered from 0
_80_shuffled_df = (
    _80_df
    .sample(frac=1, random_state=352)
    .reset_index(drop=True)
)
_20_shuffled_df = (
    _20_df
    .sample(frac=1, random_state=433)
    .reset_index(drop=True)
)

In [18]:
# Put the shuffled 80% partition first and the shuffled 20% partition after it,
# so the partition boundary stays at a known row position
shuffled_df = pd.concat(
    (_80_shuffled_df, _20_shuffled_df),
    ignore_index=True,
)

##CHECK shuffled_df.head(20)

In [19]:
# Rebind final_df to the sampled and shuffled frame; from here on final_df no longer
# refers to the merged frame built earlier, so re-running earlier cells that read
# final_df would operate on this frame instead.
final_df = shuffled_df

In [20]:
## CHECK: number of rows per Label value in the final dataset
label_counts = final_df['Label'].value_counts()
print(label_counts)


Label
Success    84165
Fail       15835
Name: count, dtype: int64


## Uploading Data Into AWS S3

In [21]:
import boto3

# Initialize the S3 client (with credentials from earlier)
# access_key, secret_key and region must already be defined by the credential-loading
# cell; the client is stored in the notebook-level name `s3`, which the upload and
# download cells below rely on.
s3 = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region
)

In [22]:
import os
import pandas as pd

# Location of the dataset file on the local machine
local_file_path = '../data/final_dataset.csv'

# Write the DataFrame out as CSV, leaving the index column out of the file
final_df.to_csv(local_file_path, index=False)

# Debug: report whether the file is now present on disk
if not os.path.exists(local_file_path):
    print(f"Failed to create local file {local_file_path}.")
else:
    print(f"Local file {local_file_path} created successfully.")

# Object key (file name) the dataset will get inside the S3 bucket
file_key = "final_dataset.csv"


Local file ../data/final_dataset.csv created successfully.


In [23]:
# final_df.info()

In [24]:
# Debug: show the upload destination (bucket and object key) before uploading
print(f"Bucket Name: {bucket_name}") ## Check
print(f"File Key (S3 Filename): {file_key}") ## Check


Bucket Name: data-bucket-axhq3rp8
File Key (S3 Filename): final_dataset.csv


In [25]:
# Upload the local CSV file to the S3 bucket, then remove the local copy.
# The removal lives in `finally` so it happens exactly once whether or not the
# upload succeeded, and it is guarded so a missing file cannot raise a second
# error from inside the failure path.
upload_succeeded = False
try:
    print("try block executing")
    s3.upload_file(
        Filename=local_file_path,
        Bucket=bucket_name,
        Key=file_key               # S3 file key (filename in the bucket)
    )
    upload_succeeded = True
    print(f"Successfully uploaded {file_key} to {bucket_name}")
except Exception as e:
    # Only upload problems are reported here; a failed local delete is no
    # longer mislabelled as an upload failure.
    print(f"Failed to upload file: {e}")
finally:
    if os.path.exists(local_file_path):
        os.remove(local_file_path)
        if upload_succeeded:
            print(f"Local file {local_file_path} deleted after upload.")

try block executing
Successfully uploaded final_dataset.csv to data-bucket-axhq3rp8
Local file ../data/final_dataset.csv deleted after upload.


### ONLY RUN THE CODE BELOW AFTER THE NOTEBOOK INSTANCE'S `pretrained_sm.ipynb` FILE HAS FINISHED EXECUTING IN SAGEMAKER, BUT BEFORE RUNNING ITS LAST 2 CODE CELLS

In [26]:
import boto3
import os

def download_directory(bucket_name, local_directory, s3_client=None):
    """Download every object of an S3 bucket into a local directory tree.

    The bucket is listed page by page with the ``list_objects_v2`` paginator.
    Each object is saved to ``os.path.join(local_directory, key)``; missing
    parent directories are created on the way. Keys that end in ``/`` are
    created as local directories instead of being downloaded as files. A
    failure to download one object is printed and does not stop the remaining
    downloads.

    Args:
        bucket_name: Name of the bucket to download.
        local_directory: Local root directory for the downloaded files.
        s3_client: Optional client object providing ``get_paginator`` and
            ``download_file``. When omitted, the notebook-level ``s3`` client
            is used, as before.

    Returns:
        None. Progress and per-file errors are reported via ``print``.
    """
    client = s3 if s3_client is None else s3_client
    paginator = client.get_paginator('list_objects_v2')

    for page in paginator.paginate(Bucket=bucket_name):
        # A page without any objects has no 'Contents' entry
        for obj in page.get('Contents', []):
            # Get the file key from S3
            file_key = obj['Key']
            # Local file path where the file will be saved
            local_file_path = os.path.join(local_directory, file_key)

            # A key with a trailing slash cannot be written as a file; it is
            # presumably a folder placeholder, so mirror it as a directory.
            if file_key.endswith('/'):
                os.makedirs(local_file_path, exist_ok=True)
                continue

            # Create local directories if they don't exist. The dirname is
            # empty when both local_directory and the key have no directory
            # part, and os.makedirs('') would raise.
            parent_directory = os.path.dirname(local_file_path)
            if parent_directory:
                os.makedirs(parent_directory, exist_ok=True)

            # Download the file from S3
            try:
                client.download_file(bucket_name, file_key, local_file_path)
                print(f"Downloaded {file_key} to {local_file_path}")
            except Exception as e:
                print(f"Error downloading {file_key}: {e}")

# Set the local directory where you want to download the files
# (relative path: one level above the notebook's working directory)
local_directory = "../downloaded_bucket_content"
# Show which bucket is about to be downloaded
print(f"Bucket Name: {bucket_name}")


Bucket Name: data-bucket-axhq3rp8


In [27]:

# Call the function to download the entire bucket
# download_directory reports per-file download errors itself; this outer handler
# only sees errors raised outside that per-file handling (for example while
# listing the bucket) and prints them instead of stopping the notebook.
try:
    download_directory(bucket_name, local_directory)
except Exception as e:
    print(f"Failed to download file: {e}")


Downloaded final_dataset.csv to ../downloaded_bucket_content\final_dataset.csv
Downloaded pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/claim.smd to ../downloaded_bucket_content\pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/claim.smd
Downloaded pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/collections/000000000/worker_0_collections.json to ../downloaded_bucket_content\pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/collections/000000000/worker_0_collections.json
Downloaded pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/events/000000000000/000000000000_worker_0.tfevents to ../downloaded_bucket_content\pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/events/000000000000/000000000000_worker_0.tfevents
Downloaded pretrained-algo/output/sagemaker-xgboost-2024-10-21-07-40-14-736/debug-output/events/0000000000