In [None]:
!pip install sagemaker --upgrade

In [None]:
!pip install transformers "datasets[s3]" accelerate --upgrade

In [None]:
import sagemaker
# IAM execution role of this notebook; handed to the estimator further down.
role = sagemaker.get_execution_role()
# Default SageMaker session (rebound later with an explicit bucket and prefix).
sess = sagemaker.Session()

In [None]:
# Display the session object to confirm it was created.
sess

<sagemaker.session.Session at 0x7fb7084d69a0>

In [None]:
# Display the resolved execution-role ARN.
role

'arn:aws:iam::648057559257:role/service-role/AmazonSageMaker-ExecutionRole-20230507T193773'

In [None]:
# Rebind the session so it defaults to the project bucket and key prefix.
sess = sagemaker.Session(
    default_bucket='mlops-cdk-project-1',
    default_bucket_prefix='MLOps-Project',
)

In [None]:
import time
import pandas as pd
from datasets import load_dataset, Dataset

In [None]:
# Load the network packet dataset by its repository id; the result is indexed by
# split name below (dataset['train']).
dataset = load_dataset("rdpahalavan/network-packet-flow-header-payload")

In [None]:
# Convert the 'train' split with the dataset's own pandas exporter. Passing the
# Dataset to pd.DataFrame() makes pandas iterate it row by row as Python dicts,
# which is needlessly slow for a table with over a million rows.
df = dataset['train'].to_pandas()

In [None]:
# Display the frame (pandas shows only the first and last rows of a large frame).
df

Unnamed: 0,packet_dat,attack_cat
0,0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 ...,DDoS
1,1190 1582 3526815 -1 80 50095 1500 1460 118 0 ...,Normal
2,0 0 4 -1 80 41471 4420 4380 64 0 5 0 -1 72 84 ...,DDoS
3,0 0 176 -1 80 45284 2948 2896 64 0 8 0 -1 72 8...,DoS Hulk
4,0 0 128 -1 80 46654 1500 1448 64 0 8 0 -1 72 8...,DoS Hulk
...,...,...
1187776,14492 14492 0 -1 51328 22 164 112 62 0 8 3 -1 ...,SSH Patator
1187777,14 98 131788 -1 80 52067 1500 1460 253 0 5 0 -...,DoS
1187778,1 2 397 -1 47188 22 692 640 62 0 8 3 -1 0 0 2 ...,SSH Patator
1187779,2063 0 0 -1 80 32768 1500 1448 64 0 8 0 -1 32 ...,DoS Hulk


In [None]:
# Write the training split to train.csv without the index column.
# NOTE(review): no earlier visible cell defines `train_df`; on a fresh kernel this
# raises NameError unless the cell that splits `df` is restored — confirm.
train_df.to_csv('train.csv', index=False)

In [None]:
# Write the test split to test.csv without the index column.
# NOTE(review): no earlier visible cell defines `test_df`; on a fresh kernel this
# raises NameError unless the cell that splits `df` is restored — confirm.
test_df.to_csv('test.csv', index=False)

In [None]:
# Reload the training split from the train.csv file in the working directory.
train_df = pd.read_csv('train.csv')

In [None]:
# Reload the test split from the test.csv file in the working directory.
test_df = pd.read_csv('test.csv')

In [None]:
# Distinct attack categories present in the training split.
classes = train_df['attack_cat'].unique()

# Pair every category, taken in sorted order, with consecutive integer ids from 0.
target_map = dict(zip(sorted(classes), range(len(classes))))

In [None]:
# Display the category-name -> integer-id mapping.
target_map

{'Analysis': 0,
 'Backdoor': 1,
 'Bot': 2,
 'DDoS': 3,
 'DoS': 4,
 'DoS GoldenEye': 5,
 'DoS Hulk': 6,
 'DoS SlowHTTPTest': 7,
 'DoS Slowloris': 8,
 'Exploits': 9,
 'FTP Patator': 10,
 'Fuzzers': 11,
 'Generic': 12,
 'Heartbleed': 13,
 'Infiltration': 14,
 'Normal': 15,
 'Port Scan': 16,
 'Reconnaissance': 17,
 'SSH Patator': 18,
 'Shellcode': 19,
 'Web Attack - Brute Force': 20,
 'Web Attack - SQL Injection': 21,
 'Web Attack - XSS': 22,
 'Worms': 23}

In [None]:
# Translate each row's category name into its integer id from target_map and
# store the result as a new 'target' column.
encoded_labels = train_df['attack_cat'].map(target_map)
train_df['target'] = encoded_labels

In [None]:
# Keep only the model input and its encoded label. Select from train_df itself:
# the 'target' column was added to train_df, not test_df, so slicing test_df here
# raises KeyError and would also replace the training rows with the test rows.
train_df = train_df[['packet_dat', 'target']]

In [None]:
# Rename the two remaining columns, by position, to 'packet' and 'label'; the
# tokenization cells below read these names.
train_df.columns = ['packet', 'label']

In [None]:
# Wrap the pandas frame in a datasets.Dataset so it can be tokenized with .map().
raw_dataset = Dataset.from_pandas(train_df)

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModel

In [None]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
checkpoint = 'distilbert-base-cased'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_batch(batch):
    """Tokenize the 'packet' entries of a batch with the module-level tokenizer.

    Args:
        batch: Mapping that provides the batch's texts under the 'packet' key.

    Returns:
        The tokenizer's output for those texts, produced with truncation enabled.
    """
    packets = batch['packet']
    encoded = tokenizer(packets, truncation=True)
    return encoded

In [None]:
# Run the tokenizer over the whole dataset, handing it batches of rows at a time.
tokenized_datasets = raw_dataset.map(tokenize_batch, batched=True)

In [None]:
# Rename the target column to 'labels', then expose only the listed columns,
# formatted as torch tensors.
dataset_train_tokenized = tokenized_datasets.rename_column(
    original_column_name="label", new_column_name="labels"
)
dataset_train_tokenized.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

In [None]:
# Build the validation set from the held-out test frame. Deriving it from
# `tokenized_datasets` (the tokenized *training* rows) would make validation
# identical to training and render any evaluation metric meaningless.
val_df = test_df.assign(label=test_df['attack_cat'].map(target_map))
val_df = val_df.rename(columns={'packet_dat': 'packet'})[['packet', 'label']]

# target_map is built from the training categories only; fail loudly if the test
# split contains a category that could not be encoded.
assert val_df['label'].notna().all(), "test_df contains categories missing from target_map"

# Same pipeline as the training set: tokenize, rename the target column, and
# expose only the model inputs as torch tensors.
dataset_val_tokenized = (
    Dataset.from_pandas(val_df)
    .map(function=tokenize_batch, batched=True)
    .rename_column("label", "labels")
)
dataset_val_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
from datasets.filesystems import S3FileSystem

In [None]:
s3 = S3FileSystem()

# S3 prefixes that the training job reads its 'train' and 'test' channels from.
training_input_path = f's3://{sess.default_bucket()}/MLOps-Project/Dataset/train'

val_input_path = f's3://{sess.default_bucket()}/MLOps-Project/Dataset/val'

# Upload the tokenized datasets to those prefixes. Without this step nothing in
# the notebook writes the data that the estimator's fit() call is pointed at.
# The s3:// URLs are passed directly, so the write goes through the S3 support
# installed with "datasets[s3]".
dataset_train_tokenized.save_to_disk(training_input_path)
dataset_val_tokenized.save_to_disk(val_input_path)

In [None]:
from sagemaker.huggingface import HuggingFace

In [None]:
# Configure the Hugging Face training job: train.py runs on a single
# ml.p3.2xlarge instance inside the container selected by the
# transformers / PyTorch / Python versions below.
huggingface_estimator = HuggingFace(
                            entry_point='train.py',
                            output_path=f's3://{sess.default_bucket()}',
                            base_job_name='huggingface-sdk-extension',
                            instance_type='ml.p3.2xlarge',
                            instance_count=1,
                            transformers_version='4.26.0',
                            pytorch_version='1.13.1',
                            py_version='py39',
                            role=role,
                            # Reuse the session configured with the project bucket
                            # and prefix; otherwise the estimator creates its own
                            # default session and ignores that configuration.
                            sagemaker_session=sess
                        )

In [None]:
# Job name: fixed prefix plus the current Unix time in whole seconds.
training_job_name = f'MLOps-Project-{int(time.time())}'

# Start training with the 'train' and 'test' channels bound to their S3 prefixes;
# wait=True keeps the cell blocked until the job ends.
huggingface_estimator.fit(
    inputs={'train': training_input_path, 'test': val_input_path},
    wait=True,
    job_name=training_job_name,
)

In [None]:
# Endpoint name: fixed prefix plus the current Unix time in whole seconds.
deploy_endpoint_name = f"MLOps-Project-Endpoint-{int(time.time())}"

# Deploy the trained model behind a real-time endpoint on one ml.g4dn.xlarge instance.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
    endpoint_name=deploy_endpoint_name,
)

In [None]:
# Send the first 512 characters of one packet string to the endpoint as a smoke test.
sample_packet = train_df.iloc[1178]['packet']
predictor.predict({"inputs": sample_packet[:512]})

[{'label': 'Worms', 'score': 0.6542378067970276}]

In [None]:
import boto3
import json

In [None]:
# Low-level runtime client: invokes the endpoint without the SageMaker SDK predictor.
sagemaker_runtime = boto3.client('sagemaker-runtime')

# Target the endpoint created by the deploy cell. Its name embeds a timestamp, so
# a hard-coded literal would point at a stale endpoint after every re-run.
endpoint_name = predictor.endpoint_name

# The text column was renamed from 'packet_dat' to 'packet' earlier in the
# notebook, so indexing 'packet_dat' here raises KeyError.
input_data = {"inputs": train_df.iloc[1178]['packet'][:512]}

input_json = json.dumps(input_data)

response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=input_json,
    ContentType='application/json'  # Specify the content type of your input data
)

# The response body is a stream; read it fully and decode it to a JSON string.
prediction = response['Body'].read().decode()

In [None]:
# Display the decoded response body returned by the endpoint.
prediction

'[{"label":"Fuzzers","score":0.7525674104690552}]'