In [40]:
import numpy as np  # Import NumPy for numerical computing
import pandas as pd  # Import Pandas for data manipulation and analysis
import seaborn as sns  # Import Seaborn for statistical graphics
sns.set(style='darkgrid')  # Set the style for Seaborn plots


In [41]:
# Load the sampled dataset from CSV and sanity-check what was read
data = pd.read_csv('sampled_data.csv')
print(type(data))  # Should output <class 'pandas.core.frame.DataFrame'>
print(data.shape)  # (rows, columns); the recorded run printed (200000, 12)

<class 'pandas.core.frame.DataFrame'>
(200000, 12)


In [42]:
# Report the frame's dimensions and the total number of cells
rows, cols = data.shape

summary_lines = (
    'New dimension:',
    f'Number of rows: {rows}',
    f'Number of columns: {cols}',
    f'Total cells: {rows * cols}',
)
print(*summary_lines, sep='\n')

New dimension:
Number of rows: 200000
Number of columns: 12
Total cells: 2400000


In [43]:
# Map every column label to the same label without leading/trailing whitespace,
# then apply the mapping to the frame in place
col_names = dict(zip(data.columns, (label.strip() for label in data.columns)))
data.rename(columns=col_names, inplace=True)

In [44]:
# Display the column labels after the whitespace cleanup
data.columns

Index(['userAgent', 'eventName', 'awsRegion', 'eventVersion',
       'userIdentitytype', 'userIdentityaccountId', 'userIdentityprincipalId',
       'userIdentityarn', 'userIdentityaccessKeyId', 'userIdentityuserName',
       'errorCode', 'requestParametersinstanceType'],
      dtype='object')

In [45]:
# Summarize each column's dtype and non-null count
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 12 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   userAgent                      200000 non-null  object 
 1   eventName                      200000 non-null  object 
 2   awsRegion                      200000 non-null  object 
 3   eventVersion                   200000 non-null  float64
 4   userIdentitytype               200000 non-null  object 
 5   userIdentityaccountId          200000 non-null  object 
 6   userIdentityprincipalId        200000 non-null  object 
 7   userIdentityarn                200000 non-null  object 
 8   userIdentityaccessKeyId        200000 non-null  object 
 9   userIdentityuserName           200000 non-null  object 
 10  errorCode                      155914 non-null  object 
 11  requestParametersinstanceType  200000 non-null  object 
dtypes: float64(1), object(11)
memo

In [46]:
# Preview the first ten rows
data.head(10)

Unnamed: 0,userAgent,eventName,awsRegion,eventVersion,userIdentitytype,userIdentityaccountId,userIdentityprincipalId,userIdentityarn,userIdentityaccessKeyId,userIdentityuserName,errorCode,requestParametersinstanceType
0,Boto3,RunInstances,eu-west-3,1.05,IAMUser,812000000000.0,AIDADO2GQD0K8TEF7KW1V,arn:aws:iam::811596193553:user/Level6,ASIAZ6XVDR2PA984G89L,Level6,Client.RequestLimitExceeded,m3.medium
1,Boto3,RunInstances,us-east-2,1.05,IAMUser,812000000000.0,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,AKIA01U43UX3RBRDXF4Q,backup,Client.RequestLimitExceeded,r5d.2xlarge
2,Boto3,RunInstances,ap-northeast-1,1.05,IAMUser,812000000000.0,AIDADO2GQD0K8TEF7KW1V,arn:aws:iam::811596193553:user/Level6,AKIA3Z2XBVUDFQ9TU4MD,Level6,Client.RequestLimitExceeded,d2.xlarge
3,Boto3,RunInstances,ap-northeast-2,1.05,IAMUser,812000000000.0,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,AKIA01U43UX3RBRDXF4Q,backup,Client.InstanceLimitExceeded,p2.16xlarge
4,Boto3,RunInstances,ap-northeast-2,1.05,IAMUser,812000000000.0,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,AKIA01U43UX3RBRDXF4Q,backup,Client.RequestLimitExceeded,m1.large
5,Boto3,RunInstances,eu-west-1,1.05,IAMUser,812000000000.0,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,AKIA01U43UX3RBRDXF4Q,backup,Client.InstanceLimitExceeded,x1.32xlarge
6,Boto3,RunInstances,ap-south-1,1.05,IAMUser,812000000000.0,AIDADO2GQD0K8TEF7KW1V,arn:aws:iam::811596193553:user/Level6,ASIAZ6XVDR2PA984G89L,Level6,Client.RequestLimitExceeded,p3.2xlarge
7,Boto3,RunInstances,ap-northeast-1,1.05,IAMUser,812000000000.0,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,AKIA01U43UX3RBRDXF4Q,backup,Client.RequestLimitExceeded,h1.16xlarge
8,Boto3,RunInstances,eu-west-2,1.05,IAMUser,812000000000.0,AIDADO2GQD0K8TEF7KW1V,arn:aws:iam::811596193553:user/Level6,AKIA3Z2XBVUDFQ9TU4MD,Level6,Client.UnauthorizedOperation,r5d.4xlarge
9,Boto3,RunInstances,eu-west-1,1.05,IAMUser,812000000000.0,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,ASIARF55FBMFZBXLKDFW,backup,Client.UnauthorizedOperation,c5d.9xlarge


## Preprocessing

Missing values

In [47]:
# Show the rows whose errorCode is missing
data[data['errorCode'].isna()]

Unnamed: 0,userAgent,eventName,awsRegion,eventVersion,userIdentitytype,userIdentityaccountId,userIdentityprincipalId,userIdentityarn,userIdentityaccessKeyId,userIdentityuserName,errorCode,requestParametersinstanceType
10,aws-cli,DescribeSnapshots,eu-west-1,1.05,IAMUser,8.12E+11,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,AKIA01U43UX3RBRDXF4Q,backup,,NotApplicable
22,ec2.amazonaws.com,AssumeRole,us-west-2,1.05,AWSService,Unknown,Unknown,Unknown,Unknown,Unknown,,NotApplicable
25,support.amazonaws.com,AssumeRole,ap-northeast-1,1.05,AWSService,Unknown,Unknown,Unknown,Unknown,Unknown,,NotApplicable
27,Boto3,AssumeRole,us-east-1,1.05,IAMUser,8.12E+11,AIDAQ9SRE1SZZZ4Q4UOJH,arn:aws:iam::811596193553:user/SecurityMokey,AKIAXJ0III4JBKZDXGXN,SecurityMokey,,NotApplicable
28,aws-cli,ListUsers,us-east-1,1.02,IAMUser,8.12E+11,AIDADO2GQD0K8TEF7KW1V,arn:aws:iam::811596193553:user/Level6,AKIA3Z2XBVUDFQ9TU4MD,Level6,,NotApplicable
...,...,...,...,...,...,...,...,...,...,...,...,...
199966,Boto3,ListEntitiesForPolicy,us-east-1,1.05,IAMUser,8.12E+11,AIDADO2GQD0K8TEF7KW1V,arn:aws:iam::811596193553:user/Level6,AKIA3Z2XBVUDFQ9TU4MD,Level6,,NotApplicable
199970,Boto3,ListUserPolicies,us-east-1,1.05,IAMUser,8.12E+11,AIDADO2GQD0K8TEF7KW1V,arn:aws:iam::811596193553:user/Level6,AKIA3Z2XBVUDFQ9TU4MD,Level6,,NotApplicable
199982,aws-cli,GetCallerIdentity,us-east-1,1.05,IAMUser,8.12E+11,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,AKIA01U43UX3RBRDXF4Q,backup,,NotApplicable
199986,aws-cli,DescribeSnapshots,us-east-1,1.05,IAMUser,8.12E+11,AIDA9BO36HFBHKGJAO9C1,arn:aws:iam::811596193553:user/backup,AKIA01U43UX3RBRDXF4Q,backup,,NotApplicable


In [48]:
# Count missing values per column and display the counts
missing_val = data.isna().sum()
missing_val       

userAgent                            0
eventName                            0
awsRegion                            0
eventVersion                         0
userIdentitytype                     0
userIdentityaccountId                0
userIdentityprincipalId              0
userIdentityarn                      0
userIdentityaccessKeyId              0
userIdentityuserName                 0
errorCode                        44086
requestParametersinstanceType        0
dtype: int64

Column names of the dataset
Numerical: eventVersion

Categorical:
userAgent                            
eventName                            
awsRegion                            
userIdentitytype                     
userIdentityaccountId                
userIdentityprincipalId              
userIdentityarn                      
userIdentityaccessKeyId              
userIdentityuserName                 
errorCode

Label: requestParametersinstanceType

number of unique values
userAgent                         44
eventName                        818
awsRegion                         16
eventVersion                       6
userIdentitytype                   4
userIdentityaccountId             13
userIdentityprincipalId           26
userIdentityarn                   15
userIdentityaccessKeyId          309
userIdentityuserName               6
errorCode                         73
requestParametersinstanceType      2

Type of each column

userAgent                         object
eventName                         object
awsRegion                         object
eventVersion                     float64
userIdentitytype                  object
userIdentityaccountId             object
userIdentityprincipalId           object
userIdentityarn                   object
userIdentityaccessKeyId           object
userIdentityuserName              object
errorCode                         object
requestParametersinstanceType      int64

Remove rows with null values

In [49]:
# Drop every row that contains at least one missing value, then show the new shape.
# NOTE(review): the per-column null counts show errorCode as the only column with
# nulls, so this removes every event that has no error code — confirm that
# discarding those events is intended rather than filling a placeholder.
data = data.dropna() 
data.shape

(155914, 12)

In [51]:
# List the distinct values of the requestParametersinstanceType column
data['requestParametersinstanceType'].unique()

array(['m3.medium', 'r5d.2xlarge', 'd2.xlarge', 'p2.16xlarge', 'm1.large',
       'x1.32xlarge', 'p3.2xlarge', 'h1.16xlarge', 'r5d.4xlarge',
       'c5d.9xlarge', 'NotApplicable', 'f1.16xlarge', 'm5d.large',
       'h1.8xlarge', 'c3.8xlarge', 'r3.4xlarge', 'x1e.8xlarge',
       'p2.8xlarge', 'z1d.3xlarge', 'm5d.8xlarge', 'r5d.12xlarge',
       'm3.2xlarge', 'p2.xlarge', 'r5.large', 'p3dn.24xlarge', 'c5.metal',
       'c3.2xlarge', 'c5d.18xlarge', 'r5d.metal', 'r4.8xlarge',
       'c3.large', 'i3.16xlarge', 'i3.4xlarge', 'm5.xlarge', 'i2.4xlarge',
       'h1.4xlarge', 'g2.2xlarge', 'm1.medium', 'z1d.12xlarge',
       'x1e.4xlarge', 'r4.2xlarge', 'm5.large', 'r3.2xlarge',
       'r5.2xlarge', 'c5.2xlarge', 'z1d.large', 'm5d.metal', 'r4.4xlarge',
       'c5d.2xlarge', 'x1e.2xlarge', 'm5d.16xlarge', 'm2.xlarge',
       'hs1.8xlarge', 'r5.xlarge', 'r3.xlarge', 'i3.metal', 'r5.metal',
       'r4.large', 'g3.16xlarge', 'r5d.xlarge', 'c5d.xlarge', 'c4.large',
       'm4.16xlarge', 'z1d.6xlarge

In [30]:
# Collapse the instance-type column into a binary label:
#   0 -> the value is 'NotApplicable', 1 -> any other value.
# The dtype guard makes the cell safe to re-run: once the column holds integers,
# comparing against 'NotApplicable' again would turn every 0 into 1.
# `assign` builds a new frame instead of writing into the result of `dropna()`,
# which avoids pandas' SettingWithCopyWarning.
label_col = 'requestParametersinstanceType'
if not pd.api.types.is_integer_dtype(data[label_col]):
    data = data.assign(
        **{label_col: data[label_col].ne('NotApplicable').astype('int64')}
    )


In [31]:
# Count the distinct values in each column
data.nunique()

userAgent                         44
eventName                        818
awsRegion                         16
eventVersion                       6
userIdentitytype                   4
userIdentityaccountId             13
userIdentityprincipalId           26
userIdentityarn                   15
userIdentityaccessKeyId          309
userIdentityuserName               6
errorCode                         73
requestParametersinstanceType      2
dtype: int64

In [32]:
# Show the dtype of each column
data.dtypes

userAgent                         object
eventName                         object
awsRegion                         object
eventVersion                     float64
userIdentitytype                  object
userIdentityaccountId             object
userIdentityprincipalId           object
userIdentityarn                   object
userIdentityaccessKeyId           object
userIdentityuserName              object
errorCode                         object
requestParametersinstanceType      int64
dtype: object

In [39]:
# Count how many rows carry each label value
data["requestParametersinstanceType"].value_counts()

requestParametersinstanceType
1    136889
0     19025
Name: count, dtype: int64

# Now let's train the model
### Split the data and prepare the train/validation sets

In [33]:
from torch.utils.data import Dataset, DataLoader
import torch

from torch.utils.data import Dataset
import torch

class MixedDataDataset(Dataset):
    """Dataset that serves numerical features, categorical codes and labels together.

    Args:
        numerical_data: Array-like of numerical features, indexed by sample along
            its first axis; stored as a float32 tensor.
        categorical_data: pandas DataFrame with one column per categorical
            feature; each column is stored as its own int64 tensor.
        labels: pandas Series (or any array-like accepted by ``torch.tensor``)
            with one label per sample; stored as a float32 column of shape (n, 1).

    Raises:
        ValueError: If the inputs do not all contain the same number of samples.
    """

    def __init__(self, numerical_data, categorical_data, labels):
        self.numerical_data = torch.tensor(numerical_data, dtype=torch.float32)

        # One tensor per categorical column, in the DataFrame's column order
        self.categorical_data = [
            torch.tensor(categorical_data[col].values, dtype=torch.long)
            for col in categorical_data.columns
        ]

        # Accept pandas objects as well as plain lists / arrays
        label_values = labels.values if hasattr(labels, 'to_numpy') else labels
        self.labels = torch.tensor(label_values, dtype=torch.float32).view(-1, 1)

        # Fail early on misaligned inputs instead of raising an IndexError
        # (or silently truncating) somewhere in the middle of an epoch
        n_samples = len(self.numerical_data)
        if len(self.labels) != n_samples or any(
            len(column) != n_samples for column in self.categorical_data
        ):
            raise ValueError(
                "numerical_data, categorical_data and labels must have the same number of rows"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.numerical_data)

    def __getitem__(self, idx):
        """Return ``(numerical_features, [one code per categorical column], label)``."""
        cat_features = [column[idx] for column in self.categorical_data]
        return self.numerical_data[idx], cat_features, self.labels[idx]




In [34]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Separate features and label
X = data.drop('requestParametersinstanceType', axis=1)  # every column except the label
y = data['requestParametersinstanceType'].astype(int)   # binary label (0 or 1)

# Identify categorical and numerical columns
numerical_cols = ['eventVersion']
categorical_cols = [
    'userAgent', 'eventName', 'awsRegion', 'userIdentitytype', 'userIdentityaccountId',
    'userIdentityprincipalId', 'userIdentityarn', 'userIdentityaccessKeyId', 'userIdentityuserName', 'errorCode'
]

# Integer-encode the categorical columns on the full frame: the embedding tables
# are sized from these codes, so every code seen at validation time must exist.
# DataFrame.apply calls fit_transform once per column, re-fitting the encoder each time.
encoder = LabelEncoder()
X_cat = X[categorical_cols].apply(encoder.fit_transform)

# Train/validation split (80/20), stratified on the label because the two
# classes are imbalanced (see the value_counts output earlier in the notebook)
X_num_raw_train, X_num_raw_val, X_cat_train, X_cat_val, y_train, y_val = train_test_split(
    X[numerical_cols], X_cat, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the numerical features. The scaler is fitted on the training rows
# only, so validation statistics do not leak into training.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_raw_train)
X_num_val = scaler.transform(X_num_raw_val)

# Scaled numerical matrix for all rows, kept under its original name because
# the model-setup cell reads X_num.shape[1]
X_num = scaler.transform(X[numerical_cols])

# Prepare datasets and dataloaders
batch_size = 32

# Each dataset item is (numerical features, list of categorical codes, label)
train_dataset = MixedDataDataset(X_num_train, X_cat_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = MixedDataDataset(X_num_val, X_cat_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [35]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    """Binary classifier that combines numerical features with categorical embeddings.

    Args:
        num_input_features: Number of numerical input features.
        categorical_data: pandas DataFrame of integer-coded categorical columns;
            the number of distinct values in each column sets the size of that
            column's embedding table.
        embedding_dim: Width of every categorical embedding (default 16).
    """

    def __init__(self, num_input_features, categorical_data, embedding_dim=16):
        super().__init__()

        # Number of distinct codes in each categorical column
        num_categories_per_feature = [
            len(categorical_data[col].unique()) for col in categorical_data.columns
        ]

        # Branch for the numerical features
        self.num_fc = nn.Sequential(
            nn.Linear(num_input_features, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.5)
        )

        # One embedding table per categorical feature
        self.cat_fc = nn.ModuleList([
            nn.Embedding(num_categories, embedding_dim) for num_categories in num_categories_per_feature
        ])

        # The input width must follow embedding_dim; a hard-coded 16 here breaks
        # forward() for any other embedding size
        combined_width = 64 + embedding_dim * len(num_categories_per_feature)
        self.fc_combined = nn.Sequential(
            nn.Linear(combined_width, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 1)  # Output 1 logit for binary classification
        )

    def forward(self, num_data, cat_data):
        """Return one logit per sample.

        Args:
            num_data: Tensor of numerical features fed to the numerical branch.
            cat_data: Sequence of index tensors, one per embedding table, in the
                same order as the columns used to build the model.
        """
        num_out = self.num_fc(num_data)

        # Look up each categorical feature in its own embedding table
        cat_out = [embedding(cat_data[i]) for i, embedding in enumerate(self.cat_fc)]
        cat_out = torch.cat(cat_out, dim=1)  # Concatenate all categorical embeddings

        # Combine numerical and categorical representations
        combined = torch.cat((num_out, cat_out), dim=1)

        return self.fc_combined(combined)



In [36]:
from tqdm import tqdm

# Training function with tqdm
def train(model, train_loader, criterion, optimizer, device, epoch, num_epochs):
    """Run one training epoch and report the mean batch loss and the accuracy.

    Args:
        model: Module called as ``model(num_batch, cat_batch)``; expected to return logits.
        train_loader: Iterable yielding ``(num_batch, cat_batch, labels_batch)``,
            where ``cat_batch`` is a sequence of tensors.
        criterion: Loss called as ``criterion(outputs, labels_batch)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        epoch: Zero-based index of the current epoch (used for display only).
        num_epochs: Total number of epochs (used for display only).

    Returns:
        Tuple ``(epoch_loss, epoch_accuracy)``: mean loss per batch and accuracy
        in percent. Both are NaN when the loader yields no batches.
    """
    model.train()  # Set the model to training mode
    running_loss = 0.0
    correct = 0
    total = 0
    batches_seen = 0
    with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch") as tepoch:
        for batches_seen, (num_batch, cat_batch, labels_batch) in enumerate(tepoch, start=1):
            # Move the batch to the target device
            num_batch = num_batch.to(device)
            cat_batch = [c.to(device) for c in cat_batch]
            labels_batch = labels_batch.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(num_batch, cat_batch)

            # Calculate loss
            loss = criterion(outputs, labels_batch)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Update running loss
            running_loss += loss.item()

            # Convert logits to binary predictions (sigmoid, then round)
            predictions = torch.sigmoid(outputs).round()
            total += labels_batch.size(0)
            correct += (predictions == labels_batch).sum().item()

            # Show the mean loss over the batches processed so far; dividing by
            # the total batch count would under-report it until the last batch
            tepoch.set_postfix(loss=running_loss / batches_seen, accuracy=100 * correct / total)

    # Epoch statistics; guarded so an empty loader does not raise ZeroDivisionError
    epoch_loss = running_loss / batches_seen if batches_seen else float('nan')
    epoch_accuracy = 100 * correct / total if total else float('nan')
    print(f"Epoch {epoch+1} - Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    return epoch_loss, epoch_accuracy

# Evaluation function with tqdm
def evaluate(model, val_loader, criterion, device):
    """Evaluate the model on a loader without updating its parameters.

    Args:
        model: Module called as ``model(num_batch, cat_batch)``; expected to return logits.
        val_loader: Iterable yielding ``(num_batch, cat_batch, labels_batch)``,
            where ``cat_batch`` is a sequence of tensors.
        criterion: Loss called as ``criterion(outputs, labels_batch)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(val_loss, val_accuracy)``: mean loss per batch and accuracy in
        percent. Both are NaN when the loader yields no batches.
    """
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0
    correct = 0
    total = 0
    batches_seen = 0
    with torch.no_grad():  # No need to calculate gradients during evaluation
        with tqdm(val_loader, desc="Evaluating", unit="batch") as tepoch:
            for batches_seen, (num_batch, cat_batch, labels_batch) in enumerate(tepoch, start=1):
                # Move the batch to the target device
                num_batch = num_batch.to(device)
                cat_batch = [c.to(device) for c in cat_batch]
                labels_batch = labels_batch.to(device)

                # Forward pass
                outputs = model(num_batch, cat_batch)

                # Calculate loss
                loss = criterion(outputs, labels_batch)

                # Update running loss
                running_loss += loss.item()

                # Convert logits to binary predictions (sigmoid, then round)
                predictions = torch.sigmoid(outputs).round()
                total += labels_batch.size(0)
                correct += (predictions == labels_batch).sum().item()

                # Show the mean loss over the batches processed so far; dividing by
                # the total batch count would under-report it until the last batch
                tepoch.set_postfix(loss=running_loss / batches_seen, accuracy=100 * correct / total)

    # Evaluation statistics; guarded so an empty loader does not raise ZeroDivisionError
    val_loss = running_loss / batches_seen if batches_seen else float('nan')
    val_accuracy = 100 * correct / total if total else float('nan')
    print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.2f}%")

    return val_loss, val_accuracy


In [37]:
import torch.optim as optim

# Seed PyTorch's global RNG so weight initialisation, dropout and DataLoader
# shuffling are repeatable after Restart & Run All (42 matches the split's random_state)
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model parameters
num_input_features = X_num.shape[1]  # number of numerical feature columns

# Instantiate the model; X_cat supplies the embedding-table sizes
model = MyModel(num_input_features, X_cat)
model.to(device)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # operates on raw logits for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [38]:
# Number of epochs for training
num_epochs = 2

# Per-epoch (validation loss, validation accuracy), kept so the metrics can be
# inspected or plotted after training instead of being discarded
val_history = []

# NOTE(review): in the sample rows displayed earlier, a concrete instance type
# appears only on RunInstances events, so `eventName` alone may determine the
# label. Near-perfect validation accuracy would then reflect that shortcut
# rather than a learned pattern — confirm before interpreting the score.
for epoch in range(num_epochs):
    train(model, train_loader, criterion, optimizer, device, epoch, num_epochs)

    # Evaluate after each epoch
    val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
    val_history.append((val_loss, val_accuracy))


Epoch 1/2: 100%|██████████| 3898/3898 [01:11<00:00, 54.44batch/s, accuracy=99.9, loss=0.00408]


Epoch 1 - Loss: 0.0041, Accuracy: 99.87%


Evaluating: 100%|██████████| 975/975 [00:19<00:00, 51.01batch/s, accuracy=100, loss=0.00037] 


Validation - Loss: 0.0004, Accuracy: 100.00%


Epoch 2/2: 100%|██████████| 3898/3898 [01:19<00:00, 49.02batch/s, accuracy=100, loss=0.000227]


Epoch 2 - Loss: 0.0002, Accuracy: 99.99%


Evaluating: 100%|██████████| 975/975 [00:12<00:00, 78.18batch/s, accuracy=100, loss=0.000486]

Validation - Loss: 0.0005, Accuracy: 99.99%



