In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
# from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')

# File Path (Update with your actual dataset path)
# file_path = '/content/drive/MyDrive/Advanced _Threat_Detection/UNSW-NB15/CSV_Files/training_testing_data/'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train_path = '/content/drive/MyDrive/Advanced _Threat_Detection/UNSW-NB15/CSV_Files/Training_and_Testing_Sets/Training_Dataset.csv'
test_path = '/content/drive/MyDrive/Advanced _Threat_Detection/UNSW-NB15/CSV_Files/Training_and_Testing_Sets/Testing_Dataset.csv'


train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

## **Analysis of the Customized Correlation Function**


#### 1. **`calculate_correlation`**:
   - **Purpose**:
     - Computes and prints the Pearson correlation between `col1` and `col2` in the given DataFrame (`df`), as well as the log-transformed correlation of `col1` with `col2`.
   - **Steps**:
     - Uses `DataFrame.corr()` to compute the Pearson correlation coefficient between the two specified columns.
     - Applies a log transformation (`np.log1p`) to `col1` and then calculates its correlation with `col2`.
     - Prints both raw and log-transformed correlations.
   - **Use Case**:
     - Useful for exploratory analysis when you want to observe how log-transformation impacts the relationship between a feature and the target.

   - **Limitations**:
     - Does not handle potential issues like invalid log transformations (e.g., `np.log1p` on non-positive values).
     - Fixed to `label` as the default `col2`, limiting flexibility for non-target correlations.
     - Only prints the results; doesn't return them for further programmatic use.

---

#### 2. **`get_correlation`**:
   - **Purpose**:
     - Computes and returns the Pearson correlation between `col1` and `col2` in the given DataFrame (`df`).
   - **Steps**:
     - Directly computes the Pearson correlation using `DataFrame.corr()`.
     - Returns the result instead of printing it.
   - **Use Case**:
     - Ideal for when you need correlation values programmatically for further analysis or comparisons.

   - **Limitations**:
     - Unlike `calculate_correlation`, it does not compute the log-transformed correlation, which could provide additional insights.


In [None]:
def calculate_correlation(col1, col2='label', df=train):

  """
  Calculate the correlation between two given features. Also calculate the correlation of col1
  """

  corr = df[[col1, col2]].corr().iloc[0,1]
  log_corr = df[col1].apply(np.log1p).corr(df[col2])
  print(f"Correlation: {corr}\nlog_Correlation: {log_corr}")

def get_correlation(col1, col2='label', df=train):
  """
  Calculate and return the correlation between two features.
  """

  return df[[col1, col2]].corr().iloc[0,1]



In [None]:
numeric_columns = train.select_dtypes(include=[np.number])
corr_matrix = numeric_columns.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(to_drop)

saved_dict = {}

saved_dict['corr_col'] =  to_drop


train.drop(columns=to_drop, inplace=True)
# test = test.drop(columns=to_drop)

print(train.shape, test.shape)

['sloss', 'dloss', 'dpkts', 'dwin', 'ltime', 'ct_srv_dst', 'ct_src_dport_ltm', 'ct_dst_src_ltm']
(1778032, 41) (762015, 49)


## **Adding New Features**

In [None]:
if 'network_bytes' not in train.columns:
  train['network_bytes'] = train['sbytes'] + train['dbytes']
print(train.shape, test.shape)

(1778032, 42) (762015, 49)


In [None]:
train.drop(['srcip', 'sport', 'dstip', 'dsport', 'attack_cat'], axis=1, inplace=True)

In [None]:
# To use during test data transformation
saved_dict['to_drop'] = ['srcip', 'sport', 'dstip', 'dsport', 'attack_cat']

In [None]:
train.shape, test.shape

((1778032, 37), (762015, 49))

## **Implementing Log Scales On Numerical Columns**

In [None]:
col_unique_values = train.nunique()
col = col_unique_values[col_unique_values>200].index

col

Index(['dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'stcpb', 'dtcpb',
       'smeansz', 'dmeansz', 'res_bdy_len', 'sjit', 'djit', 'stime', 'sintpkt',
       'dintpkt', 'tcprtt', 'synack', 'ackdat', 'network_bytes'],
      dtype='object')

In [None]:
for column in col:
  print("{:-^30}".format(column))
  calculate_correlation(column)

-------------dur--------------
Correlation: 0.0019274028701131475
log_Correlation: -0.032544137564606564
------------sbytes------------
Correlation: 0.010344749695229565
log_Correlation: -0.3561631555898454
------------dbytes------------
Correlation: -0.07641408324436148
log_Correlation: -0.5193868283741543
------------sload-------------
Correlation: 0.19211948100086756
log_Correlation: 0.34746601450349657
------------dload-------------
Correlation: -0.21978094390126515
log_Correlation: -0.6033545881626403
------------spkts-------------
Correlation: -0.12200425437154418
log_Correlation: -0.3163533826967569
------------stcpb-------------
Correlation: -0.23365153315010911
log_Correlation: -0.31355632221428964
------------dtcpb-------------
Correlation: -0.23346071773809843
log_Correlation: -0.31340064798121187
-----------smeansz------------
Correlation: -0.06517990378993671
log_Correlation: -0.15111450989648462
-----------dmeansz------------
Correlation: -0.27230605607442226
log_Correlat

- **Positive Correlations**:
  - Features like `stime`, `sload`, `tcprtt`, `synack`, and `ackdat` show a direct relationship with the target. This suggests:
    - These metrics can help identify legitimate network activity patterns.
    - Attack behaviors may manipulate these values to resemble normal patterns (e.g., adjusting SYN-ACK times or payload sizes).

- **Negative Correlations**:
  - Features like `dload`, `dmeansz`, `stcpb`, and `dtcpb` show an inverse relationship. This indicates:
    - Lower destination data loads, mean sizes, and TCP buffer-related metrics could hint at malicious activity, such as data exfiltration or anomalous network communication.
    - Advanced Persistent Threats (APTs) often exhibit minimal resource usage to avoid detection.

- **Weak Correlations**:
  - Features like `dur`, `res_bdy_len`, `sintpkt`, and `dintpkt` have minimal correlation, meaning:
    - These metrics may not significantly differentiate between benign and malicious traffic.
    - They could be less critical for advanced threat detection or require further feature engineering for utility.

- **Log-Transformed Insights**:
  - **Improved Linear Relationships**: Log transformation enhances correlation for features like `sload`, indicating better anomaly detection in logarithmic space.
  - **Reduced Outlier Impact**: Makes correlations with features like `dload` and `dmeansz` more pronounced, improving the detection of APT behaviors that involve subtle deviations.

- **Key Patterns for Advanced Threats**:
  - Temporal metrics (`stime`, `tcprtt`, `synack`) indicate timing anomalies that are often associated with attacks.
  - Asymmetric behaviors between source and destination traffic (`sload`, `dload`) may signify malicious intent, such as unidirectional data flows in attacks.
  - Features like `stcpb` and `dtcpb` reflect unusual TCP buffer activities, common in APT command-and-control communications.

In [None]:
log1p_col = ['dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'sjit', 'djit', 'network_bytes']


In [None]:
saved_dict['log1p_col'] = log1p_col
mode_dict = train.mode().iloc[0].to_dict()

In [None]:
def transform_log(col, df=train):
  new_column = col + '_log1p'
  df[new_column] = df[col].apply(np.log1p)
  df.drop(col, axis=1, inplace=True)

for col in log1p_col:
  transform_log(col, df=train)

In [None]:
train.shape

(1778032, 37)

In [None]:
train.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'label', 'dur_log1p', 'sbytes_log1p',
       'dbytes_log1p', 'sload_log1p', 'dload_log1p', 'spkts_log1p',
       'stcpb_log1p', 'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p',
       'sjit_log1p', 'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [None]:
# Making X and Y training and testing sets from the dataset

x_train, y_train = train.drop(columns=['label']), train['label']
x_test, y_test = test.drop(columns=['label']), test['label']
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(1778032, 36) (1778032,)
(762015, 48) (762015,)


In [None]:
# pickle.dump(x_train, y_train)

In [None]:
category_col = ['proto', 'service', 'state']
numeric_col = list(set(x_train.columns) - set(category_col))

In [None]:
saved_dict['category_col'] = category_col
saved_dict['numeric_col'] = numeric_col

In [None]:
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,INT,254,0,-,0,0,0,1421930643,33.479,...,9.2756,0.0,1.609438,0.0,0.0,3.828641,0.0,3.878042,0.0,5.187386
1,udp,INT,60,0,dns,0,0,0,1424246229,0.008,...,18.698312,0.0,1.098612,0.0,0.0,4.890349,0.0,0.0,0.0,5.57973
2,tcp,FIN,31,29,-,255,0,0,1421948071,0.372205,...,14.105347,16.314201,3.713572,20.196135,21.733479,4.174387,6.313548,0.0,3.01207,10.152883
3,tcp,FIN,31,29,ftp,255,0,0,1421971944,16.14474,...,10.258074,10.501435,3.970292,21.803017,20.49442,4.043051,4.248495,7.264606,3.984562,8.806124
4,tcp,FIN,31,29,-,255,0,0,1421963050,1.2188,...,13.339317,13.412088,2.833213,20.673269,21.855078,4.574711,4.521789,4.309533,1.138118,8.066208


In [None]:
x_train['service'].isnull().sum()

0

In [None]:

# Define the file path
file_path = '/content/drive/MyDrive/Advanced _Threat_Detection/UNSW-NB15/CSV_Files/Training_and_Testing_Sets/'

# Saving the train and test datasets
pickle.dump((x_train, y_train), open(file_path + 'final_train.pkl', 'wb'))
pickle.dump((x_test, y_test), open(file_path + 'final_test.pkl', 'wb'))


## **Standardizing the Data**

Standardization is a cornerstone of data preprocessing, particularly for models sensitive to feature scales. Empirical studies demonstrate its significant impact on model performance, convergence speed, and interpretability. However, care must be taken to handle outliers and evaluate its necessity based on the specific algorithm and dataset.

### **Key Observations**:

- **Standardization Impact**:
  - After scaling, the transformed dataset (`x_train[numeric_col]`) has its features centered around 0 with a standard deviation of 1.
  - Example:
    - **Before scaling**: Raw values like `dttl`, `sbytes_log1p`, etc., could range widely.
    - **After scaling**: Values are compressed to a standardized scale, as evident from the transformed values (e.g., `-0.717760`, `-1.052109`).

- **Effect on Outliers**:
  - Standard scaling does not remove outliers but reduces their impact by rescaling all features to the same scale.

- **Scalability**:
  - This approach works seamlessly on a large dataset because it uses column-wise operations and is memory-efficient.


### **Potential Limitations**:
1. **Outlier Sensitivity**:
   - Standard scaling can be influenced by extreme outliers because it uses the mean and standard deviation of the data.
   - Alternative scalers (e.g., RobustScaler) might be more effective in cases with significant outliers.

2. **Dependency on Training Data**:
   - The `scaler` is fit on the training data, so the mean and standard deviation of the training data might not perfectly represent the test or unseen data.


In [None]:
x_train[numeric_col] = x_train[numeric_col].replace(' ', np.nan)  # empty strings or spaces in numeric columns with NaN
x_train[numeric_col] = x_train[numeric_col].astype(float) # numeric columns to float (required for StandardScaler)
x_train[numeric_col] = x_train[numeric_col].fillna(x_train[numeric_col].mean())

scaler = StandardScaler()
scaler = scaler.fit(x_train[numeric_col])
x_train[numeric_col] = scaler.transform(x_train[numeric_col])

print(x_train[numeric_col].head())


       dttl  sbytes_log1p  ct_srv_src      swin  dmeansz_log1p  sload_log1p  \
0 -0.717760     -1.052109   -0.665012 -1.196045      -1.850553    -1.383776   
1 -0.717760     -0.813189    0.996846 -1.196045      -1.850553     1.614925   
2 -0.041365      0.563887    0.812195  0.836096       0.881301     0.153252   
3 -0.041365      0.651189   -0.757337  0.836096      -0.012242    -1.071111   
4 -0.041365      0.258853   -0.111059  0.836096       0.106012    -0.090531   

       sttl    ct_ftp_cmd  stcpb_log1p  is_sm_ips_ports  ...  trans_depth  \
0  2.561444 -2.576847e-01    -1.190007         -0.04053  ...    -0.225343   
1 -0.037542 -1.146164e-16    -1.190007         -0.04053  ...    -0.225343   
2 -0.426051 -2.576847e-01     0.741629         -0.04053  ...    -0.225343   
3 -0.426051  5.248303e+00     0.895318         -0.04053  ...    -0.225343   
4 -0.426051 -2.576847e-01     0.787264         -0.04053  ...    -0.225343   

     synack  ct_src_ltm  ct_dst_ltm  dbytes_log1p  ct_state_tt

In [None]:
x_train.head()


Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,INT,2.561444,-0.71776,-,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,...,-1.383776,-1.879995,-0.704801,-1.190007,-1.189697,-1.137341,-1.850553,0.20427,-0.899657,-1.229918
1,udp,INT,-0.037542,-0.71776,dns,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,...,1.614925,-1.879995,-1.080734,-1.190007,-1.189697,0.565988,-1.850553,-0.995343,-0.899657,-1.062092
2,tcp,FIN,-0.426051,-0.041365,-,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,...,0.153252,0.988351,0.843701,0.741629,0.888925,-0.582651,0.881301,-0.995343,0.170283,0.894089
3,tcp,FIN,-0.426051,-0.041365,ftp,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,...,-1.071111,-0.033644,1.03263,0.895318,0.77042,-0.793357,-0.012242,1.251851,0.51573,0.318009
4,tcp,FIN,-0.426051,-0.041365,-,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,...,-0.090531,0.478104,0.195816,0.787264,0.900555,0.059601,0.106012,0.337745,-0.495378,0.001507


## **One Hot Encoding**

In [None]:

service_ = OneHotEncoder()
proto_ = OneHotEncoder()
state_ = OneHotEncoder()
ohe_service = service_.fit(x_train.service.values.reshape(-1,1))
ohe_proto = proto_.fit(x_train.proto.values.reshape(-1,1))
ohe_state = state_.fit(x_train.state.values.reshape(-1,1))

for col, ohe in zip(['proto','service', 'state'], [ohe_proto, ohe_service, ohe_state]):
  x = ohe.transform(x_train[col].values.reshape(-1,1))
  tmp_df = pd.DataFrame(x.todense(), columns=[col+'_'+i for i in ohe.categories_[0]])
  x_train = pd.concat([x_train.drop(col, axis=1), tmp_df], axis=1)

In [None]:
x_train.head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,...,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,2.561444,-0.71776,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.037542,-0.71776,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,-0.054857,-0.121383,-0.107394,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,-0.044512,-0.12083,-0.107159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,-0.054358,-0.123351,-0.110421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
file_path = '/content/drive/MyDrive/Advanced _Threat_Detection/UNSW-NB15/CSV_Files/feature_engineering/'


In [None]:
x_train.head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,...,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,2.561444,-0.71776,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.037542,-0.71776,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,-0.054857,-0.121383,-0.107394,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,-0.044512,-0.12083,-0.107159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,-0.054358,-0.123351,-0.110421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
file_path = '/content/drive/MyDrive/Advanced _Threat_Detection/UNSW-NB15/CSV_Files/Training_and_Testing_Sets/'


In [None]:
pickle.dump(scaler, open(file_path+'scaler.pkl', 'wb'))  # Standard scaler
pickle.dump(saved_dict, open(file_path+'saved_dict.pkl', 'wb'))  # Dictionary with important parameters
pickle.dump(mode_dict, open(file_path+'mode_dict.pkl', 'wb'))  #

# Onehot encoder for categorical columns
pickle.dump(ohe_proto, open(file_path+'ohe_proto.pkl', 'wb'))
pickle.dump(ohe_service, open(file_path+'ohe_service.pkl', 'wb'))
pickle.dump(ohe_state, open(file_path+'ohe_state.pkl', 'wb'))

# Cleaned and processed train data
pickle.dump((x_train, y_train), open(file_path+'final_train.pkl', 'wb'))

## **Creating Pipeline Function for Parameters**

### **1. `clean_data(data)`**
#### Purpose:
- Cleans and standardizes raw input data by:
  - Handling missing (`NaN`) and invalid values (e.g., spaces, dashes).
  - Fixing column datatypes (e.g., converting columns to floats where required).
  - Ensuring binary columns (values 0 and 1) are correctly fixed if they contain erroneous values.

#### Justification:
- Raw datasets often have incomplete or incorrect data due to system errors or human input mistakes.
- Cleaning ensures that the model receives consistent and error-free data, preventing issues during training or inference.

Key operations:
- **Filling missing values**: Replaces missing/invalid values with the mode (most frequent value).
- **Standardizing binary columns**: Ensures binary columns remain within [0, 1] values.
- **Converting dtypes**: Fixes datatype mismatches to align with downstream processing.

---

### **2. `apply_log1p(data)`**
#### Purpose:
- Applies the natural logarithm transformation (`log1p`) to certain columns for normalization and scaling purposes.

#### Justification:
- In datasets where numerical features have skewed distributions, applying `log1p` (logarithm of 1 + value) reduces skewness and brings values closer to a Gaussian (normal) distribution, improving model performance.

**Key operations:**
- Handles invalid or negative values (logarithms are undefined for these) to ensure smooth execution.
- Creates new transformed columns (e.g., `feature_log1p`) and drops the original ones for simplicity.

---

### **3. `standardize(data)`**
#### Purpose:
- Standardizes numerical features using a pre-trained `StandardScaler`.

#### Justification:
- Standardization transforms numerical features to have a mean of 0 and standard deviation of 1, which is essential for many machine learning algorithms (e.g., linear regression, SVMs, neural networks).
- Ensures consistent scaling across features, avoiding bias in optimization algorithms due to differences in feature magnitudes.

**Key operations:**
- Replaces non-numeric entries with `NaN` and imputes missing values with the column mean.
- Applies the `scaler.transform` to ensure consistency between train and test data.

---

### **4. `ohencoding(data)`**
#### Purpose:
- One-hot encodes categorical columns (`proto`, `service`, `state`) using pre-trained encoders.

#### Justification:
- Machine learning algorithms require numeric inputs; categorical features need to be converted into a numeric format.
- One-hot encoding represents categories as binary columns, allowing models to differentiate them without imposing ordinal relationships.

**Key operations:**
- Uses pre-trained `OneHotEncoder` instances to ensure consistent mapping across training and test datasets.
- Drops the original categorical columns after encoding.


In [None]:
def clean_data(data):
    '''
    Cleans given raw data. Performs various cleaning, removes Null and wrong values.
    Check for columns datatype and fix them.
    '''
    numerical_col = data.select_dtypes(include=np.number).columns  # All the numerical columns list
    categorical_col = data.select_dtypes(exclude=np.number).columns  # All the categorical columns list

    # Cleaning the data
    for col in data.columns:
        val = mode_dict[col]  # Mode value of the column in train data
        data[col] = data[col].fillna(value=val)
        data[col] = data[col].replace(' ', value=val)
        data[col] = data[col].apply(lambda x:"None" if x=="-" else x)

        # Fixing binary columns
        if col in saved_dict['binary_col']:
            data[col] = np.where(data[col]>1, val, data[col])

    # Fixing datatype of columns
    bad_dtypes = list(set(categorical_col) - set(saved_dict['category_col']))
    for bad_col in bad_dtypes:
        data[col] = data[col].astype(float)

    return data

def apply_log1p(data):

  for col in saved_dict['log1p_col']:
    new_col = col + '_log1p'
    # Handle potential invalid values before applying log1p
    data[new_col] = data[col].apply(lambda x: np.log1p(x) if pd.to_numeric(x, errors='coerce') > 0 else np.nan)
    data.drop(col, axis=1, inplace=True)

  return data

def standardize(data):
  """
  Using pretrianed StandardScaler and cleans non-numeric entries before applying this
  """
  for col in saved_dict['numeric_col']:
    if col in data.columns:
      data[col] = data[col].replace(' ', np.nan)
      data[col] = pd.to_numeric(data[col], errors='coerce')
      data[col] = data[col].fillna(data[col].mean())

  data[saved_dict['numeric_col']] = scaler.transform(data[saved_dict['numeric_col']])
  return data

def ohencoding(data):

  """
  This will encode categorical columns and updates the dataset by removing the original columns.
  Assumes encoders are trained on training data
  """

  # Apply encoders
  X_proto = ohe_proto.transform(data[['proto']])
  X_service = ohe_service.transform(data[['service']])
  X_state = ohe_state.transform(data[['state']])

  data = pd.concat([
      data,
      pd.DataFrame(X_proto, columns=[f"proto_{cat}" for cat in ohe_proto.categories_[0]]),
      pd.DataFrame(X_service, columns=[f"service_{cat}" for cat in ohe_service.categories_[0]]),
      pd.DataFrame(X_state, columns=[f"state_{cat}" for cat in ohe_state.categories_[0]])
    ], axis=1)


  data.drop(['proto', 'service', 'state'], axis=1, inplace=True)

  return data



## **Savings The Pickle Files After Feature Engineering**

In [None]:
file_path = '/content/drive/MyDrive/Advanced _Threat_Detection/UNSW-NB15/CSV_Files/Training_and_Testing_Sets/'


In [None]:
# Parametrs
saved_dict = pickle.load(open(file_path+'saved_dict.pkl', 'rb'))
mode_dict = pickle.load(open(file_path+'mode_dict.pkl', 'rb'))
scaler = pickle.load(open(file_path+'scaler.pkl', 'rb'))

In [None]:
ohe_proto = pickle.load(open(file_path+'ohe_proto.pkl', 'rb'))
ohe_service = pickle.load(open(file_path+'ohe_service.pkl', 'rb'))
ohe_state = pickle.load(open(file_path+'ohe_state.pkl', 'rb'))

In [None]:
x_test.shape

(762015, 48)

In [None]:
x_test.reset_index(drop=True, inplace=True)

In [None]:
x_test.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat'],
      dtype='object')

In [None]:
saved_dict['columns'] = x_test.columns.tolist()

In [None]:
saved_dict['columns']

['srcip',
 'sport',
 'dstip',
 'dsport',
 'proto',
 'state',
 'dur',
 'sbytes',
 'dbytes',
 'sttl',
 'dttl',
 'sloss',
 'dloss',
 'service',
 'sload',
 'dload',
 'spkts',
 'dpkts',
 'swin',
 'dwin',
 'stcpb',
 'dtcpb',
 'smeansz',
 'dmeansz',
 'trans_depth',
 'res_bdy_len',
 'sjit',
 'djit',
 'stime',
 'ltime',
 'sintpkt',
 'dintpkt',
 'tcprtt',
 'synack',
 'ackdat',
 'is_sm_ips_ports',
 'ct_state_ttl',
 'ct_flw_http_mthd',
 'is_ftp_login',
 'ct_ftp_cmd',
 'ct_srv_src',
 'ct_srv_dst',
 'ct_dst_ltm',
 'ct_src_ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'attack_cat']

## **Creating New Feature: Network Bytes**

In cybersecurity, Network Bytes (including `source bytes` and `destination` bytes) are crucial for several reasons:

**Traffic Analysis:** By examining the bytes transferred between a source and destination, analysts can identify patterns of normal and abnormal traffic. This helps in detecting potential security threats or anomalies.

**Intrusion Detection:** `Network Intrusion Detection Systems` (NIDS) use metrics like source bytes and destination bytes to identify suspicious activities. For example, an unusually high amount of data being sent to an unknown destination might indicate a data exfiltration attempt.

**Performance Monitoring:** Monitoring network bytes helps in assessing the performance and health of the network. It can reveal issues like network congestion or bottlenecks, which might affect overall system performance.

**Forensic Analysis:** In the event of a security breach, analyzing network bytes can help forensic investigators trace the source and extent of the attack, providing valuable insights into how the breach occurred and what data might have been compromised.

In [None]:
x_test['network_bytes'] = x_test['dbytes'] + x_test['sbytes']

In [None]:
dropable_col = [col for col in (saved_dict['to_drop'] + saved_dict['corr_col']) if col in x_test.columns]
x_test.drop(columns=dropable_col, inplace=True)

In [None]:
x_test.shape

(762015, 36)

In [None]:
# Identify binary columns in the training data
binary_columns = [col for col in x_test.columns if x_test[col].nunique() == 2]
saved_dict['binary_col'] = binary_columns

In [None]:
# Cleaning data using clean_data()
x_test = clean_data(x_test)

# FE: applying log1p using apply_log1p()
x_test = apply_log1p(x_test)

In [None]:
x_test.shape

(762015, 36)

In [None]:
x_test.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'dur_log1p', 'sbytes_log1p', 'dbytes_log1p',
       'sload_log1p', 'dload_log1p', 'spkts_log1p', 'stcpb_log1p',
       'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p', 'sjit_log1p',
       'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [None]:
for col in x_test.columns:
  if x_test[col].dtypes == 'object':
    print(f"Column '{col}' has non-numeric values: {x_test[col].unique()}")

x_test.replace(' ', np.nan, inplace=True)

for col in saved_dict['numeric_col']:
  if col in x_test.columns:
    x_test[col] = pd.to_numeric(x_test[col], errors='coerce')
    x_test[col] = x_test[col].fillna(x_test[col].mean())

Column 'proto' has non-numeric values: ['udp' 'tcp' 'unas' 'arp' 'argus' 'pri-enc' 'sm' 'irtp' 'ib' 'crudp'
 'ipx-n-ip' 'uti' 'mobile' 'qnx' 'gre' 'snp' 'encap' 'ospf' 'sprite-rpc'
 'stp' 'ipip' 'ippc' 'swipe' 'wsn' '3pc' 'micp' 'sat-mon' 'igp' 'rsvp'
 'i-nlsp' 'nsfnet-igp' 'mux' 'wb-mon' 'srp' 'rvd' 'sdrp' 'ifmp' 'xtp'
 'cpnx' 'any' 'ipv6-no' 'rdp' 'sps' 'sctp' 'sun-nd' 'smp' 'visa'
 'secure-vmtp' 'isis' 'pgm' 'bna' 'ggp' 'l2tp' 'ipnip' 'etherip' 'gmtp'
 'pim' 'ipv6-frag' 'eigrp' 'skip' 'mfe-nsp' 'icmp' 'merit-inp' 'tcf'
 'zero' 'pnni' 'iso-tp4' 'compaq-peer' 'fire' 'ipv6' 'narp' 'idrp'
 'ipv6-route' 'vmtp' 'idpr-cmtp' 'nvp' 'leaf-1' 'mtp' 'ipcomp' 'tlsp' 'il'
 'fc' 'ax.25' 'xns-idp' 'pipe' 'tp++' 'prm' 'trunk-2' 'ptp' 'st2'
 'sat-expak' 'wb-expak' 'ipv6-opts' 'egp' 'bbn-rcc' 'dgp' 'sccopmce' 'a/n'
 'trunk-1' 'cbt' 'vrrp' 'cphb' 'iso-ip' 'br-sat-mon' 'ttp' 'ddx' 'dcn'
 'ipcv' 'emcon' 'igmp' 'kryptolan' 'pup' 'pvp' 'cftp' 'sep' 'iatp' 'aris'
 'ip' 'aes-sp3-d' 'chaos' 'vines' 'crtp' 'ip

In [None]:
# Apply standardization using the pre-fitted scaler
x_test[saved_dict['numeric_col']] = scaler.transform(x_test[saved_dict['numeric_col']])


In [None]:
x_test = standardize(x_test)


In [None]:
for col in saved_dict['numeric_col']:
    print(f"Unique values in column {col}: {x_test[col].unique()[:5]}")  # Print the first few unique values


Unique values in column dttl: [-0.71872445 -0.73450062 -0.59741109 -0.71818044 -0.71709243]
Unique values in column sbytes_log1p: [-4.92472524 -4.70619741 -3.70704914 -4.27706164 -3.11475403]
Unique values in column ct_srv_src: [-0.91106011 -0.88548815 -0.66386445 -0.86844017 -0.90253613]
Unique values in column swin: [-1.20557622 -1.18938172 -1.20316292 -1.20113067 -1.19928894]
Unique values in column dmeansz_log1p: [-1.80879689 -1.65429811 -1.7301296  -1.93445753 -1.29916085]
Unique values in column sload_log1p: [-4.37439479 -4.37152218 -3.83365172 -4.13847615 -4.38345097]
Unique values in column sttl: [-0.84706036 -0.84185558 -0.80703739 -0.84149663 -0.85262409]
Unique values in column ct_ftp_cmd: [ -1.67649389  -0.23391488  28.63941239 149.90303752 119.58713124]
Unique values in column stcpb_log1p: [-1.1100908  -1.10410587 -1.11110342 -1.16955448 -1.10708634]
Unique values in column is_sm_ips_ports: [ -1.04217306 609.70981258]
Unique values in column network_bytes_log1p: [-3.865807

In [None]:
def ohencoding(data):
    required_columns = ['proto', 'service', 'state']
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        print(f"Missing columns for encoding: {missing_columns}")
        return data  # Return original data if critical columns are missing

    # Apply one-hot encoding only if columns exist
    if 'proto' in data.columns:
        X_proto = ohe_proto.transform(data[['proto']])
        data = pd.concat([data, pd.DataFrame(X_proto, columns=[f"proto_{cat}" for cat in ohe_proto.categories_[0]])], axis=1)
        data.drop(['proto'], axis=1, inplace=True)

    if 'service' in data.columns:
        X_service = ohe_service.transform(data[['service']])
        data = pd.concat([data, pd.DataFrame(X_service, columns=[f"service_{cat}" for cat in ohe_service.categories_[0]])], axis=1)
        data.drop(['service'], axis=1, inplace=True)

    if 'state' in data.columns:
        X_state = ohe_state.transform(data[['state']])
        data = pd.concat([data, pd.DataFrame(X_state, columns=[f"state_{cat}" for cat in ohe_state.categories_[0]])], axis=1)
        data.drop(['state'], axis=1, inplace=True)

    return data


In [None]:
if 'proto' not in x_test.columns:
  x_test['proto'] = test['proto']

if 'proto' in x_test.columns:
  proto_encoded = ohe_proto.transform(x_test[['proto']])

else:
  print("proto is missing; skipping encoding")

print("x_test columns:", x_test.columns)

x_test columns: Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'dur_log1p', 'sbytes_log1p', 'dbytes_log1p',
       'sload_log1p', 'dload_log1p', 'spkts_log1p', 'stcpb_log1p',
       'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p', 'sjit_log1p',
       'djit_log1p', 'network_bytes_log1p'],
      dtype='object')


In [None]:
missing_cols = set(x_train.columns) - set(x_test.columns)
extra_cols = set(x_test.columns) - set(x_train.columns)

print("Columns in x_train but missing in x_test:", missing_cols)
print("Columns in x_test but missing in x_train:", extra_cols)

Columns in x_train but missing in x_test: {'proto_3pc', 'proto_ipv6', 'proto_nvp', 'proto_pup', 'proto_iplt', 'proto_swipe', 'proto_unas', 'proto_ippc', 'proto_trunk-1', 'proto_crudp', 'service_radius', 'proto_udt', 'proto_fire', 'proto_cftp', 'service_ftp-data', 'proto_ipv6-opts', 'proto_pipe', 'proto_nsfnet-igp', 'proto_ddx', 'proto_vmtp', 'proto_ptp', 'proto_ip', 'proto_i-nlsp', 'state_ECO', 'proto_tlsp', 'state_ACC', 'proto_br-sat-mon', 'proto_gre', 'service_dhcp', 'proto_visa', 'state_no', 'state_MAS', 'proto_ipcomp', 'proto_zero', 'proto_ib', 'state_ECR', 'proto_eigrp', 'proto_iso-tp4', 'proto_ospf', 'proto_ipip', 'proto_ipx-n-ip', 'proto_leaf-1', 'proto_dgp', 'state_INT', 'proto_iso-ip', 'proto_larp', 'proto_cbt', 'proto_bna', 'proto_qnx', 'service_snmp', 'proto_l2tp', 'proto_merit-inp', 'proto_prm', 'proto_pri-enc', 'proto_sdrp', 'proto_idpr-cmtp', 'proto_ifmp', 'proto_udp', 'proto_xtp', 'proto_vines', 'proto_aes-sp3-d', 'proto_iatp', 'proto_ipnip', 'state_CLO', 'proto_pim', 's

In [None]:
# Add missing columns
for col in missing_cols:
    x_test[col] = 0


In [None]:
x_test = x_test[x_train.columns]


In [None]:
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Columns match:", all(x_train.columns == x_test.columns))


x_train shape: (1778032, 197)
x_test shape: (762015, 197)
Columns match: True
