In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datp2020sets/train.csv
/kaggle/input/datp2020sets/test.csv


# Load datasets

In [2]:
import os
import pandas as pd
# Base directory where your dataset is stored
base_directory = '/kaggle/input/datp2020sets/train.csv'


# Load all CSV files from relevant weeks
dataframes = pd.read_csv(base_directory)



activity_counts = dataframes['Activity'].value_counts()
print("Activity distribution:")
print(activity_counts)

stage_counts = dataframes['Stage'].value_counts()
print("Stage distribution:")
print(stage_counts)

Activity distribution:
Activity
Normal                  8855
Directory Bruteforce    8465
Account Bruteforce        91
SQL Injection             55
Account Discovery         12
CSRF                       7
Malware Download           2
Name: count, dtype: int64
Stage distribution:
Stage
Benign                8855
Establish Foothold    8588
Reconnaissance          44
Name: count, dtype: int64


### Data Shape

In [3]:
dataframes.shape

(17487, 85)

In [31]:
nan_count = dataframes.isnull().sum().sum()
print(nan_count)

0


### Data columns

In [4]:
dataframes.columns

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Len

### Columns datatype

In [5]:
import pandas as pd

print(dataframes.dtypes.to_string())


Flow ID                        object
Src IP                         object
Src Port                        int64
Dst IP                         object
Dst Port                        int64
Protocol                        int64
Timestamp                      object
Flow Duration                   int64
Total Fwd Packet                int64
Total Bwd packets               int64
Total Length of Fwd Packet    float64
Total Length of Bwd Packet    float64
Fwd Packet Length Max         float64
Fwd Packet Length Min         float64
Fwd Packet Length Mean        float64
Fwd Packet Length Std         float64
Bwd Packet Length Max         float64
Bwd Packet Length Min         float64
Bwd Packet Length Mean        float64
Bwd Packet Length Std         float64
Flow Bytes/s                  float64
Flow Packets/s                float64
Flow IAT Mean                 float64
Flow IAT Std                  float64
Flow IAT Max                  float64
Flow IAT Min                  float64
Fwd IAT Tota

### Targeted Label.

In [6]:
Y = dataframes['Stage'].apply(lambda x: 'Non-APT' if x == 'Benign' else 'APT')

# Check the distribution of the new label
print(Y.value_counts())

Stage
Non-APT    8855
APT        8632
Name: count, dtype: int64


### Encoding Targeted Label.

In [7]:
from sklearn.preprocessing import LabelEncoder

# Sample target classes
target_classes = Y.values

# Initialize LabelEncoder
encoder = LabelEncoder()

# Fit and transform the target classes
Y = encoder.fit_transform(target_classes)

# Display the encoded target classes
print("Encoded target classes:")
for class_name, encoded_class in zip(encoder.classes_, range(len(encoder.classes_))):
    print(f"{class_name}: {encoded_class}")

# Check the transformed target classes
print("Transformed target classes:")
print(Y)

Encoded target classes:
APT: 0
Non-APT: 1
Transformed target classes:
[1 1 1 ... 0 1 0]


### Investing timestamp data.

In [22]:
dataframes["Timestamp"].nunique()

4227

### Feature Extraction with `Timestamp`

In [25]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load your dataset
# df = pd.read_csv('path_to_your_dataset.csv')

# Convert the 'Timestamp' column to datetime
dataframes['Timestamp'] = pd.to_datetime(dataframes['Timestamp'], format='%d/%m/%Y %I:%M:%S %p')

# Extracting features from the datetime
dataframes['Year'] = dataframes['Timestamp'].dt.year
dataframes['Month'] = dataframes['Timestamp'].dt.month
dataframes['Day'] = dataframes['Timestamp'].dt.day
dataframes['Hour'] = dataframes['Timestamp'].dt.hour
dataframes['Minute'] = dataframes['Timestamp'].dt.minute
dataframes['Second'] = dataframes['Timestamp'].dt.second


# Selecting the columns to normalize
features_to_normalize = ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']

# # Initialize the MinMaxScaler
# scaler = MinMaxScaler()

# # Apply the scaler to the features
# df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])

# # Now df contains the normalized features which can be fed into your model
# print(df.head())

dtimestamp = dataframes[features_to_normalize]
dtimestamp


Unnamed: 0,Year,Month,Day,Hour,Minute,Second
0,2019,7,17,14,35,9
1,2019,7,17,14,33,27
2,2019,7,17,14,35,41
3,2019,7,17,14,36,21
4,2019,7,17,14,35,41
...,...,...,...,...,...,...
17482,2019,7,17,15,57,25
17483,2019,7,17,18,21,41
17484,2019,7,17,15,58,15
17485,2019,7,17,16,24,43


In [26]:
dtimestamp.columns

Index(['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second'], dtype='object')

### IP feature conversion

In [9]:
import pandas as pd
import numpy as np
import ipaddress
# import category_encoders as ce

# Sample DataFrame for demonstration
# df_cleaned = pd.read_csv('your_dataset.csv')  # Assuming you load your data this way
non_numeric_columns = ["Src IP","Dst IP","Timestamp"]

# List of non-numeric columns

# Function to validate and convert IP to integer
def validate_and_convert_ip(ip):
    try:
        # Check if it's a valid IPv4 address
        ip = ipaddress.IPv4Address(ip)
        return int(ip) / (2**32 - 1)  # Normalize to the range [0, 1]
    except:
        # Handle both IPv6 and invalid addresses by returning NaN
        return np.nan

# Create a copy of the IP-related columns to avoid modifying the original DataFrame
df_ip = dataframes[["Src IP", "Dst IP"]].copy()

# Apply the function to IP columns
df_ip['Src IP'] = df_ip['Src IP'].apply(validate_and_convert_ip)
df_ip['Dst IP'] = df_ip['Dst IP'].apply(validate_and_convert_ip)

# Handle NaN values that may have resulted from invalid IPs
df_ip['Src IP'].fillna(-1, inplace=True)
df_ip['Dst IP'].fillna(-1, inplace=True)

# Combine df_ip back with the other non-numeric columns for encoding
df_non_numeric = dataframes[non_numeric_columns].copy()
df_combined = pd.concat([df_ip, df_non_numeric], axis=1)



# Check the resulting dataframe
df_ip

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ip['Src IP'].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ip['Dst IP'].fillna(-1, inplace=True)


Unnamed: 0,Src IP,Dst IP
0,0.752564,0.093188
1,0.752564,0.203627
2,0.752564,0.090648
3,0.752564,0.090648
4,0.752564,0.139262
...,...,...
17482,0.807849,0.752564
17483,0.752564,0.031373
17484,0.807849,0.752564
17485,0.752564,0.031373


### Eliminating correlated numeric features.

In [32]:
import pandas as pd
import numpy as np

# Assuming df_cleaned is your DataFrame after removing null-valued columns
# Compute correlation matrix
df_cleaned_corr = dataframes.drop(columns=["Flow ID","Src IP","Dst IP","Timestamp" , "Activity" , "Stage" ])
correlation_matrix = df_cleaned_corr.corr().abs()

# Select upper triangle of correlation matrix
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool_))

# Find features with correlation greater than threshold (e.g., 0.95)
threshold = 0.95
correlated_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]

# Drop correlated features
df_uncorrelated = df_cleaned_corr.drop(columns=correlated_features)

# Check the shape of the DataFrame after dropping correlated features
print("Shape of DataFrame after dropping correlated features:", df_uncorrelated.shape)

Shape of DataFrame after dropping correlated features: (17487, 64)


### Veiw uncorrelated features.

In [35]:
df_uncorrelated.columns

Index(['Src Port', 'Dst Port', 'Protocol', 'Flow Duration', 'Total Fwd Packet',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Mean',
       'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
       'Fwd Packets/s', 'Packet Length Min', 'Packet Length Variance',
       'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'ACK Flag Count',
       'URG Flag Count', 'CWR Flag Count', 'Down/Up Ratio',
       'Fwd Bytes/Bulk Avg', 'Fwd Packet/Bulk Avg', 'Fwd Bulk Rate Avg',
       'Bwd Bytes/Bulk Avg', 'Bwd Packet/Bulk A

### Combine features.

In [36]:
# Combine the encoded IP and non-IP columns with other numeric features
X = pd.concat([df_ip, dtimestamp, df_uncorrelated], axis=1)

# Check the resulting dataframe
X.head()

Unnamed: 0,Src IP,Dst IP,Year,Month,Day,Hour,Minute,Second,Src Port,Dst Port,...,Active Min,Idle Std,Year.1,Month.1,Day.1,Hour.1,Minute.1,Second.1,DayOfWeek,IsWeekend
0,0.752564,0.093188,2019,7,17,14,35,9,55438,80,...,15786.0,0.0,2019,7,17,14,35,9,2,0
1,0.752564,0.203627,2019,7,17,14,33,27,43846,443,...,0.0,0.0,2019,7,17,14,33,27,2,0
2,0.752564,0.090648,2019,7,17,14,35,41,48386,443,...,277958.0,0.0,2019,7,17,14,35,41,2,0
3,0.752564,0.090648,2019,7,17,14,36,21,48386,443,...,0.0,0.0,2019,7,17,14,36,21,2,0
4,0.752564,0.139262,2019,7,17,14,35,41,36318,443,...,41653.0,356527.496757,2019,7,17,14,35,41,2,0


### Data Balancing

In [37]:
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import ClusterCentroids

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [39]:
X_scaled.head()

Unnamed: 0,Src IP,Dst IP,Year,Month,Day,Hour,Minute,Second,Src Port,Dst Port,...,Active Min,Idle Std,Year.1,Month.1,Day.1,Hour.1,Minute.1,Second.1,DayOfWeek,IsWeekend
0,0.889422,0.082435,0.0,0.0,0.0,0.0,0.59322,0.152542,0.847261,0.001223,...,0.000492,0.0,0.0,0.0,0.0,0.0,0.59322,0.152542,0.0,0.0
1,0.889422,0.194184,0.0,0.0,0.0,0.0,0.559322,0.457627,0.6701,0.00677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.559322,0.457627,0.0,0.0
2,0.889422,0.079865,0.0,0.0,0.0,0.0,0.59322,0.694915,0.739485,0.00677,...,0.008671,0.0,0.0,0.0,0.0,0.0,0.59322,0.694915,0.0,0.0
3,0.889422,0.079865,0.0,0.0,0.0,0.0,0.610169,0.355932,0.739485,0.00677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.610169,0.355932,0.0,0.0
4,0.889422,0.129055,0.0,0.0,0.0,0.0,0.59322,0.694915,0.55505,0.00677,...,0.001299,0.008052,0.0,0.0,0.0,0.0,0.59322,0.694915,0.0,0.0
