In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
def preprocess_ports_data(data):
    known_ports = [20, 21, 22, 23, 25, 53, 80, 110, 443, 465, 587, 993, 995, 1433, 3306]

    for port in known_ports:
        data['sourcePort_' + str(port)] = (data['sourceTransportPort'] == port).astype(int)
        data['destinationPort_' + str(port)] = (data['destinationTransportPort'] == port).astype(int)

    return data

In [3]:
def extract_tcp_flag_features(df):
    flag_map = ['F', 'S', 'R', 'P', 'A', 'U', 'E', 'C'] 
    for flag in flag_map:
        df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)
    return df

In [4]:
def calculate_unique_ports_per_source_ip(data):
    unique_ports_per_source_ip = data.groupby('sourceIPAddress')['destinationTransportPort'].nunique()

    unique_ports_per_ip_source_df = unique_ports_per_source_ip.reset_index()
    unique_ports_per_ip_source_df.columns = ['sourceIPAddress', 'uniqueDestinationPorts']

    return unique_ports_per_ip_source_df

In [5]:
def calculate_unique_destination_ips_per_source_ip(data):
    unique_destination_ips_per_source_ip = data.groupby('sourceIPAddress')['destinationIPAddress'].nunique()

    unique_destination_ips_per_source_ip_df = unique_destination_ips_per_source_ip.reset_index()
    unique_destination_ips_per_source_ip_df.columns = ['sourceIPAddress', 'uniqueDestinationIPs']

    return unique_destination_ips_per_source_ip_df

In [6]:
def calculate_total_packet_count(data):
    total_packet_count = data.groupby(['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'])['packetTotalCount'].sum()

    total_packet_count_df = total_packet_count.reset_index()
    total_packet_count_df.columns = ['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort', 'packetTotalCountAllFlows']

    return total_packet_count_df

In [7]:
def entropy(series):
    probs = series.value_counts(normalize=True)
    return -np.sum(probs * np.log2(probs))

def calculate_port_entropy_per_source_ip(data):
    port_entropy = data.groupby('sourceIPAddress')['destinationTransportPort'].agg(port_entropy_per_source_ip=entropy)

    port_entropy_df = port_entropy.reset_index()
    port_entropy_df.columns = ['sourceIPAddress', 'portEntropy']
    return port_entropy_df

In [8]:
def compute_time_based_features(data):
    max_time = data.groupby(['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'])['flowStartMilliseconds'].max()
    min_time = data.groupby(['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'])['flowStartMilliseconds'].min()
    
    total_time_per_flow = (max_time - min_time)
    total_time_per_flow.name = 'totalCommunicationTime'
    
    avg_packets_count = data.groupby(['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'])['packetTotalCount'].count() / total_time_per_flow.replace(0, -1)
    avg_packets_count.name = 'avgPackets'
    
    avg_total_packets_count = data.groupby(['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'])['packetTotalCount'].sum() / total_time_per_flow.replace(0, -1)
    avg_total_packets_count.name = 'avgTotalPackets'
    
    data = pd.merge(data, total_time_per_flow.reset_index(), how='left', on=['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'])
    data = pd.merge(data, avg_packets_count.reset_index(), how='left', on=['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'])
    data = pd.merge(data, avg_total_packets_count.reset_index(), how='left', on=['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'])
    
    return data

In [9]:
def perform_feature_engineering(data):
    unique_ports_per_source_ip = calculate_unique_ports_per_source_ip(data)
    unique_destination_ips_per_source_ip = calculate_unique_destination_ips_per_source_ip(data)
    total_packet_count = calculate_total_packet_count(data)
    port_entropy = calculate_port_entropy_per_source_ip(data)
    
    data = pd.merge(data, unique_ports_per_source_ip, on='sourceIPAddress', how='left')
    data = pd.merge(data, unique_destination_ips_per_source_ip, on='sourceIPAddress', how='left')
    data = pd.merge(data, port_entropy, on='sourceIPAddress', how='left')
    data = pd.merge(data, total_packet_count, on=['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort'], how='left')
    return data

In [10]:
def preprocess_data(data):
    data = perform_feature_engineering(data)
    data = compute_time_based_features(data)
    data = preprocess_ports_data(data)
    data = extract_tcp_flag_features(data)

    return data.drop(columns=['sourceIPAddress', 'destinationIPAddress', 'sourceTransportPort', 'destinationTransportPort', 'flowStartMilliseconds', '_tcpFlags'], axis=1)

In [11]:
flows = pd.read_csv("training_flows.csv")
labels = pd.read_csv("flowkeys_training_labeled_enc.csv")

In [12]:
flows.head()

Unnamed: 0,flowStartMilliseconds,sourceIPAddress,destinationIPAddress,sourceTransportPort,destinationTransportPort,packetTotalCount,_tcpFlags
0,1619932200043,192.249.218.19,163.93.29.231,35398,5632,1,
1,1619932200044,149.200.25.158,163.93.105.66,1337,36437,1,SA
2,1619932200044,202.134.217.69,187.64.69.120,53,44730,1,
3,1619932200045,172.82.240.244,203.5.198.36,49654,1900,1,
4,1619932200046,66.103.92.35,133.227.166.5,80,12541,1,SA


In [13]:
labels.head()

Unnamed: 0,flowStartMilliseconds,sourceIPAddress,destinationIPAddress,sourceTransportPort,destinationTransportPort,Binary_Label,Attack_Type_enc
0,1619932200043,192.249.218.19,163.93.29.231,35398,5632,0,Normal
1,1619932200044,149.200.25.158,163.93.105.66,1337,36437,0,Normal
2,1619932200044,202.134.217.69,187.64.69.120,53,44730,0,Normal
3,1619932200045,172.82.240.244,203.5.198.36,49654,1900,0,Normal
4,1619932200046,66.103.92.35,133.227.166.5,80,12541,0,Normal


In [14]:
# Merge
key_cols = ["flowStartMilliseconds", "sourceIPAddress", "destinationIPAddress", "sourceTransportPort", "destinationTransportPort"]
merged = pd.merge(
    flows,
    labels,
    on=key_cols,
    how="inner"
)

X = merged.drop(columns=["Attack_Type_enc", "Binary_Label"], errors="ignore")
y = merged["Attack_Type_enc"]

X.head()

Unnamed: 0,flowStartMilliseconds,sourceIPAddress,destinationIPAddress,sourceTransportPort,destinationTransportPort,packetTotalCount,_tcpFlags
0,1619932200043,192.249.218.19,163.93.29.231,35398,5632,1,
1,1619932200044,149.200.25.158,163.93.105.66,1337,36437,1,SA
2,1619932200044,202.134.217.69,187.64.69.120,53,44730,1,
3,1619932200045,172.82.240.244,203.5.198.36,49654,1900,1,
4,1619932200046,66.103.92.35,133.227.166.5,80,12541,1,SA


In [15]:
X = preprocess_data(X)
X.head()

  df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)
  df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)
  df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)
  df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)
  df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)
  df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)
  df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)
  df['tcpFlag_' + str(flag)] = df['_tcpFlags'].str.contains(flag).fillna(False).astype(int)


Unnamed: 0,packetTotalCount,uniqueDestinationPorts,uniqueDestinationIPs,portEntropy,packetTotalCountAllFlows,totalCommunicationTime,avgPackets,avgTotalPackets,sourcePort_20,destinationPort_20,...,sourcePort_3306,destinationPort_3306,tcpFlag_F,tcpFlag_S,tcpFlag_R,tcpFlag_P,tcpFlag_A,tcpFlag_U,tcpFlag_E,tcpFlag_C
0,1,1,27901,-0.0,1,0,-1.0,-1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,646,664,8.29481,1,0,-1.0,-1.0,0,0,...,0,0,0,1,0,0,1,0,0,0
2,1,6,5,2.584963,1,0,-1.0,-1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,5213,-0.0,1,0,-1.0,-1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,406,458,8.61213,1,0,-1.0,-1.0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [16]:
from sklearn.ensemble import HistGradientBoostingClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

class_weights = {
    "C1": 5.0,
    "C2": 5.0,
    "C3": 5.0,
    "C4": 5.0,
    "C6": 5.0,
    "C7": 5.0,
    "Normal": 0.5
}

clf = RandomForestClassifier(n_estimators=400, max_depth=30, random_state=42, class_weight=class_weights)
#clf = HistGradientBoostingClassifier(
#    max_iter=300,
#    learning_rate=0.05,
#    max_depth=10,
#    l2_regularization=1.0,
#    early_stopping=True,
#    random_state=42
#)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

          C1       0.28      1.00      0.44      4449
          C2       1.00      1.00      1.00       150
          C3       0.49      1.00      0.66      3674
          C4       0.81      1.00      0.90        64
          C6       1.00      1.00      1.00       603
          C7       0.00      0.00      0.00        10
      Normal       1.00      0.89      0.94    136175

    accuracy                           0.90    145125
   macro avg       0.65      0.84      0.71    145125
weighted avg       0.96      0.90      0.92    145125

[[  4449      0      0      0      0      0      0]
 [     0    150      0      0      0      0      0]
 [     0      0   3674      0      0      0      0]
 [     0      0      0     64      0      0      0]
 [     0      0      0      0    603      0      0]
 [     0      0      0      0      0      0     10]
 [ 11341      0   3771     15      2      0 121046]]


In [17]:
import joblib
joblib.dump(clf, "model_new11.joblib")

['model_new11.joblib']