In [1]:
import numpy as np
import pandas as pd
import os
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score


In [2]:
# Directory whose entries are listed and loaded, one capture folder per entry.
dataDir = "../data generation/Tstat/stdin/"
# Log file name appended to each capture folder name (hence the leading slash).
datafile = "/log_tcp_complete"
# Pool IP list. This is concatenated onto dataDir when opened, so the "../" is
# resolved relative to dataDir rather than to the notebook's working directory.
poolIPsFile = "../poolIP.txt"

In [3]:
## Aggregate Data
# Training window: folders are visited in sorted name order (the names look like
# timestamps) and loading stops at the first folder named after the cutoff.
train_frames = []
for datafolder in sorted(os.listdir(dataDir)):
    if datafolder > '2022_05_16_02_42.out':
        break
    dataPointFile = dataDir + datafolder + datafile
    train_frames.append(pd.read_csv(dataPointFile, sep = ' '))
# Concatenate once at the end: concatenating inside the loop re-copies every
# previously loaded row on each iteration. An empty window yields an empty frame.
raw_df = pd.concat(train_frames) if train_frames else pd.DataFrame()

In [4]:
# Each loaded file contributes its own row index; replace them with one continuous 0..n-1 index.
raw_df = raw_df.reset_index(drop=True)

In [5]:
# Preview the aggregated training flows (pandas truncates the display of large frames).
raw_df

Unnamed: 0,#09#c_ip:1,c_port:2,c_pkts_all:3,c_rst_cnt:4,c_ack_cnt:5,c_ack_cnt_p:6,c_bytes_uniq:7,c_pkts_data:8,c_bytes_all:9,c_pkts_retx:10,...,s_cwin_ini:95,s_pkts_rto:96,s_pkts_fs:97,s_pkts_reor:98,s_pkts_dup:99,s_pkts_unk:100,s_pkts_fc:101,s_pkts_unrto:102,s_pkts_unfs:103,s_syn_retx:104
0,192.168.0.92,43432,12,0,11,5,2473,5,2473,0,...,1460,0,0,1,0,0,0,0,0,0
1,192.168.0.92,43434,12,0,11,5,2595,5,2595,0,...,1460,0,0,1,0,0,0,0,0,0
2,192.168.0.92,45634,16,0,15,7,2999,7,5292,2,...,2920,0,0,1,0,2,0,0,0,0
3,192.168.0.92,45636,12,0,11,5,2765,5,2765,0,...,4435,0,0,1,0,1,0,0,0,0
4,192.168.0.92,47230,10,0,9,6,675,2,675,0,...,2920,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11017,192.168.0.92,38492,36,0,35,27,1392,8,1392,0,...,150,0,0,1,0,0,0,0,0,0
11018,192.168.0.92,35410,13,0,12,7,2147,5,2147,0,...,5920,0,0,0,0,0,0,0,0,0
11019,192.168.0.92,54440,22,1,20,14,1419,5,1419,0,...,2896,0,0,0,0,0,0,0,0,0
11020,192.168.0.92,54448,23,0,22,10,4795,10,4795,1,...,392,1,0,0,0,0,0,0,0,0


In [6]:
# Summarise the row count, column count and dtypes of the aggregated frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11022 entries, 0 to 11021
Columns: 104 entries, #09#c_ip:1 to s_syn_retx:104
dtypes: float64(17), int64(85), object(2)
memory usage: 8.7+ MB


In [7]:
# Print every column name as a plain list so none are elided.
all_column_names = list(raw_df.columns)
print(all_column_names)

['#09#c_ip:1', 'c_port:2', 'c_pkts_all:3', 'c_rst_cnt:4', 'c_ack_cnt:5', 'c_ack_cnt_p:6', 'c_bytes_uniq:7', 'c_pkts_data:8', 'c_bytes_all:9', 'c_pkts_retx:10', 'c_bytes_retx:11', 'c_pkts_ooo:12', 'c_syn_cnt:13', 'c_fin_cnt:14', 's_ip:15', 's_port:16', 's_pkts_all:17', 's_rst_cnt:18', 's_ack_cnt:19', 's_ack_cnt_p:20', 's_bytes_uniq:21', 's_pkts_data:22', 's_bytes_all:23', 's_pkts_retx:24', 's_bytes_retx:25', 's_pkts_ooo:26', 's_syn_cnt:27', 's_fin_cnt:28', 'first:29', 'last:30', 'durat:31', 'c_first:32', 's_first:33', 'c_last:34', 's_last:35', 'c_first_ack:36', 's_first_ack:37', 'c_isint:38', 's_isint:39', 'c_iscrypto:40', 's_iscrypto:41', 'con_t:42', 'p2p_t:43', 'http_t:44', 'c_rtt_avg:45', 'c_rtt_min:46', 'c_rtt_max:47', 'c_rtt_std:48', 'c_rtt_cnt:49', 'c_ttl_min:50', 'c_ttl_max:51', 's_rtt_avg:52', 's_rtt_min:53', 's_rtt_max:54', 's_rtt_std:55', 's_rtt_cnt:56', 's_ttl_min:57', 's_ttl_max:58', 'c_f1323_opt:59', 'c_tm_opt:60', 'c_win_scl:61', 'c_sack_opt:62', 'c_sack_cnt:63', 'c_mss:64

In [8]:
# Load the pool IP list, one address per line. The context manager closes the
# file handle, which the bare open() call left open.
with open(dataDir + poolIPsFile, 'r') as f:
    poolIPs = f.read().splitlines()
print(poolIPs)

['178.32.120.127', '51.254.84.37', '51.68.21.186', '94.130.165.85', '94.130.164.163', '94.130.165.87', '94.130.164.163', '178.32.120.127', '94.130.164.163', '136.243.49.177', '51.68.21.186', '94.130.164.163', '136.243.49.177', '136.243.49.177', '94.130.164.163', '51.68.21.188', '178.32.120.127', '51.254.84.37', '51.254.84.37', '94.130.165.85', '51.68.21.188', '51.68.21.186', '94.130.165.85', '136.243.49.177', '51.254.84.37', '51.254.84.37', '94.130.165.87', '178.32.120.127', '136.243.49.177', '178.32.120.127', '178.32.120.127', '51.68.21.186', '51.254.84.37', '94.130.164.163', '51.68.21.186', '94.130.164.163', '94.130.164.163', '51.68.21.188', '51.68.21.188', '178.32.120.127', '51.68.21.186', '51.68.21.188', '51.68.21.188', '51.254.84.37', '51.254.84.37', '51.68.21.186', '51.68.21.188', '51.68.21.186', '51.254.84.37', '51.68.21.186', '51.254.84.37', '178.32.120.127', '178.32.120.127', '94.130.164.163', '136.243.49.177', '178.32.120.127', '51.254.84.37', '51.68.21.186', '51.254.84.37', 

In [9]:
def is_pool(x):
    """Return 1 if `x` appears in the module-level `poolIPs` list, otherwise 0."""
    if x in poolIPs:
        return 1
    return 0

# Turn the server IP column into the binary label (1 = pool IP, 0 = anything else).
# The column is overwritten in place, so only encode while it still holds the raw
# (non-numeric) addresses: re-running the cell on the already encoded 0/1 values
# would find none of them in poolIPs and silently reset every label to 0.
if raw_df["s_ip:15"].dtype.kind not in "biuf":
    raw_df["s_ip:15"] = raw_df["s_ip:15"].apply(is_pool)

In [10]:
# Number of flows labelled 1 (the label column holds only 0/1 after encoding).
positive_count = raw_df["s_ip:15"].sum()
print("positive labels: ", positive_count)

positive labels:  202


In [11]:
# Pairwise correlation over the numeric columns only. raw_df still contains a
# string column (the client IP), and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 instead of silently dropping them, so select the
# numeric columns explicitly; this behaves the same on older pandas versions.
corr_matrix = raw_df.select_dtypes(include="number").corr()

In [12]:
# Rank every column by its correlation with the label, strongest positive first.
label_correlations = corr_matrix["s_ip:15"]
print(label_correlations.sort_values(ascending=False))

s_ip:15            1.000000
c_rst_cnt:4        0.199001
c_mss_min:66       0.196438
c_cwin_min:71      0.193658
s_tm_opt:83        0.076739
                     ...   
c_pkts_unfs:80          NaN
c_syn_retx:81           NaN
s_pkts_fc:101           NaN
s_pkts_unfs:103         NaN
s_syn_retx:104          NaN
Name: s_ip:15, Length: 103, dtype: float64


In [13]:
# Columns excluded from the modelling frame. Some of them (e.g. c_pkts_unfs:80,
# c_syn_retx:81, s_pkts_fc:101) show NaN correlation with the label in the output
# above; the rest are presumably identifiers or uninformative -- TODO confirm.
train_drop_cols = [
    "#09#c_ip:1", "c_port:2", "s_port:16",
    "c_pkts_ooo:12",
    "c_isint:38", "s_isint:39", "c_iscrypto:40", "s_iscrypto:41",
    "p2p_t:43",
    "c_ttl_min:50", "c_ttl_max:51",
    "c_f1323_opt:59", "c_tm_opt:60", "c_sack_opt:62",
    "c_mss:64", "c_win_0:69",
    "c_pkts_reor:75", "c_pkts_dup:76", "c_pkts_fc:78",
    "c_pkts_unrto:79", "c_pkts_unfs:80", "c_syn_retx:81",
    "s_pkts_fc:101", "s_pkts_unfs:103", "s_syn_retx:104",
]
raw_df_prooned = raw_df.drop(columns=train_drop_cols)

In [14]:
# Inspect which columns remain after pruning.
raw_df_prooned.columns

Index(['c_pkts_all:3', 'c_rst_cnt:4', 'c_ack_cnt:5', 'c_ack_cnt_p:6',
       'c_bytes_uniq:7', 'c_pkts_data:8', 'c_bytes_all:9', 'c_pkts_retx:10',
       'c_bytes_retx:11', 'c_syn_cnt:13', 'c_fin_cnt:14', 's_ip:15',
       's_pkts_all:17', 's_rst_cnt:18', 's_ack_cnt:19', 's_ack_cnt_p:20',
       's_bytes_uniq:21', 's_pkts_data:22', 's_bytes_all:23', 's_pkts_retx:24',
       's_bytes_retx:25', 's_pkts_ooo:26', 's_syn_cnt:27', 's_fin_cnt:28',
       'first:29', 'last:30', 'durat:31', 'c_first:32', 's_first:33',
       'c_last:34', 's_last:35', 'c_first_ack:36', 's_first_ack:37',
       'con_t:42', 'http_t:44', 'c_rtt_avg:45', 'c_rtt_min:46', 'c_rtt_max:47',
       'c_rtt_std:48', 'c_rtt_cnt:49', 's_rtt_avg:52', 's_rtt_min:53',
       's_rtt_max:54', 's_rtt_std:55', 's_rtt_cnt:56', 's_ttl_min:57',
       's_ttl_max:58', 'c_win_scl:61', 'c_sack_cnt:63', 'c_mss_max:65',
       'c_mss_min:66', 'c_win_max:67', 'c_win_min:68', 'c_cwin_max:70',
       'c_cwin_min:71', 'c_cwin_ini:72', 'c_pkts_r

In [15]:
# Split the pruned frame into the label vector and the feature frame.
train_label_col = "s_ip:15"
labels = raw_df_prooned[train_label_col].copy()
flow_df = raw_df_prooned.drop(columns=[train_label_col])

In [16]:
# Check the dtypes and non-null counts of the feature frame.
flow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11022 entries, 0 to 11021
Data columns (total 78 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   c_pkts_all:3      11022 non-null  int64  
 1   c_rst_cnt:4       11022 non-null  int64  
 2   c_ack_cnt:5       11022 non-null  int64  
 3   c_ack_cnt_p:6     11022 non-null  int64  
 4   c_bytes_uniq:7    11022 non-null  int64  
 5   c_pkts_data:8     11022 non-null  int64  
 6   c_bytes_all:9     11022 non-null  int64  
 7   c_pkts_retx:10    11022 non-null  int64  
 8   c_bytes_retx:11   11022 non-null  int64  
 9   c_syn_cnt:13      11022 non-null  int64  
 10  c_fin_cnt:14      11022 non-null  int64  
 11  s_pkts_all:17     11022 non-null  int64  
 12  s_rst_cnt:18      11022 non-null  int64  
 13  s_ack_cnt:19      11022 non-null  int64  
 14  s_ack_cnt_p:20    11022 non-null  int64  
 15  s_bytes_uniq:21   11022 non-null  int64  
 16  s_pkts_data:22    11022 non-null  int64 

In [17]:
# Learn per-feature scaling statistics from the training features, then replace
# the features with their standardised values. The fitted scaler is kept so the
# same statistics can be applied to other data later.
# NOTE(review): the scaler is fitted on all training rows before cross-validation,
# so each fold's held-out rows contribute to the scaling statistics.
standardScaler = StandardScaler()
standardScaler.fit(flow_df)
flow_df = standardScaler.transform(flow_df)

In [18]:
# Fix the seed so the fitted tree, and every metric derived from it, is
# reproducible across kernel restarts; without it the tree can vary between runs.
treeClassifier = DecisionTreeClassifier(random_state=42)

In [19]:
# Out-of-fold predictions from 10-fold cross-validation: every training row gets
# a prediction from a model that was fitted without it.
flow_pred_labels = cross_val_predict(
    estimator=treeClassifier,
    X=flow_df,
    y=labels,
    cv=10,
)

In [20]:
# Cross-validated confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(labels, flow_pred_labels)

array([[10819,     1],
       [    4,   198]])

In [21]:
# Report the cross-validated scores for the positive (pool) class.
for metric_caption, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(metric_caption, metric_fn(labels, flow_pred_labels))

Precision: 0.9949748743718593
Recall: 0.9801980198019802
F1: 0.9875311720698253


In [22]:
# Fit the tree on the full training set so it can be used for prediction below.
treeClassifier.fit(flow_df, labels)

In [23]:
# Test window: every capture folder named after the training cutoff.
test_frames = []
for datafolder in sorted(os.listdir(dataDir)):
    if datafolder > '2022_05_16_02_42.out':
        print(datafolder)
        dataPointFile = dataDir + datafolder + datafile
        test_frames.append(pd.read_csv(dataPointFile, sep = ' '))
# Concatenate once at the end: concatenating inside the loop re-copies every
# previously loaded row on each iteration. An empty window yields an empty frame.
test_raw_df = pd.concat(test_frames) if test_frames else pd.DataFrame()
test_raw_df = test_raw_df.reset_index(drop=True)

2022_05_16_03_43.out
2022_05_16_04_43.out
2022_05_16_05_43.out
2022_05_16_06_43.out
2022_05_16_07_43.out
2022_05_16_08_44.out
2022_05_16_09_44.out
2022_05_16_10_44.out
2022_05_16_11_44.out
2022_05_16_12_44.out


In [24]:
# Reload the pool IP list that is_pool() looks addresses up in. The context
# manager closes the file handle, which the bare open() call left open.
with open(dataDir + poolIPsFile, 'r') as f:
    poolIPs = f.read().splitlines()
print(poolIPs)

['178.32.120.127', '51.254.84.37', '51.68.21.186', '94.130.165.85', '94.130.164.163', '94.130.165.87', '94.130.164.163', '178.32.120.127', '94.130.164.163', '136.243.49.177', '51.68.21.186', '94.130.164.163', '136.243.49.177', '136.243.49.177', '94.130.164.163', '51.68.21.188', '178.32.120.127', '51.254.84.37', '51.254.84.37', '94.130.165.85', '51.68.21.188', '51.68.21.186', '94.130.165.85', '136.243.49.177', '51.254.84.37', '51.254.84.37', '94.130.165.87', '178.32.120.127', '136.243.49.177', '178.32.120.127', '178.32.120.127', '51.68.21.186', '51.254.84.37', '94.130.164.163', '51.68.21.186', '94.130.164.163', '94.130.164.163', '51.68.21.188', '51.68.21.188', '178.32.120.127', '51.68.21.186', '51.68.21.188', '51.68.21.188', '51.254.84.37', '51.254.84.37', '51.68.21.186', '51.68.21.188', '51.68.21.186', '51.254.84.37', '51.68.21.186', '51.254.84.37', '178.32.120.127', '178.32.120.127', '94.130.164.163', '136.243.49.177', '178.32.120.127', '51.254.84.37', '51.68.21.186', '51.254.84.37', 

In [25]:
# Preview the aggregated test flows (pandas truncates the display of large frames).
test_raw_df

Unnamed: 0,#09#c_ip:1,c_port:2,c_pkts_all:3,c_rst_cnt:4,c_ack_cnt:5,c_ack_cnt_p:6,c_bytes_uniq:7,c_pkts_data:8,c_bytes_all:9,c_pkts_retx:10,...,s_cwin_ini:95,s_pkts_rto:96,s_pkts_fs:97,s_pkts_reor:98,s_pkts_dup:99,s_pkts_unk:100,s_pkts_fc:101,s_pkts_unrto:102,s_pkts_unfs:103,s_syn_retx:104
0,192.168.0.92,46662,26,0,25,18,1848,6,1848,0,...,99,0,0,2,0,1,0,0,0,0
1,192.168.0.92,54450,21,0,20,10,2125,9,2125,0,...,392,0,0,0,0,0,0,0,0,0
2,192.168.0.92,54442,24,0,23,12,4361,9,4361,1,...,392,0,0,0,0,0,0,0,0,0
3,192.168.0.92,54444,32,0,31,17,7120,13,7120,0,...,392,0,0,0,0,1,0,0,0,0
4,192.168.0.92,54430,33,0,32,22,3061,9,3061,0,...,392,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2128,192.168.0.92,57524,40,1,38,26,2645,11,2645,0,...,392,0,0,0,0,0,0,0,0,0
2129,192.168.0.92,57544,10,1,8,3,1354,4,1354,0,...,392,0,0,0,0,0,0,0,0,0
2130,192.168.0.92,57546,13,1,11,6,1354,4,1354,0,...,1448,0,0,1,0,0,0,0,0,0
2131,192.168.0.92,57548,13,1,11,6,1354,4,1354,0,...,2896,0,0,1,0,0,0,0,0,0


In [26]:
# Turn the server IP column into the binary label (1 = pool IP, 0 = anything else).
# The column is overwritten in place, so only encode while it still holds the raw
# (non-numeric) addresses: re-running the cell on the already encoded 0/1 values
# would find none of them in poolIPs and silently reset every label to 0.
if test_raw_df["s_ip:15"].dtype.kind not in "biuf":
    test_raw_df["s_ip:15"] = test_raw_df["s_ip:15"].apply(is_pool)

In [27]:
# Keep exactly the columns that survived pruning of the training frame rather
# than repeating the hard-coded drop list: a second copy of the list can drift
# from the first, and the scaler and classifiers expect the same features, in
# the same order, as the frame they were fitted on.
test_raw_df_prooned = test_raw_df[raw_df_prooned.columns.tolist()]

In [28]:
# Split the pruned test frame into the label vector and the feature frame.
test_label_col = "s_ip:15"
test_labels = test_raw_df_prooned[test_label_col].copy()
test_flow_df = test_raw_df_prooned.drop(columns=[test_label_col])

In [29]:
# Scale the test features with the scaler fitted on the training features
# (transform only, no refit), so both sets share the same scaling statistics.
test_flow_df = standardScaler.transform(test_flow_df)

In [30]:
# Predict test labels with the decision tree fitted on the full training set.
test_pred_labels = treeClassifier.predict(test_flow_df)

In [31]:
# Number of positive (pool) flows in the test window.
test_labels.sum()

24

In [32]:
# Report the held-out test scores for the positive (pool) class.
for metric_caption, metric_fn in (
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
    ("Test F1:", f1_score),
):
    print(metric_caption, metric_fn(test_labels, test_pred_labels))

Test Precision: 1.0
Test Recall: 0.9583333333333334
Test F1: 0.9787234042553191


In [33]:
# Test-set confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(test_labels, test_pred_labels)

array([[2109,    0],
       [   1,   23]])

In [34]:
# Fix the seed so the bootstrap samples and feature subsets, and therefore the
# reported metrics, are reproducible across kernel restarts.
randomforestClassifier = RandomForestClassifier(n_estimators=25, random_state=42)

In [35]:
# Out-of-fold predictions for the random forest (10-fold). This rebinds
# flow_pred_labels, which previously held the decision tree's predictions.
flow_pred_labels = cross_val_predict(
    estimator=randomforestClassifier,
    X=flow_df,
    y=labels,
    cv=10,
)

In [36]:
# Cross-validated confusion matrix for the random forest: rows are true labels, columns are predicted labels.
confusion_matrix(labels, flow_pred_labels)

array([[10815,     5],
       [    4,   198]])

In [37]:
# Report the random forest's cross-validated scores for the positive (pool) class.
for metric_caption, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(metric_caption, metric_fn(labels, flow_pred_labels))

Precision: 0.9753694581280788
Recall: 0.9801980198019802
F1: 0.9777777777777779


In [38]:
# Fit the random forest on the full training set so it can be used for prediction below.
randomforestClassifier.fit(flow_df, labels)

In [39]:
# Score the fitted random forest on the held-out test window. This rebinds
# test_pred_labels, which previously held the decision tree's predictions.
test_pred_labels = randomforestClassifier.predict(test_flow_df)
for metric_caption, metric_fn in (
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
    ("Test F1:", f1_score),
):
    print(metric_caption, metric_fn(test_labels, test_pred_labels))

Test Precision: 1.0
Test Recall: 1.0
Test F1: 1.0
