In [10]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [3]:
# Root folder that is scanned for capture sub-folders when the data is aggregated.
dataDir = "../data generation/Tstat/stdin/"
# Log file name appended to each sub-folder path; the leading "/" acts as the separator.
datafile = "/log_tcp_complete"
# Pool IP list; this is concatenated onto dataDir, so "../" is resolved relative to dataDir.
poolIPsFile = "../poolIP.txt"

In [4]:
## Aggregate Data
# Read one space-separated log per sub-folder of dataDir (in sorted folder order)
# and concatenate them in a single call. Calling pd.concat inside the loop would
# re-copy everything accumulated so far on every iteration (quadratic cost).
# Paths are built by plain string concatenation on purpose: datafile starts with
# "/", which os.path.join would treat as an absolute path.
frames = []
for datafolder in sorted(os.listdir(dataDir)):
    dataPointFile = dataDir + datafolder + datafile
    frames.append(pd.read_csv(dataPointFile, sep = ' '))

# pd.concat raises on an empty list, so fall back to an empty frame when dataDir
# contains no entries.
raw_df = pd.concat(frames) if frames else pd.DataFrame()

In [5]:
# The concatenated frame still carries each file's own row numbers; replace them
# with one continuous index and discard the old values.
raw_df = raw_df.reset_index(drop=True)

In [6]:
# Display the aggregated flow table.
raw_df

Unnamed: 0,#09#c_ip:1,c_port:2,c_pkts_all:3,c_rst_cnt:4,c_ack_cnt:5,c_ack_cnt_p:6,c_bytes_uniq:7,c_pkts_data:8,c_bytes_all:9,c_pkts_retx:10,...,s_cwin_ini:95,s_pkts_rto:96,s_pkts_fs:97,s_pkts_reor:98,s_pkts_dup:99,s_pkts_unk:100,s_pkts_fc:101,s_pkts_unrto:102,s_pkts_unfs:103,s_syn_retx:104
0,192.168.0.92,43432,12,0,11,5,2473,5,2473,0,...,1460,0,0,1,0,0,0,0,0,0
1,192.168.0.92,43434,12,0,11,5,2595,5,2595,0,...,1460,0,0,1,0,0,0,0,0,0
2,192.168.0.92,45634,16,0,15,7,2999,7,5292,2,...,2920,0,0,1,0,2,0,0,0,0
3,192.168.0.92,45636,12,0,11,5,2765,5,2765,0,...,4435,0,0,1,0,1,0,0,0,0
4,192.168.0.92,47230,10,0,9,6,675,2,675,0,...,2920,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13150,192.168.0.92,57524,40,1,38,26,2645,11,2645,0,...,392,0,0,0,0,0,0,0,0,0
13151,192.168.0.92,57544,10,1,8,3,1354,4,1354,0,...,392,0,0,0,0,0,0,0,0,0
13152,192.168.0.92,57546,13,1,11,6,1354,4,1354,0,...,1448,0,0,1,0,0,0,0,0,0
13153,192.168.0.92,57548,13,1,11,6,1354,4,1354,0,...,2896,0,0,1,0,0,0,0,0,0


In [7]:
# Summarise the row count, column count and dtypes of the aggregated frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13155 entries, 0 to 13154
Columns: 104 entries, #09#c_ip:1 to s_syn_retx:104
dtypes: float64(17), int64(85), object(2)
memory usage: 10.4+ MB


In [8]:
# Load the pool IP list, one entry per line. The path is dataDir with poolIPsFile
# appended by plain concatenation. The context manager guarantees the handle is
# closed; a bare open() would leave it open for the lifetime of the kernel.
with open(dataDir + poolIPsFile, 'r') as f:
    poolIPs = f.read().splitlines()

In [9]:
# Hash-based lookup built once, so each membership test is O(1) instead of a
# linear scan of the poolIPs list.
poolIPSet = set(poolIPs)


def is_pool(x):
    """Return 1 if `x` is one of the entries loaded into poolIPs, otherwise 0."""
    return 1 if x in poolIPSet else 0


# Encode the `s_ip:15` column as the binary label: 1 when the address is in the
# pool list, 0 otherwise. The column is overwritten in place, so convert only
# while it still holds the raw addresses. Re-running this cell on the already
# encoded 0/1 column would match nothing and silently reset every label to 0.
if not pd.api.types.is_numeric_dtype(raw_df["s_ip:15"]):
    raw_df["s_ip:15"] = raw_df["s_ip:15"].isin(poolIPSet).astype("int64")

In [11]:
# Columns removed before modelling. `s_ip:15` is deliberately not listed here:
# it is split off as the label further down.
columnsToDrop = [
    "#09#c_ip:1", "c_port:2", "s_port:16",
    'c_pkts_ooo:12',
    'c_isint:38', 's_isint:39',
    'c_iscrypto:40', 's_iscrypto:41',
    'p2p_t:43',
    'c_ttl_min:50', 'c_ttl_max:51',
    'c_f1323_opt:59', 'c_tm_opt:60', 'c_sack_opt:62',
    'c_mss:64', 'c_win_0:69',
    'c_pkts_reor:75', 'c_pkts_dup:76', 'c_pkts_fc:78',
    'c_pkts_unrto:79', 'c_pkts_unfs:80', 'c_syn_retx:81',
    's_pkts_fc:101', 's_pkts_unfs:103', 's_syn_retx:104',
]
raw_df_prooned = raw_df.drop(columns=columnsToDrop)

In [32]:
# List the columns that remain after the drop, with non-null counts and dtypes.
raw_df_prooned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13155 entries, 0 to 13154
Data columns (total 79 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   c_pkts_all:3      13155 non-null  int64  
 1   c_rst_cnt:4       13155 non-null  int64  
 2   c_ack_cnt:5       13155 non-null  int64  
 3   c_ack_cnt_p:6     13155 non-null  int64  
 4   c_bytes_uniq:7    13155 non-null  int64  
 5   c_pkts_data:8     13155 non-null  int64  
 6   c_bytes_all:9     13155 non-null  int64  
 7   c_pkts_retx:10    13155 non-null  int64  
 8   c_bytes_retx:11   13155 non-null  int64  
 9   c_syn_cnt:13      13155 non-null  int64  
 10  c_fin_cnt:14      13155 non-null  int64  
 11  s_ip:15           13155 non-null  int64  
 12  s_pkts_all:17     13155 non-null  int64  
 13  s_rst_cnt:18      13155 non-null  int64  
 14  s_ack_cnt:19      13155 non-null  int64  
 15  s_ack_cnt_p:20    13155 non-null  int64  
 16  s_bytes_uniq:21   13155 non-null  int64 

In [12]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split
# is repeatable. No `stratify` argument is passed, so the share of positive labels
# is not forced to match between the two parts.
train_set, test_set = train_test_split(raw_df_prooned, test_size=0.2, random_state=42)

In [13]:
# Separate the training rows into the target vector (`s_ip:15`) and the feature
# table (every other column).
labels = train_set["s_ip:15"].copy()
flow_df = train_set.drop(columns=["s_ip:15"])

In [14]:
# Same target/feature separation for the held-out test rows.
test_labels = test_set["s_ip:15"].copy()
test_flow_df = test_set.drop(columns=["s_ip:15"])

In [15]:
# Learn the per-feature scaling statistics from the training rows only, then
# apply them to those same rows. flow_df is replaced by the scaler's output.
standardScaler = StandardScaler().fit(flow_df)
flow_df = standardScaler.transform(flow_df)

In [16]:
# Scale the test features with the statistics already fitted on the training
# rows (transform only, no re-fit), so no test information reaches the scaler.
test_flow_df = standardScaler.transform(test_flow_df)

# Decision Tree Classifier

In [17]:
# Seed the tree (same value as the train/test split) so that it, and every
# metric derived from it, is identical on each Restart & Run All. Without
# random_state the fitted tree can differ between runs.
treeClassifier = DecisionTreeClassifier(random_state=42)
# Out-of-fold predictions from 10-fold cross-validation: every training row is
# predicted by a model that was fitted without it. The estimator is cloned
# internally, so treeClassifier itself is still unfitted after this call.
flow_pred_labels = cross_val_predict(treeClassifier, flow_df, labels, cv=10)
# Rows are the true class, columns the predicted class.
confusion_matrix(labels, flow_pred_labels)

array([[10326,     8],
       [    4,   186]])

## Cross Validation Result:

In [18]:
# Precision, recall and F1 of the cross-validated predictions against the
# training labels, printed in that order.
crossValMetrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
)
for metricLabel, metricFn in crossValMetrics:
    print(metricLabel, metricFn(labels, flow_pred_labels))

Precision: 0.9587628865979382
Recall: 0.9789473684210527
F1: 0.96875


In [19]:
# Train the final tree on the whole training set; the cross-validation call
# works on clones of the estimator and leaves this instance unfitted.
treeClassifier.fit(flow_df, labels)

In [20]:
# Predict labels for the held-out (already scaled) test rows.
test_pred_labels = treeClassifier.predict(test_flow_df)

## Test Results:

In [21]:
# Precision, recall and F1 of the test-set predictions, printed in that order.
testMetrics = (
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
    ("Test F1:", f1_score),
)
for metricLabel, metricFn in testMetrics:
    print(metricLabel, metricFn(test_labels, test_pred_labels))

Test Precision: 1.0
Test Recall: 1.0
Test F1: 1.0


In [22]:
# Confusion matrix on the test set: rows are the true class, columns the predicted class.
confusion_matrix(test_labels, test_pred_labels)

array([[2595,    0],
       [   0,   36]])

# Random Forest Classifier:

In [24]:
# Seed the forest (same value as the train/test split): bootstrap sampling and
# feature selection are random, so without random_state every run builds a
# different ensemble and reports different scores.
randomforestClassifier = RandomForestClassifier(n_estimators=25, random_state=42)
# Out-of-fold predictions from 10-fold cross-validation: every training row is
# predicted by a model that was fitted without it. The estimator is cloned
# internally, so randomforestClassifier itself is still unfitted after this call.
flow_pred_labels = cross_val_predict(randomforestClassifier, flow_df, labels, cv=10)
# Rows are the true class, columns the predicted class.
confusion_matrix(labels, flow_pred_labels)

array([[10334,     0],
       [    2,   188]])

## Cross Validation Result:

In [25]:
# Precision, recall and F1 of the cross-validated predictions against the
# training labels, printed in that order.
crossValMetrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
)
for metricLabel, metricFn in crossValMetrics:
    print(metricLabel, metricFn(labels, flow_pred_labels))

Precision: 1.0
Recall: 0.9894736842105263
F1: 0.9947089947089947


In [26]:
# Train the final forest on the whole training set; the cross-validation call
# works on clones of the estimator and leaves this instance unfitted.
randomforestClassifier.fit(flow_df, labels)

In [27]:
# Predict labels for the held-out (already scaled) test rows.
test_pred_labels = randomforestClassifier.predict(test_flow_df)

## Test Results:

In [28]:
# Precision, recall and F1 of the test-set predictions, printed in that order.
testMetrics = (
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
    ("Test F1:", f1_score),
)
for metricLabel, metricFn in testMetrics:
    print(metricLabel, metricFn(test_labels, test_pred_labels))

Test Precision: 1.0
Test Recall: 1.0
Test F1: 1.0


In [29]:
# Confusion matrix on the test set: rows are the true class, columns the predicted class.
confusion_matrix(test_labels, test_pred_labels)

array([[2595,    0],
       [   0,   36]])