**NetFlow-Conversion**
- Creating Series of nfdump commands to convert raw NetFlow nfcapd files into CSV format.
- Remove last 3 lines containing the summary in every file.
- Convert data types of numerical features to "float" data type
- Group the flows in every NetFlow file by "Source IP address, Destination IP address, Source Port, Destination Port and Protocol".
- Assign values to Output packets (opkt) and Output bytes (obyt).
- Concatenate all the converted NetFlow files into a single file.
- Create features (Src_value and Dst_value) and assign port number descriptions.
- Create features (Device_Src and Device_Dst) and assign device labels.
- Divide the dataset into data streams of "Host-class-Devices" and "Unknown-Devices".

In [None]:
#Import Libraries and Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os
import sys
import glob
import shutil
import time
import json
import ipaddress
import socket, struct
import seaborn as sns
from pathlib import Path
from numpy import array
from random import random
from sklearn.compose import ColumnTransformer
from google.colab import files

1. Creating Series of nfdump commands to convert raw NetFlow nfcapd files into CSV format.

In [None]:
# Get the names of nfcapd files and convert it into an array of distinct values.
# NOTE(review): `nfcapdfilename` is not defined in any visible cell; presumably it is a
# DataFrame with an 'nfdumpcsv' column loaded beforehand — confirm and add the loading cell.
nfcapdarr = nfcapdfilename['nfdumpcsv'].unique()

In [None]:
# Number of distinct capture files. `nfcapdarr` is the result of `.unique()`, which for a
# pandas Series is an array without a `.nunique()` method, so its length is the distinct count.
len(nfcapdarr)

In [None]:
# Build one nfdump conversion command per capture file and store them in nfcapd.txt.
# The commands are written through an explicit file handle instead of rebinding
# sys.stdout: closing a rebound sys.stdout would break every later print in the kernel.
with open("nfcapd.txt", "w") as command_file:
    for nfcapd in nfcapdarr:
        print('nfdump -r /var/cache/nfdump/{0} -o csv > nfcapd_csv/{0}.csv'.format(nfcapd), file=command_file)

In [None]:
# Per-column aggregation applied to grouped NetFlow records: the duration, packet and
# byte counters (td, ipkt, ibyt, opkt, obyt) are summed, while stos, in and out keep
# the first value seen in each group.
aggregation_functions = {
    **dict.fromkeys(['td', 'ipkt', 'ibyt', 'opkt', 'obyt'], 'sum'),
    **dict.fromkeys(['stos', 'in', 'out'], 'first'),
}

2. Remove last 3 lines containing the summary in every file.
- Convert data types of numerical features to "float" data type.
- Group the flows in every NetFlow file by "Source IP address, Destination IP address, Source Port, Destination Port and Protocol".

In [None]:
# Take every NetFlow CSV from the source directory, clean and aggregate it, and save the
# result under the same file name in the output directory.
source_dir = Path('/content/drive/MyDrive/Master_Thesis/Test-Netflow')
output_dir = Path('/content/drive/MyDrive/Master_Thesis/Test-Netflow/Preprocessed 1')


def preprocess_netflow(raw_flows, aggregations):
    """Drop the trailing summary, cast counters to float and merge duplicate flows.

    Parameters
    ----------
    raw_flows : pandas.DataFrame
        Content of one converted NetFlow CSV, including its last three summary rows.
    aggregations : dict
        Column name -> aggregation applied within each group of flows.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (sa, da, sp, dp, pr) combination; the input is not modified.
    """
    numeric_columns = ['stos', 'td', 'ipkt', 'ibyt', 'opkt', 'obyt', 'in', 'out']
    # Positional slicing removes the last three rows without mutating the input frame.
    flows = raw_flows.iloc[:-3].astype({column: float for column in numeric_columns})
    return flows.groupby(['sa', 'da', 'sp', 'dp', 'pr'], as_index=False).aggregate(aggregations)


for file in source_dir.glob('*.csv'):
    df = preprocess_netflow(pd.read_csv(file), aggregation_functions)
    # Make sure the destination exists before writing; to_csv does not create folders.
    output_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_dir.joinpath(file.name), index=False)

Assign values to Output packets (opkt) and Output bytes (obyt).

In [None]:
# Take the NetFlow files converted by the previous step and assign the "opkt" and "obyt"
# values of every flow from the matching flow in the opposite direction.
source_dir = Path('/content/drive/MyDrive/Master_Thesis/Test-Netflow/Preprocessed 1')
output_dir = Path('/content/drive/MyDrive/Master_Thesis/Test-Netflow/Preprocessed 2')


def assign_reverse_flow_counters(netflow):
    """Fill opkt/obyt of each flow with ipkt/ibyt of its reverse-direction flow.

    A flow B is the reverse of flow A when B's source address/port equal A's destination
    address/port, B's destination address/port equal A's source address/port, and both
    use the same protocol. Flows without a reverse flow keep their opkt/obyt values.

    Parameters
    ----------
    netflow : pandas.DataFrame
        Flows with the columns sa, da, sp, dp, pr, ipkt, ibyt, opkt and obyt.

    Returns
    -------
    pandas.DataFrame
        Copy of ``netflow`` with updated opkt and obyt columns.
    """
    key_columns = ["sa", "da", "sp", "dp", "pr"]
    keys = list(zip(*(netflow[column].tolist() for column in key_columns)))
    # A missing key field never compares equal to anything (NaN != NaN), so rows with an
    # incomplete key can neither find nor be a reverse flow.
    complete = netflow[key_columns].notna().all(axis=1).tolist()

    ipkt = netflow["ipkt"].tolist()
    ibyt = netflow["ibyt"].tolist()
    opkt = netflow["opkt"].tolist()
    obyt = netflow["obyt"].tolist()

    # Single pass index of key -> row position. Later rows overwrite earlier ones, so
    # when a key occurs several times the last occurrence wins, as in a full row scan.
    position_by_key = {
        key: position
        for position, (key, usable) in enumerate(zip(keys, complete))
        if usable
    }

    for position, (sa, da, sp, dp, pr) in enumerate(keys):
        if not complete[position]:
            continue
        match = position_by_key.get((da, sa, dp, sp, pr))
        if match is not None:
            opkt[position] = ipkt[match]
            obyt[position] = ibyt[match]

    return netflow.assign(opkt=opkt, obyt=obyt)


for file in source_dir.glob('*.csv'):
    netflow = assign_reverse_flow_counters(pd.read_csv(file))
    # Make sure the destination exists before writing; to_csv does not create folders.
    output_dir.mkdir(parents=True, exist_ok=True)
    netflow.to_csv(output_dir.joinpath(file.name), index=False)

3. Concatenate all the converted NetFlow files into a single file.

In [None]:
# Concatenate all completely converted and derived NetFlow files into a single file.
# NOTE: os.chdir changes the working directory for the rest of the session, so every
# later relative path in this notebook resolves inside this folder.
os.chdir("/content/drive/MyDrive/Master_Thesis/Test-Netflow/1Hour")

extension = 'csv'
# Files that this notebook itself writes with relative paths (and therefore into this
# folder). They must not be read back as inputs, otherwise re-running the notebook
# would duplicate the flows.
derived_outputs = {
    "Test-Data-1-Hour.csv",
    "NetFlowStreams.csv",
    "netflow_unknown.csv",
    "testdata-1-hour.csv",
}
# Sorted so that the row order of the combined file does not depend on directory listing order.
all_filenames = sorted(
    name
    for name in glob.glob('*.{}'.format(extension))
    if name not in derived_outputs
)

# Combine all files in the list.
netflow_data = pd.concat([pd.read_csv(f) for f in all_filenames])
# Export to CSV.
netflow_data.to_csv("Test-Data-1-Hour.csv", index = False, encoding = 'utf-8-sig')

In [None]:
# Load the concatenated NetFlow file from disk as the working DataFrame `netflow`.
netflow = pd.read_csv('/content/drive/MyDrive/Master_Thesis/Test-Netflow/1Hour/Test-Data-1-Hour.csv')

In [None]:
# Show the (rows, columns) size of the loaded data.
netflow.shape

(16529, 13)

In [None]:
# Preview the first 50 flows.
netflow.head(50)

Unnamed: 0,sa,da,sp,dp,pr,td,ipkt,ibyt,opkt,obyt,stos,in,out
0,0.0.0.0,255.255.255.255,5678,5678.0,UDP,0.0,5.0,825.0,0.0,0.0,0.0,13.0,0.0
1,0.0.0.0,255.255.255.255,68,67.0,UDP,30.27,9.0,2952.0,0.0,0.0,0.0,6.0,0.0
2,104.108.144.60,44.149.43.210,443,59122.0,TCP,15.09,5.0,400.0,5.0,260.0,192.0,3.0,6.0
3,168.235.104.115,44.149.43.199,80,34518.0,TCP,0.0,1.0,88.0,1.0,60.0,192.0,3.0,6.0
4,168.235.104.115,44.149.43.199,80,34524.0,TCP,0.0,1.0,88.0,1.0,60.0,192.0,3.0,6.0
5,168.235.104.115,44.149.43.199,80,34526.0,TCP,0.0,1.0,88.0,1.0,60.0,192.0,3.0,6.0
6,168.235.104.115,44.149.43.199,80,34532.0,TCP,0.0,1.0,88.0,1.0,60.0,192.0,3.0,6.0
7,168.235.104.115,44.149.43.199,80,34534.0,TCP,0.0,1.0,88.0,1.0,60.0,192.0,3.0,6.0
8,185.199.108.153,44.149.43.199,443,50422.0,TCP,0.0,1.0,88.0,1.0,60.0,192.0,3.0,6.0
9,185.199.108.153,44.149.43.199,443,50428.0,TCP,0.0,1.0,88.0,1.0,60.0,192.0,3.0,6.0


In [None]:
# Count the flows per destination device label.
# NOTE(review): the 'Device_Dst' column is only created by the "Mapping values" cell of
# section 5 further down, so on a fresh top-to-bottom run it does not exist yet here —
# confirm whether this cell should be moved after section 5.
netflow.Device_Dst.value_counts()

Unknown_Device         459326
Mikrotik-BGP-Router    109119
Linux-Raspbian          55015
Linux-Debian            18225
Linux-Ubuntu            17197
Windows                  8476
SDR-RedPitaya            3873
VOIP                     1942
Mikrotik-Router          1896
Name: Device_Dst, dtype: int64

In [None]:
# List the column names of the working DataFrame.
# NOTE(review): the Src_value/Dst_value/Device_Src/Device_Dst columns are created in
# sections 4 and 5 further down; a stored output listing them here presumably comes
# from an out-of-order run — confirm.
netflow.columns

Index(['sa', 'da', 'sp', 'dp', 'pr', 'td', 'ipkt', 'ibyt', 'opkt', 'obyt',
       'stos', 'in', 'out', 'Src_value', 'Dst_value', 'Device_Src',
       'Device_Dst'],
      dtype='object')

In [None]:
# Save the working DataFrame without its index. The relative file name resolves against
# the current working directory.
# NOTE(review): in a top-to-bottom run the port and device label columns of sections 4
# and 5 have not been created yet at this point — confirm whether this cell belongs
# after section 5.
netflow.to_csv('NetFlowStreams.csv', index= False)

4. Create features (Src_value and Dst_value) and assign port number descriptions.

In [None]:
# Load the two reference tables used for labelling: the port number descriptions
# and the device types of the known hosts.
port_values = pd.read_csv('/content/Port numbers 1.csv')
Host_names = pd.read_csv('/content/Device-Types.csv')

In [None]:
# Build the lookup table "port number -> description" from the port reference table.
port2value = zip(port_values['Port'], port_values['Description'])
port_description = [pair for pair in port2value]
Port_value = {port: description for port, description in port_description}
Port_value

{0: 'In programming APIs- requests a system-allocated (dynamic) port',
 1: 'TCP Port Service Multiplexer (TCPMUX)',
 2: 'CompressNET Management Utility(official)',
 3: 'CompressNET Compression Process(Official)',
 5: 'Remote Job Entry',
 7: 'Echo Protocol',
 9: 'Discard Protocol',
 10: 'Wake-on-LAN',
 11: 'Active Users (systat service)',
 13: 'Daytime Protocol',
 15: 'Previously netstat service',
 17: 'Quote of the Day (QOTD)',
 18: 'Message Send Protocol',
 19: 'Character Generator Protocol (CHARGEN)',
 20: 'File Transfer Protocol (FTP) data transfer',
 21: 'File Transfer Protocol (FTP) control (command)',
 22: 'Secure Shell (SSH)',
 23: 'Telnet protocol—unencrypted text communications',
 25: 'Simple Mail Transfer Protocol (SMTP)',
 28: "Palo Alto Networks' Panorama High Availability (HA)",
 37: 'Time Protocol',
 42: 'Host Name Server Protocol',
 43: 'WHOIS protocol',
 49: 'TACACS Login Host protocol',
 51: 'Historically used for Interface Message Processor ',
 52: 'Xerox Network Syst

In [None]:
# Translate the source and destination port numbers into their descriptions;
# ports that are missing from the lookup table are labelled "Unassigned".
netflow['Src_value'] = netflow['sp'].map(Port_value).fillna("Unassigned")
netflow['Dst_value'] = netflow['dp'].map(Port_value).fillna("Unassigned")

5. Create features (Device_Src and Device_Dst) and assign device labels.

In [None]:
# Build the lookup table "IP address -> device type" from the device reference table.
host2value = zip(Host_names['IP'], Host_names['Type'])
Host_description = [pair for pair in host2value]
host_value = {address: device_type for address, device_type in Host_description}
host_value

{'44.149.43.129': 'Mikrotik-BGP-Router',
 '44.149.43.130': 'Mikrotik-Router',
 '44.149.43.131': 'Mikrotik-Router',
 '44.149.43.132': 'Linux-Ubuntu',
 '44.149.43.134': 'Linux-Raspbian',
 '44.149.43.136': 'Linux-Ubuntu',
 '44.149.43.193': 'Mikrotik-BGP-Router',
 '44.149.43.194': 'Mikrotik-Radio',
 '44.149.43.195': 'Mikrotik-Radio',
 '44.149.43.196': 'Mikrotik-Router',
 '44.149.43.197': 'SDR-RedPitaya',
 '44.149.43.198': 'Linux-Raspbian',
 '44.149.43.199': 'Linux-Debian',
 '44.149.43.200': 'Mikrotik-Router',
 '44.149.43.202': 'SDR-RedPitaya',
 '44.149.43.203': 'VOIP',
 '44.149.43.204': 'Linux-Ubuntu',
 '44.149.43.206': 'SDR-RedPitaya',
 '44.149.43.207': 'Linux-Ubuntu',
 '44.149.43.208': 'Linux-Debian',
 '44.149.43.209': 'Linux-Ubuntu',
 '44.149.43.210': 'Windows',
 '44.149.43.211': 'SDR-RedPitaya',
 '44.149.43.213': 'SDR-RedPitaya'}

In [None]:
# Label the source and destination address of every flow with its device type;
# addresses that are missing from the lookup table are labelled "Unknown_Device".
netflow['Device_Src'] = netflow['sa'].map(host_value).fillna("Unknown_Device")
netflow['Device_Dst'] = netflow['da'].map(host_value).fillna("Unknown_Device")

In [None]:
# Show the (rows, columns) size after the label columns were added.
netflow.shape

6. Divide the dataset into data streams of "Host-class-Devices" and "Unknown-Devices".

In [None]:
# Keep only the flows whose source device is labelled "Unknown_Device".
netflow_unknown = netflow.loc[netflow['Device_Src'].eq('Unknown_Device')]

In [None]:
# Show the (rows, columns) size of the unknown-device subset.
netflow_unknown.shape

In [None]:
# Save the unknown-device flows without the index; the relative file name resolves
# against the current working directory.
netflow_unknown.to_csv('netflow_unknown.csv', index= False)

In [None]:
# Keep only the flows whose source device has a known label, i.e. is not "Unknown_Device".
netflow_host_class = netflow.loc[netflow['Device_Src'].ne('Unknown_Device')]

In [None]:
# Show the (rows, columns) size of the host-class subset.
netflow_host_class.shape

In [None]:
# Save the host-class flows without the index; the relative file name resolves
# against the current working directory.
netflow_host_class.to_csv('testdata-1-hour.csv', index= False)