# Extract network protocol fields from a pcap file and write them to a CSV file for data analysis, using the Python Scapy tool.

#### This should be run before Network_Traffic-Data_Analysis.ipynb notebook

###### Import pandas and the MyScapyExtract helper module, which uses scapy to decode and extract protocol fields from the pcap file.

In [1]:
import pandas as pd
import MyScapyExtract as myscap

#### Read the whole pcap file into memory using scapy rdpcap().

The packet data analysed here is taken from 
https://www.netresec.com/?page=MACCDC   
File : maccdc2012_00000.pcap.gz

The original pcap file had to be split to reduce its size from 8 million packets to 6 million packets. Even with the reduced data set of 6 million packets, this step takes more than an hour to complete, so please be patient and wait until the pcap file has been read.

In [3]:
# Name of the pcap capture to load (the split MACCDC 2012 file described above).
file0 = 'maccsplit_00000_20120316180000'

# Load the packets from the capture via the MyScapyExtract helper; this is the slow step.
packets = myscap.scapy_read_packets(file0)

In [4]:
# Sanity check: total number of packets loaded.
print(len(packets))
# Displaying a slice shows a per-protocol count summary for the first 1000 packets.
packets[0:1000]

6000000


<mod maccsplit_00000_20120316180000: TCP:978 UDP:7 ICMP:2 Other:13>

In [5]:
# Packet count again (same value as printed in the previous cell).
len(packets)

6000000

In [6]:
# Convert the loaded packets into a list of flat dicts of protocol fields
# (the result has one entry per packet; field values are strings).
datalst = myscap.parse_scapy_packets(packets)

In [7]:
# Number of parsed records; expected to match the packet count.
len(datalst)

6000000

In [8]:
# Peek at the first two parsed records to inspect the extracted field names.
datalst[0:2]

[{'id': '0',
  'len': '117',
  'timestamp': '2012-03-16 18:00:00.000000',
  'esrc': '00:16:47:9d:f2:c2',
  'edst': '00:0c:29:41:4b:e7',
  'etype': '33024',
  'vlan': '120',
  'isrc': '192.168.229.254',
  'idst': '192.168.202.79',
  'iproto': '6',
  'iplen': '99',
  'ipttl': '254',
  'tsport': '443',
  'tdport': '46117',
  'twindow': '32768'},
 {'id': '1',
  'len': '269',
  'timestamp': '2012-03-16 18:00:00.000000',
  'esrc': '00:0c:29:41:4b:e7',
  'edst': '00:16:47:9d:f2:c2',
  'etype': '33024',
  'vlan': '120',
  'isrc': '192.168.202.79',
  'idst': '192.168.229.254',
  'iproto': '6',
  'iplen': '251',
  'ipttl': '64',
  'tsport': '46117',
  'tdport': '443',
  'twindow': '15544'}]

There was an error with the SNMP packet data when I tried to save the data as CSV and read it back, so the SNMP packets are removed from the data list.

In [9]:
# Keep only the records that do not carry an 'snmpversion' key (drops the SNMP packets).
datalst = [record for record in datalst if 'snmpversion' not in record]

In [10]:
# Show only a small sample: echoing the full list would dump millions of
# records into the notebook output and bloat the saved file.
datalst[0:3]

[{'id': '0',
  'len': '117',
  'timestamp': '2012-03-16 18:00:00.000000',
  'esrc': '00:16:47:9d:f2:c2',
  'edst': '00:0c:29:41:4b:e7',
  'etype': '33024',
  'vlan': '120',
  'isrc': '192.168.229.254',
  'idst': '192.168.202.79',
  'iproto': '6',
  'iplen': '99',
  'ipttl': '254',
  'tsport': '443',
  'tdport': '46117',
  'twindow': '32768'},
 {'id': '1',
  'len': '269',
  'timestamp': '2012-03-16 18:00:00.000000',
  'esrc': '00:0c:29:41:4b:e7',
  'edst': '00:16:47:9d:f2:c2',
  'etype': '33024',
  'vlan': '120',
  'isrc': '192.168.202.79',
  'idst': '192.168.229.254',
  'iproto': '6',
  'iplen': '251',
  'ipttl': '64',
  'tsport': '46117',
  'tdport': '443',
  'twindow': '15544'},
 {'id': '2',
  'len': '70',
  'timestamp': '2012-03-16 18:00:00.000000',
  'esrc': '00:0c:29:41:4b:e7',
  'edst': '00:16:47:9d:f2:c2',
  'etype': '33024',
  'vlan': '120',
  'isrc': '192.168.202.79',
  'idst': '192.168.229.251',
  'iproto': '6',
  'iplen': '52',
  'ipttl': '64',
  'tsport': '50463',
  'tdport

##### Convert the data list into a pandas DataFrame

In [12]:
# Optional memory clean-up, left disabled: uncomment to drop a previously built
# frame and/or the raw packet list before constructing the DataFrame.
#del datadf
#del packets
# Build the frame from the list of record dicts: one row per record, one column per
# distinct key; fields a record does not have show up as missing values.
datadf = pd.DataFrame(datalst)

In [13]:
# Report (rows, columns) and preview the first ten rows.
print(datadf.shape)
datadf.head(10)

(5998837, 39)


Unnamed: 0,id,len,timestamp,esrc,edst,etype,vlan,isrc,idst,iproto,...,bootpgiaddr,bootpchaddr,dhcpoptions,ntpmode,arpop,arpsrc,arpdst,arppsrc,arppdst,dnsopcode
0,0,117,2012-03-16 18:00:00.000000,00:16:47:9d:f2:c2,00:0c:29:41:4b:e7,33024,120,192.168.229.254,192.168.202.79,6,...,,,,,,,,,,
1,1,269,2012-03-16 18:00:00.000000,00:0c:29:41:4b:e7,00:16:47:9d:f2:c2,33024,120,192.168.202.79,192.168.229.254,6,...,,,,,,,,,,
2,2,70,2012-03-16 18:00:00.000000,00:0c:29:41:4b:e7,00:16:47:9d:f2:c2,33024,120,192.168.202.79,192.168.229.251,6,...,,,,,,,,,,
3,3,70,2012-03-16 18:00:00.000000,00:16:47:9d:f2:c2,00:0c:29:41:4b:e7,33024,120,192.168.229.254,192.168.202.79,6,...,,,,,,,,,,
4,4,78,2012-03-16 18:00:00.000000,00:0c:29:41:4b:e7,00:16:47:9d:f2:c2,33024,120,192.168.202.79,192.168.229.251,6,...,,,,,,,,,,
5,5,217,2012-03-16 18:00:00.000000,00:0c:29:41:4b:e7,00:16:47:9d:f2:c2,33024,120,192.168.202.79,192.168.229.153,6,...,,,,,,,,,,
6,6,70,2012-03-16 18:00:00.000000,00:16:47:9d:f2:c2,00:0c:29:41:4b:e7,33024,120,192.168.229.251,192.168.202.79,6,...,,,,,,,,,,
7,7,178,2012-03-16 18:00:00.000000,00:16:47:9d:f2:c2,00:0c:29:41:4b:e7,33024,120,192.168.229.254,192.168.202.79,6,...,,,,,,,,,,
8,8,82,2012-03-16 18:00:00.000000,00:16:47:9d:f2:c2,00:0c:29:41:4b:e7,33024,120,192.168.229.251,192.168.202.79,6,...,,,,,,,,,,
9,9,120,2012-03-16 18:00:00.000000,00:16:47:9d:f2:c2,00:0c:29:41:4b:e7,33024,120,192.168.229.254,192.168.202.79,6,...,,,,,,,,,,


In [14]:
# List every extracted field name alphabetically (head() elides the middle columns).
print(sorted(datadf.columns))

['arpdst', 'arpop', 'arppdst', 'arppsrc', 'arpsrc', 'bootpchaddr', 'bootpciaddr', 'bootpgiaddr', 'bootpop', 'bootpsiaddr', 'bootpyiaddr', 'dhcpoptions', 'dnsopcode', 'edst', 'esrc', 'etype', 'icmpcode', 'icmptype', 'id', 'idst', 'iperrordst', 'iperrorproto', 'iperrorsrc', 'iplen', 'iproto', 'ipttl', 'isrc', 'len', 'ntpmode', 'tdport', 'timestamp', 'tsport', 'twindow', 'uerrordst', 'uerrorsrc', 'ulen', 'utdport', 'utsport', 'vlan']


##### Write the data frame to a csv file, so that it can be read back for further analysis.

In [15]:

# Output path for the extracted fields, relative to the notebook's working directory.
df_file = "pcaps.csv"

# Persist the frame as CSV for the follow-up analysis notebook.
# NOTE(review): index=False is not passed, so the row index should be written as an
# extra unnamed first column (pandas default) - confirm the downstream reader expects it.
datadf.to_csv(df_file)

###### Please run Network_Traffic-Data_Analysis-1.ipynb after this