# Analyze Pcap

## Imports & Setup

In [None]:
from datetime import timedelta
import pandas

from steelscript.netshark.core import NetShark
from steelscript.netshark.core.types import Value, Key
from steelscript.netshark.core.filters import NetSharkFilter, TimeFilter
from steelscript.common import UserAuth

from steelscript.common.datautils import bytes2human

In [None]:
# optional debug logging
import logging

# %(message)s is the log message with its arguments merged in; %(msg)s would
# print the raw, un-interpolated format string of each logging call instead.
log_format = "%(asctime)s [%(levelname)-8.5s] (%(name)s) %(message)s"

logger = logging.getLogger()
logging.basicConfig(format=log_format, level=logging.DEBUG)
# basicConfig() does nothing when the root logger already has handlers,
# so the level is also set explicitly on the root logger.
logger.setLevel(logging.DEBUG)

In [None]:
# Placeholder connection settings: replace these with the hostname and
# credentials of the NetShark appliance to connect to.
hostname = "NETSHARK.HOSTNAME.COM"
username = "USERNAME"
password = "PASSWORD"

## Connect to the NetShark device

In [None]:
# Wrap the username/password in a UserAuth object and hand it to NetShark
# together with the hostname.
credentials = UserAuth(username, password)
netshark = NetShark(hostname, auth=credentials)
netshark

## Pick the source

Choose the first running job.

In [None]:
# Take the first capture job whose reported state is RUNNING; source is
# None when no job is running.
jobs = netshark.get_capture_jobs()
source = next(
    (job for job in jobs if job.get_status()['state'] == 'RUNNING'),
    None)
source

In [None]:
# Server port to keep; the same value is interpolated into the filter below.
srvport = 443
port_expr = 'tcp.server_port=%d' % srvport
portfilter = NetSharkFilter(port_expr)

# Time window for the export, parsed from the range string 'last 10s'.
timefilter = TimeFilter.parse_range('last 10s')

In [None]:
# Create an export on the chosen source, restricted by the time filter and
# the server-port filter defined above.
export = netshark.create_export(source, timefilter, filters=[portfilter])

Note that the export details do not reflect the port filter (`tcp.server_port=443`) -- they represent the maximum possible size of the resulting packet set until the export is actually used below.

In [None]:
# Display the details reported for the export.
export.details()

The `export` object is used to download the packets locally to disk.  Once the download occurs, the export object is deleted on the NetShark side of things.

In [None]:
# Download the exported packets to a local pcap file.
# NOTE(review): overwrite=True presumably replaces an existing file at this
# path -- confirm against the steelscript documentation.
export.download('/tmp/telnet.pcap', overwrite=True)

## Analyze a pcap file using Wireshark

(or more precisely, tshark -- the command line version of wireshark)

The module `steelscript.wireshark` provides a Python wrapper around tshark, making it easy to inspect and perform queries on pcap files.

In [None]:
from steelscript.wireshark.core.pcap import PcapFile

# Open the pcap file that was downloaded from the NetShark export.
pcap = PcapFile('/tmp/telnet.pcap')
# NOTE(review): info() is called before the attributes below are read;
# presumably it loads the capture summary -- confirm.
pcap.info()
# The parenthesized single-argument form prints the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(pcap.starttime)
print(pcap.endtime)
print(pcap.numpackets)

The `TSharkFields` class allows for searching for fields by name or protocol:

In [None]:
# TSharkFields instance used by the field-name lookups in the next cells.
from steelscript.wireshark.core.pcap import TSharkFields
fields = TSharkFields()

In [None]:
# Look up fields whose name matches the regular expression ^tcp.*rtt
fields.find(name_re='^tcp.*rtt')

In [None]:
# Look up fields whose name matches the regular expression ^tcp.*flight
fields.find(name_re='^tcp.*flight')

## Query the pcap file for a set of packet fields

In [None]:
# Packet fields to extract, one DataFrame column per field.
query_fields = [
    'frame.time_epoch',
    'ip.src', 'ip.dst', 'ip.len',
    'tcp.srcport', 'tcp.dstport',
    'tcp.flags', 'tcp.analysis.ack_rtt',
]

# Query only TCP packets and ask for the result as a pandas DataFrame.
df = pcap.query(query_fields, filterexpr='tcp', as_dataframe=True)
df[:10]

### Filter for rows that have an RTT

In [None]:
# Boolean mask that is True for rows where an ack RTT value is present.
rtt_column = df['tcp.analysis.ack_rtt']
has_rtt = rtt_column.notnull()
df[has_rtt][:10]

### Assign client/server IP based on port

In [None]:
def assign_col(df, expr, dst, src_true, src_false):
    """Set column ``dst`` of ``df`` in place from one of two source columns.

    For every row where ``expr`` is True the value is taken from column
    ``src_true``; for every row where ``expr`` is False it is taken from
    column ``src_false``.

    Args:
        df: DataFrame to modify; column ``dst`` is created or overwritten.
        expr: boolean Series (or array-like) with one value per row of ``df``.
        dst: name of the column to assign.
        src_true: name of the column used where ``expr`` is True.
        src_false: name of the column used where ``expr`` is False.
    """
    # Series.where keeps the src_true value where expr holds and substitutes
    # the src_false value elsewhere.  This replaces the removed .ix indexer
    # and builds the column in one step, so integer columns are not upcast
    # to float by a temporary NaN fill.
    df[dst] = df[src_true].where(expr, df[src_false])

In [None]:
# This boolean expression assigns a True/False value for each row
# based on tcp.srcport.  The export was filtered on the server port
# `srvport`, so if the source port equals srvport the source is the
# server.  Otherwise the destination is the server.
# (Comparing against srvport instead of a hard-coded port keeps this mask
# consistent with the filter that produced the capture.)
srv_src = (df['tcp.srcport'] == srvport)

srv_src[:10]

### Now we can use this "expression" to set the client/server IP and port.

In [None]:
# Where the server is the source (srv_src True), the client is the
# destination; otherwise the client is the source.
assign_col(df, srv_src, 'ip.cli',      'ip.dst',      'ip.src')
assign_col(df, srv_src, 'ip.srv',      'ip.src',      'ip.dst')
assign_col(df, srv_src, 'tcp.cliport', 'tcp.dstport', 'tcp.srcport')

# Keep only the columns used by the rest of the analysis.  .loc is the
# label-based replacement for the .ix indexer, which pandas has removed.
df = df.loc[:, ['frame.time_epoch', 'ip.srv', 'ip.cli', 'tcp.cliport',
                'ip.len', 'tcp.flags', 'tcp.analysis.ack_rtt']]
df[:10]

### Group by conversation

In this case a unique conversation is defined by the triplet: client IP, server IP, and client port (the server port is always the filtered port, so we don't bother including it).

In [None]:
# Group the packets by the conversation triplet: client IP, server IP and
# client port.
gb = df.groupby(['ip.cli', 'ip.srv', 'tcp.cliport'])

The `gb` object itself is not an interesting result -- instead, you then frequently use it to aggregate rows that have the same index (the triplet)

In [None]:
# Per conversation: total ip.len, plus min/mean/max and the number of
# samples of the ack RTT.
rtt_stats = ['min', 'mean', 'max', 'count']
column_aggregations = {
    'ip.len': 'sum',
    'tcp.analysis.ack_rtt': rtt_stats,
}
agg = gb.agg(column_aggregations)
agg

### Let's plot some of this

In [None]:
# IPython magic: render the plots produced by the cells below inline.
%pylab inline

Compute the index of the row in the aggregate data that has the highest number of samples

In [None]:
# Index label (the client/server/port triplet) of the conversation with the
# most RTT samples.  idxmax() returns the label of the largest count
# directly, replacing DataFrame.sort(columns=...), which pandas has removed.
rtt_counts = agg[('tcp.analysis.ack_rtt', 'count')]
rtt_index = rtt_counts.idxmax()

Now index into the original `df` using the `rtt_index` -- this allows us to filter on only the packets that were associated with the client/server/port index value.

In [None]:
# Re-index the packets by the same key columns that index the aggregate.
conversation_keys = agg.index.names
clisrv = df.set_index(conversation_keys)

# Keep only the packets whose index value equals rtt_index.
in_selected_conversation = clisrv.index.isin([rtt_index])
s = clisrv[in_selected_conversation]

Index this filtered data set on time and graph packet size (i.e. `ip.len`)

In [None]:
# Use the packet timestamp column as the index, then plot ip.len against it.
s = s.set_index('frame.time_epoch')
s.plot(y=['ip.len'])

Now plot a few other interesting metrics, like computing the byte rate -- the sum of `ip.len` (in bytes) across all rows in a 1 second interval:

In [None]:
# Sum ip.len over 1-second bins and plot the result.  The aggregation is
# applied to the object returned by resample(); passing it as a second
# argument to resample() is no longer supported by pandas.
s.resample('1s').agg({'ip.len': 'sum'}).plot()

In [None]:
# Plot the ack RTT column of the selected conversation.
s.plot(y=['tcp.analysis.ack_rtt'])

In [None]:
# Min/mean/max ack RTT over 100-millisecond bins.  The aggregation is applied
# to the object returned by resample() (the two-argument form is no longer
# supported), and the 'ms' alias replaces the deprecated 'L' alias.
s.resample('100ms').agg({'tcp.analysis.ack_rtt': ['min', 'mean', 'max']}).plot()