In [29]:
import scapy.all as sp
from scapy.all import rdpcap, TCP, IP
from decimal import Decimal, getcontext
import numpy as np
import pandas as pd
import glob
import os
import dnslib
import concurrent.futures

## Iterating through a pcap trace

In [4]:
pcap_file = "RequetDataSet/A/PCAP_FILES/baseline_Jan17_exp_30.pcap"
# Load the pcap into memory
trace = sp.rdpcap(pcap_file)
# Iterate over the first ten packets
for packet in trace[:1]:
  # Print the IP HEADER
  if packet.haslayer(sp.IP):
    packet.show()
  if sp.TCP in packet:
    print(str(packet[sp.TCP])[:4*packet[sp.TCP].dataofs])

# Try print the TCP Header Only

###[ Ethernet ]### 
  dst       = b8:8d:12:10:aa:8e
  src       = 14:91:82:29:3b:57
  type      = IPv4
###[ IP ]### 
     version   = 4
     ihl       = 5
     tos       = 0x0
     len       = 60
     id        = 0
     flags     = DF
     frag      = 0
     ttl       = 64
     proto     = tcp
     chksum    = 0xb6ac
     src       = 192.168.1.1
     dst       = 192.168.1.190
     \options   \
###[ TCP ]### 
        sport     = microsoft_ds
        dport     = 52202
        seq       = 1667711449
        ack       = 3063488696
        dataofs   = 10
        reserved  = 0
        flags     = SAE
        window    = 14480
        chksum    = 0xba18
        urgptr    = 0
        options   = [('MSS', 1460), ('SAckOK', b''), ('Timestamp', (6670394, 1021018435)), ('NOP', None), ('WScale', b'')]

TCP 192.168.1.1:microsoft_ds > 192.168.1


In [6]:
with sp.PcapReader(pcap_file) as trace:
    count = 0
    for packet in trace:
        count += 1
    print(f"Total packets read: {count}")

Total packets read: 111795


In [18]:
with sp.PcapReader(pcap_file) as trace:
    for packet in trace:
        if packet.haslayer(sp.UDP) and packet[sp.UDP].sport == 53:
            print("DNS response packet found")
            break

    print("No DNS response packet")

DNS response packet found
No DNS response packet


In [52]:
import json
from decimal import Decimal

def feature_extract_txt(file_path):
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()

        # Metrics Initialization
        filename = file_path.split('/')[-1]
        total_bytes = 0
        chunks = []
        inter_arrival_times = []
        last_time = 0.0

        # Process each line
        for line in lines:
            data = json.loads(line.strip())
            relative_time = float(data[0])  # End of the interval in seconds
            packets_sent = int(data[1])
            packets_received = int(data[2])
            bytes_sent = int(data[3])
            bytes_received = int(data[4])
            network_info = data[5]  # List of lists with network info
            playback_info = data[6]  # Playback info list

            # Compute total bytes sent in the session (assuming total bytes are the sum of bytes sent)
            total_bytes += bytes_sent

            # Chunk size here can be defined as the bytes sent in each interval
            chunks.append(bytes_sent)

            # Calculate inter-arrival times (assuming 'relative_time' is in seconds and represents the end of each 100ms interval)
            if last_time != 0:
                inter_arrival_time = relative_time - last_time
                inter_arrival_times.append(inter_arrival_time)
            last_time = relative_time

        # Calculate metrics
        average_bytes_per_chunk = np.mean(chunks) if chunks else 0
        max_bytes_per_chunk = np.max(chunks) if chunks else 0
        min_bytes_per_chunk = np.min(chunks) if chunks else 0
        std_bytes_per_chunk = np.std(chunks) if chunks else 0

        # Assuming no retransmissions in this format
        retransmitted_chunk_numbers = 0

        # Bitrate (bytes per second)
        total_time = relative_time  # last recorded relative_time
        bitrate = (total_bytes / total_time) * 8 if total_time > 0 else 0  # Convert bytes to bits for bitrate

        # Time-based metrics (in seconds)
        average_time_between_chunks = np.mean(inter_arrival_times) if inter_arrival_times else 0
        std_time_between_chunks = np.std(inter_arrival_times) if inter_arrival_times else 0

        # Return computed metrics
        return [
            filename, total_bytes,
            average_bytes_per_chunk, max_bytes_per_chunk, min_bytes_per_chunk, std_bytes_per_chunk,
            0, 0,  # Placeholder for average and std transfer time as it's not applicable here
            average_time_between_chunks, std_time_between_chunks,
            retransmitted_chunk_numbers, bitrate
        ]

    except Exception as e:
        print(f'Error handling file {filename}: {e}')
        return None


# Example usage:
features = feature_extract_txt('./RequetDataSet/A/MERGED_FILES/baseline_Jan17_exp_30_merged.txt')
print(features)


Error handling file baseline_Jan17_exp_30_merged.txt: Expecting ',' delimiter: line 1 column 29 (char 28)
None


## BASIC packets analysis

In [35]:
def feature_extract(pcap_file):

    def decimal_mean(values):
        """Calculate the mean of a list of Decimal values."""
        total = sum(values)
        count = Decimal(len(values))
        return total / count
    
    def decimal_std(values):
        """Calculate the mean of a list of Decimal values."""
        total = sum(values)
        count = Decimal(len(values))
        
        mean = total/ count
        var = sum((x - mean) ** 2 for x in values) / len(values)
        std = var.sqrt()
        return std
    
    filename = pcap_file.split('/')[-1]

    # Load the pcap file
    try: 
        packets = rdpcap(pcap_file)

        # Filter TCP packets related to video streaming
        tcp_packets = [pkt for pkt in packets if TCP in pkt and IP in pkt]

        # Define lists to store data for analysis
        chunk_sizes = []
        inter_arrival_times = []
        transfer_times = []
        retransmissions = set()

        total_bits = 0
        start_time = float('inf')
        end_time = float('-inf')

        # Tracking variables
        last_ack = 0
        last_seq = {}
        last_time = tcp_packets[0].time
        chunk_start_time = None

        # Analyzing packets
        for i, packet in enumerate(tcp_packets):
            if TCP in packet:
                total_bits += len(packet[
                                TCP].payload) * 8  # Convert bytes to bits
                start_time = min(start_time, packet.time)
                end_time = max(end_time, packet.time)
            
            seq = packet[TCP].seq
            ack = packet[TCP].ack
            length = len(packet[TCP].payload)
            time = packet.time

            # Track retransmissions
            if seq in last_seq and time - last_seq[seq] < 1:
                retransmissions.add(seq)

            last_seq[seq] = time

            # Track chunk sizes and transfer times
            if ack > last_ack:
                if chunk_start_time is not None:
                    transfer_time = time - chunk_start_time
                    transfer_times.append(transfer_time)
                
                chunk_size = ack - last_ack
                chunk_sizes.append(chunk_size)
                
                inter_arrival_time = time - last_time
                inter_arrival_times.append(inter_arrival_time)
                
                chunk_start_time = time
                last_time = time

            last_ack = ack

        # Calculate desired metrics
        total_bytes = sum(chunk_sizes)
        max_bytes_per_chunk = np.max(chunk_sizes) if chunk_sizes else 0
        min_bytes_per_chunk = np.min(chunk_sizes) if chunk_sizes else 0
        std_bytes_per_chunk = np.std(chunk_sizes),
        average_bytes_per_chunk = np.mean(chunk_sizes) if chunk_sizes else 0
        average_transfer_time = decimal_mean(transfer_times) if transfer_times else 0
        std_transfer_time = decimal_std(transfer_times) if transfer_times else 0
        average_time_between_chunks = decimal_mean(inter_arrival_times[1:]) if len(inter_arrival_times) > 1 else 0  # Skip first
        std_time_between_chunks = decimal_std(inter_arrival_times[1:]) if len(inter_arrival_times) > 1 else 0
        retransmitted_chunk_numbers = len(retransmissions)

        total_duration = end_time - start_time

        # Calculate the bitrate in bits per second
        if total_duration > 0:
            bitrate = total_bits / total_duration
        else:
            bitrate = 0
        
        
        if chunk_sizes:
            last_10_chunk_sizes = chunk_sizes[-10:]  # Get the last 10 or fewer chunk sizes
            avg_last_10 = np.mean(last_10_chunk_sizes)
            max_last_10 = np.max(last_10_chunk_sizes)
            min_last_10 = np.min(last_10_chunk_sizes)
            std_last_10 = np.std(last_10_chunk_sizes)
            
            
        perc_50, perc_75, perc_85, perc_90 = np.percentile(last_10_chunk_sizes, [50, 75, 85, 90])
            

        return [filename, total_bytes, 
                average_bytes_per_chunk, max_bytes_per_chunk, min_bytes_per_chunk, std_bytes_per_chunk, 
                average_transfer_time, std_transfer_time, 
                average_time_between_chunks, std_time_between_chunks, 
                retransmitted_chunk_numbers, bitrate,
                avg_last_10, max_last_10, min_last_10, std_last_10, 
                perc_50, perc_75, perc_85, perc_90]
    
    except Exception as e:
        print(f'Error handling file {filename}: {e}')
        return None
        

    # Print the results
    # print("Total Bytes:", total_bytes)
    # print("Average Bytes per Chunk:", average_bytes_per_chunk)
    # print("Average Chunk Transfer Time:", average_transfer_time)
    # print("Average Time Elapsed Between Chunks:", average_time_between_chunks)
    # print("Retransmitted Chunk Numbers:", retransmitted_chunk_numbers)
    # print(f"Estimated Video Bitrate: {bitrate} bits per second")

# Further handling to calculate GET request response time
# This requires identifying HTTP GET requests and their corresponding responses, which would involve deeper packet inspection and protocol-specific parsing.

In [43]:
Decimal(Decimal(36681662874368))

Decimal('36681662874368')

In [31]:
df = pd.DataFrame(columns=['name', 'total_bytes', 
                           'average_bytes_per_chunk', 'max_bytes_per_chunk', 'min_bytes_per_chunk', 'std_bytes_per_chunk', 
                           'average_transfer_time', 'std_transfer_time',
                           'average_time_between_chunks', 'std_time_between_chunks', 
                           'retransmitted_chunk_numbers', 'bitrate'])
        

In [39]:
files = glob.glob("PCAP_FILES/baseline_*.pcap")

In [40]:
files

['PCAP_FILES/baseline_Jan17_exp_28.pcap',
 'PCAP_FILES/baseline_Jan17_exp_30.pcap',
 'PCAP_FILES/baseline_Jan17_exp_31.pcap',
 'PCAP_FILES/baseline_Jan17_exp_32.pcap',
 'PCAP_FILES/baseline_Jan17_exp_33.pcap',
 'PCAP_FILES/baseline_Jan17_exp_34.pcap',
 'PCAP_FILES/baseline_Jan17_exp_36.pcap',
 'PCAP_FILES/baseline_Jan17_exp_37.pcap',
 'PCAP_FILES/baseline_Jan17_exp_38.pcap',
 'PCAP_FILES/baseline_Jan17_exp_39.pcap',
 'PCAP_FILES/baseline_Jan17_exp_40.pcap',
 'PCAP_FILES/baseline_Jan17_exp_41.pcap',
 'PCAP_FILES/baseline_Jan17_exp_42.pcap',
 'PCAP_FILES/baseline_Jan17_exp_43.pcap',
 'PCAP_FILES/baseline_Jan17_exp_44.pcap',
 'PCAP_FILES/baseline_Jan17_exp_45.pcap',
 'PCAP_FILES/baseline_Jan17_exp_47.pcap',
 'PCAP_FILES/baseline_Jan17_exp_48.pcap',
 'PCAP_FILES/baseline_Jan17_exp_49.pcap',
 'PCAP_FILES/baseline_Jan17_exp_50.pcap',
 'PCAP_FILES/baseline_Jan17_exp_51.pcap',
 'PCAP_FILES/baseline_Jan17_exp_52.pcap',
 'PCAP_FILES/baseline_Jan17_exp_53.pcap',
 'PCAP_FILES/baseline_Jan17_exp_54

In [42]:
feature_extract("PCAP_FILES/baseline_Jan17_exp_28.pcap")

['baseline_Jan17_exp_28.pcap',
 36681662874368,
 1424475277.6345773,
 4258675425,
 1,
 (859804874.4324368,),
 Decimal('0.02319903801941747572815533981'),
 Decimal('0.1436065685039621573197830741'),
 Decimal('0.02319903801941747572815533981'),
 Decimal('0.1436065685039621573197830741'),
 1043,
 Decimal('3492.511011121613018680525803')]

In [50]:
files = glob.glob("RequetDataSet/A/PCAP_FILES/baseline_*.pcap")

import os

num_cpus = os.cpu_count()

with concurrent.futures.ProcessPoolExecutor(max_workers=max(6, num_cpus // 2)) as executor:
# filename = './RequetDataSet/A/PCAP_FILES/baseline_Jan17_exp_30.pcap'
    # with open(filename, 'rb') as f: 
    results = list(executor.map(feature_extract, files))
    # row_list1.append(newline)

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [47]:
results

<generator object _chain_from_iterable_of_lists at 0x7fb92d45a7b0>

In [30]:
len(row_list1)

32

## Identifying the service type using DNS traffic

In [None]:
## DNS queries can be used to identify to which service a remote IP belongs to.

In [22]:
import scapy.all as sp
import dnslib

# YT for example
YT_DOMAINS = ["youtube"]

# READ THE PCAP AND INSERT INTO THIS ARRAY THE IP ADDRESS THAT BELONG TO YT
youtube_ips = []

pcap_file = "data/1659055888-c0.ipsecptp.tiered.pharos-ipsecptp-a-video.720.dash.http2.pcap"

with sp.PcapReader(pcap_file) as trace:
  for packet in trace:
    # DNS Packet
    if packet.haslayer(sp.UDP) and packet[sp.UDP].sport == 53:
      # Get DNS data
      raw = sp.raw(packet[sp.UDP].payload)
      # Process the DNS query
      dns = dnslib.DNSRecord.parse(raw)
      # Iterate over answers
      for a in dns.rr:
        # Check if it's a domain of interest
        question = str(a.rname)
        if any(s in question for s in youtube):
          # Check if it's an answer
          if a.rtype == 1 or a.rtype == 28:
            print("Query {} is a Youtube query. Appending IP {} to Youtube IPs".format(question, a.rdata))
            youtube_ips.append(str(a.rdata))

print("Youtube IPs: {}".format(youtube_ips))

Youtube IPs: []


In [7]:
import scapy.all as sp
import dnslib

all_domains_ips = {}

pcap_file = "data/1659055888-c0.ipsecptp.tiered.pharos-ipsecptp-a-video.720.dash.http2.pcap"

with sp.PcapReader(pcap_file) as trace:
    for packet in trace:
        if packet.haslayer(sp.UDP) and packet[sp.UDP].sport == 53:
            raw = sp.raw(packet[sp.UDP].payload)
            dns = dnslib.DNSRecord.parse(raw)
            for a in dns.rr:
                domain_name = str(a.rname)
                if a.rtype == 1 or a.rtype == 28:
                    if domain_name not in all_domains_ips:
                        all_domains_ips[domain_name] = []
                    all_domains_ips[domain_name].append(str(a.rdata))

for domain, ips in all_domains_ips.items():
    print(f"domain name：{domain}, IP addr: {ips}")

In [9]:
all_domains_ips

{}

## Collecting network counters

In [14]:
network_counters = {}

# You can use this dictionary to collect counters
def counters():
  return {"in_pkts": 0, "out_pkts": 0, "in_bytes": 0, "out_bytes": 0}

# header is called "length"
pcap_file = "data/1659055888-c0.ipsecptp.tiered.pharos-ipsecptp-a-video.720.dash.http2.pcap"

with sp.PcapReader(pcap_file) as trace:
    for packet in trace:
        if packet.haslayer(sp.IP):
            src_ip = packet[sp.IP].src
            dst_ip = packet[sp.IP].dst
            pkt_length = packet[sp.IP].len

            if src_ip not in network_counters:
                network_counters[src_ip] = counters()
                
            network_counters[src_ip]["out_pkts"] += 1
            network_counters[src_ip]["out_bytes"] += pkt_length

            if dst_ip not in network_counters:
                network_counters[dst_ip] = counters()

            network_counters[dst_ip]["in_pkts"] += 1
            network_counters[dst_ip]["in_bytes"] += pkt_length

for ip in network_counters:
  print("IP {} generated the following amout of traffic {}".format(ip, network_counters[ip]))

IP 10.0.1.1 generated the following amout of traffic {'in_pkts': 338, 'out_pkts': 672, 'in_bytes': 94040, 'out_bytes': 182176}
IP 10.0.13.1 generated the following amout of traffic {'in_pkts': 672, 'out_pkts': 338, 'in_bytes': 182176, 'out_bytes': 94040}
IP 10.0.3.1 generated the following amout of traffic {'in_pkts': 73991, 'out_pkts': 37991, 'in_bytes': 110695512, 'out_bytes': 2076816}
IP 10.0.15.1 generated the following amout of traffic {'in_pkts': 37991, 'out_pkts': 73991, 'in_bytes': 2076816, 'out_bytes': 110695512}


## Infer segment downloads

In [16]:
## init seg. tracking
def segment():
    return {"pkts": 0, "bytes": 0}

# Read pcap and records completed video segments for each IP
completed_video_segments = {}
ongoing_video_segment = {}

with sp.PcapReader(pcap_file) as trace:
    for packet in trace:
        # Process only TCP packets that carry a payload
        if packet.haslayer(sp.TCP) and packet.haslayer(sp.IP):
            ip_len = packet[sp.IP].len
            ip_header_len = 4 * packet[sp.IP].ihl
            tcp_header_len = 4 * packet[sp.TCP].dataofs
            payload_size = ip_len - ip_header_len - tcp_header_len

            if payload_size > 0:
                src_ip = packet[sp.IP].src
                dst_ip = packet[sp.IP].dst

                # Assume packets to a Youtube IP are video segment downloads
                # This is a simplified check and would normally require more robust identification logic
                if dst_ip in youtube_ips:  
                    if dst_ip not in ongoing_video_segment:
                        ongoing_video_segment[dst_ip] = segment()
                    
                    ongoing_video_segment[dst_ip]["pkts"] += 1
                    ongoing_video_segment[dst_ip]["bytes"] += payload_size

                    # Example condition to mark a segment as completed (e.g., based on packet counts or bytes)
                    if ongoing_video_segment[dst_ip]["pkts"] >= 100:  # This threshold is arbitrary
                        if dst_ip not in completed_video_segments:
                            completed_video_segments[dst_ip] = []
                        completed_video_segments[dst_ip].append(ongoing_video_segment[dst_ip])
                        ongoing_video_segment[dst_ip] = segment()  # Reset for a new segment

# After capturing all segments, print the number of segments found for each Youtube IP
for ip in completed_video_segments.keys():
    print(f"IP {ip} downloaded {len(completed_video_segments[ip])} segments")

In [19]:
completed_video_segments, youtube_ips

({}, [])