In [1]:
# import libraries
import csv
import pandas as pd
import re
from pandas.core.frame import DataFrame
from pathlib import Path
import os
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
from statsmodels import robust
import json

In [2]:
def server_check(file_name):
    '''
    Return the host name of the server a measurement file refers to.

    The file name is searched for known tags (substring match). The tags are
    tried in a fixed order and the first one found decides the result, so a
    name containing several tags resolves to the earliest entry in the table.

    Parameters
    ----------
    file_name : str
        Name of the measurement file, e.g. one containing 'nameserver1'.

    Returns
    -------
    str
        Host name of the matching server, or 'www.superhappy.com' when the
        file name contains none of the known tags.
    '''

    # Ordered (tags, host) pairs; the order is the matching precedence.
    known_servers = (
        (('nameserver1',), 'e.ext.nic.fr'),
        (('nameserver2',), 'f.ext.nic.fr'),
        (('nameserver3',), 'g.ext.nic.fr'),
        (('res1',), 'lej-de.ark.caida.org'),
        # Fixed: the host name previously ended in a stray 'f' ('...orgf').
        (('res2',), 'per-au.ark.caida.org'),
        (('res3',), 'san-us.ark.caida.org'),
        (('iperf1', 'th1'), 'ok1.iperf.comnet-student.eu'),
        (('iperf2', 'th2'), 'blr1.iperf.comnet-student.eu'),
    )

    for tags, host in known_servers:
        if any(tag in file_name for tag in tags):
            return host

    # Fallback used when no tag matches.
    return 'www.superhappy.com'

## pre-processing
Latency data sets (AS1.x), where x covers: <br>
3 name servers, measured with DNS (d1, d2, d3) and ICMP (n1, n2, n3), <br>
3 research servers (r1, r2, r3), <br>
2 iperf servers (i1, i2). <br>

In [3]:
# Some changes have been made to the original code from assignment1 demo
def from_pings(file_name, output):
    '''
    Convert a raw ping log from t3_data/ into a CSV file.

    Two kinds of lines are picked up:
    * lines containing '64 bytes from' are stored as answered probes;
    * lines containing 'no answer yet' are stored as lost probes, with both
      TTL and RTT set to +inf.
    Every other line is ignored. Rows are sorted by the "Timestamp" column
    before being written.

    Parameters
    ----------
    file_name : str
        Name of the log inside the t3_data/ directory.
    output : str
        Path of the CSV file to write.

    Notes
    -----
    Fields are taken by position after splitting the line on spaces and '='.
    # NOTE(review): the positions presumably match `ping -D -O` output with a
    # resolved host name and address on reply lines - confirm against the logs.
    '''

    # The target depends only on the file name, so look it up once.
    target = server_check(file_name)

    rows = []

    # `with` guarantees the log is closed even if parsing fails midway.
    with open("t3_data/" + file_name, 'r') as log:
        for line in log:
            answered = '64 bytes from' in line
            lost = 'no answer yet' in line
            if not (answered or lost):
                continue

            # Strip the line end first: on 'no answer yet' lines the sequence
            # number is the last token and would otherwise keep its newline.
            fields = re.split(' |=', line.rstrip())
            time = fields[0][1:-1]  # drop the first and last character of the first token

            if answered:
                rows.append((time, target, 'ping', fields[7], fields[9], fields[11]))

            if lost:
                # Use inf to indicate packet loss
                rows.append((time, target, 'ping', fields[6], float("+inf"), float("+inf")))

    # Convert the rows to a DataFrame
    df = DataFrame(rows, columns=["Timestamp",
                                  "Target",
                                  "Type of measurement",
                                  "ICMP seq",
                                  "TTLs",
                                  "RTTs (ms)"])
    df = df.sort_values(by=["Timestamp"])

    # Write DataFrame to a csv file
    df.to_csv(output, index=False)

In [4]:
def from_dig(file_name, output):
    '''
    Convert a raw dig log from t3_data/ into a CSV file.

    The second space-separated token of every line containing 'Start' is
    stored as a timestamp, and the fourth space-separated token of every line
    containing 'Query time' is stored as a query time. Rows are sorted by the
    "Timestamp" column before being written.

    Parameters
    ----------
    file_name : str
        Name of the log inside the t3_data/ directory.
    output : str
        Path of the CSV file to write.

    Raises
    ------
    ValueError
        Raised by pandas when the number of 'Start' lines differs from the
        number of 'Query time' lines, because the columns then have unequal
        lengths.
    '''

    # The target depends only on the file name, so look it up once.
    target = server_check(file_name)

    # Timestamps and query times come from different lines, so they are
    # collected separately and only paired up by position in the DataFrame.
    timestamp_list = []
    query_times = []

    # `with` guarantees the log is closed even if parsing fails midway.
    with open("t3_data/" + file_name, 'r') as log:
        for line in log:
            if 'Start' in line:
                # strip() removes the newline when the timestamp is the last token
                timestamp_list.append(line.split(' ')[1].strip())

            if 'Query time' in line:
                query_times.append(line.split(' ')[3].strip())

    # Convert the lists to a DataFrame
    data = {"Timestamp": timestamp_list,
            "Target": [target] * len(query_times),
            "Type of measurement": ['dns'] * len(query_times),
            "Query time (ms)": query_times}

    df = DataFrame(data)
    df = df.sort_values(by=["Timestamp"])

    # Write DataFrame to a csv file
    df.to_csv(output, index=False)

In [5]:
def from_curl(file_name, output):
    '''
    Convert a raw curl log from t3_data/ into a CSV file.

    The second space-separated token of every line containing 'Start' is
    stored as a timestamp. Every other non-blank line is treated as a
    comma-separated record from which the following are derived:
    * TCP latency  = field 1 - field 0 (time_connect minus time_namelookup);
    * download size  = field 5;
    * download speed = field 4 multiplied by 8.
    Rows are sorted by the "Timestamp" column before being written.

    Parameters
    ----------
    file_name : str
        Name of the log inside the t3_data/ directory.
    output : str
        Path of the CSV file to write.

    Raises
    ------
    ValueError
        If a record field is not numeric, or (raised by pandas) when the
        number of 'Start' lines differs from the number of record lines.
    IndexError
        If a record line has fewer than six comma-separated fields.
    '''

    # The target depends only on the file name, so look it up once.
    target = server_check(file_name)

    # Timestamps and records come from different lines, so they are collected
    # separately and only paired up by position in the DataFrame.
    timestamp_list = []
    download_speed = []
    tcp_time = []
    size_download = []

    # `with` guarantees the log is closed even if parsing fails midway.
    with open("t3_data/" + file_name, 'r') as log:
        for line in log:
            # A blank line (e.g. at the end of the file) is not a record.
            if not line.strip():
                continue

            if 'Start' in line:
                # strip() removes the newline when the timestamp is the last token
                timestamp_list.append(line.split(' ')[1].strip())
            else:
                # Strip first so the last field does not keep the newline.
                fields = line.strip().split(',')
                # TCP latency got from time_connect minus time_namelookup
                tcp_time.append(float(fields[1]) - float(fields[0]))
                size_download.append(fields[5])
                # x8 to match the "bps" column name
                # NOTE(review): assumes field 4 is in bytes per second - confirm
                download_speed.append(float(fields[4]) * 8)

    # Convert the lists to a DataFrame
    data = {"Timestamp": timestamp_list,
            "Target": [target] * len(tcp_time),
            "Type of measurement": ['http'] * len(tcp_time),
            "TCP latency (s)": tcp_time,
            "Size download (byte)": size_download,
            "Download speed (bps)": download_speed}

    df = DataFrame(data)
    df = df.sort_values(by=["Timestamp"])

    # Write DataFrame to a csv file
    df.to_csv(output, index=False)

In [6]:
# Turn every '*ping.txt' log under t3_data/ into a CSV with the same base name.
for file_name in os.listdir("./t3_data/"):
    if not file_name.endswith('ping.txt'):
        continue  # not a ping log
    output = os.path.splitext(file_name)[0] + '.csv'
    from_pings(file_name, output)

In [7]:
# Turn every '*dig.txt' log under t3_data/ into a CSV with the same base name.
for file_name in os.listdir("./t3_data/"):
    if not file_name.endswith('dig.txt'):
        continue  # not a dig log
    output = os.path.splitext(file_name)[0] + '.csv'
    from_dig(file_name, output)

In [8]:
# Turn every '*curl.txt' log under t3_data/ into a CSV with the same base name.
for file_name in os.listdir("./t3_data/"):
    if not file_name.endswith('curl.txt'):
        continue  # not a curl log
    output = os.path.splitext(file_name)[0] + '.csv'
    from_curl(file_name, output)

In [9]:
def from_onefile(json_file):
    '''
    Extract one result row from a JSON report stored in t3_data/.

    Parameters
    ----------
    json_file : str
        Name of the JSON file inside the t3_data/ directory.

    Returns
    -------
    list
        Ten values in this order: timestamp, server, port, protocol, mode
        (the 'reverse' flag), sent bitrate, sent bytes, retransmissions,
        receive bitrate, receive bytes.
        When the file is not valid JSON, is not a JSON object of the expected
        shape, or lacks any of the expected keys, a placeholder row is
        returned instead: -1 for the timestamp and the string '-1' for the
        nine other fields.
    '''
    # Placeholder row marking a report that could not be read.
    placeholder = [-1] + [str(-1)] * 9

    # Use a separate name for the handle so the parameter is not shadowed.
    with open("t3_data/" + json_file) as handle:
        try:
            report = json.load(handle)  # Read the file

            start = report['start']
            sent = report['end']['sum_sent']
            received = report['end']['sum_received']

            # Extract useful data
            return [
                start['timestamp']['timesecs'],
                start['connecting_to']['host'],
                start['connecting_to']['port'],
                start['test_start']['protocol'],
                start['test_start']['reverse'],
                sent['bits_per_second'],
                sent['bytes'],
                sent['retransmits'],
                received['bits_per_second'],
                received['bytes'],
            ]

        except json.JSONDecodeError:
            # Invalid or empty JSON file. Previously this fell through to the
            # return statement with unbound variables and crashed.
            return placeholder

        except (KeyError, TypeError):
            # Tried to read a non-existent value, or the JSON document is not
            # made of nested objects.
            return placeholder


def from_iperf(mode,output):
    '''
    Collect the JSON reports of one measurement series into a single CSV file.

    Every file in t3_data/ whose name starts with `mode` and ends with '.json'
    is passed to from_onefile; the resulting rows are sorted by "Timestamp"
    and written to `output`.
    '''
    # Column names, in the same order as the values returned by from_onefile
    columns = ["Timestamp","Server","Port","Type","Mode","Sent bitrate (bps)","Sent bytes","Retransmissions","Receive bitrate (bps)","Receive bytes"]

    # One row per matching json file
    rows = [
        from_onefile(entry)
        for entry in os.listdir("./t3_data/")
        if entry.startswith(mode) and entry.endswith('.json')
    ]

    # rows -> sorted dataframe -> csv
    table = pd.DataFrame(rows, columns=columns).sort_values(by=["Timestamp"])
    table.to_csv(output, index=False)

In [10]:
# (file-name prefix, CSV to write) for each iperf measurement series
for prefix, csv_name in (
    ('one', 'th_one.csv'),
    ('two', 'th_two.csv'),
    ('rone', 'th_one_r.csv'),
    ('rtwo', 'th_two_r.csv'),
):
    from_iperf(prefix, csv_name)