We use the [Requet dataset](https://github.com/Wimnet/RequetDataSet), in particular the "merged files" which summarize the connection and QoE information every 100 ms.

In [27]:
# Request gives us different datasets. In this notebook, we are using one
# of them
dataset_to_use = 'B1'

In [28]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from scipy.io import arff
import glob

## Load and preprocessing

Download the dataset

In [None]:
mydatasets_folder = "RequetDataSet-master"
dataset_folder=mydatasets_folder+'/'+dataset_to_use+'/MERGED_FILES/'
files=glob.glob(dataset_folder+'baseline_*_merged.txt')

print("Files found in folder : ", dataset_folder ,":", files)

df=pd.DataFrame()

i=1;

for file in files:
    outfilename = file+".csv"
    fin = open(file, "rt")
    fout = open(outfilename,"wt")
    # From https://pythonexamples.org/python-replace-string-in-file/
    for line in fin:
        fout.write(line.replace('[','').replace(']',''))
    fin.close()
    fout.close()
    df=pd.concat( [ df, pd.read_csv(outfilename, header=None) ], 
                ignore_index=True)
    print("Concatenating (",i,"/",len(files),"): ", file)
    i = i+1

More than 15 minutes when ran locally

We build the column names

In [4]:
######
#### INTERVAL INFO
######

# First, we have general information about the 100ms-wide interval.
colnames=['RelativeTime', 'PacketsSent', 'PacketsReceived', 'BytesSent', 'BytesReceived']



######
#### NET INFO
######
NUM_OF_NET_INFO = 26
# Then, we have some NetworkInfo, each one related to the connection of the client to 
# a server (only the top NUM_OF_NET_INFO are represented)

for interval in range(0,NUM_OF_NET_INFO) :
        int_str = str(interval)
        network_info=['IPSrc'+int_str, 'IPDst'+int_str, 'Protocol'+int_str, 
                      'PacketsSent'+int_str, 'PacketsReceived'+int_str, 
                      'BytesSent'+int_str, 'BytesReceived'+int_str]
        colnames.extend(network_info)

        
######
#### PLAYBACK INFO
######
playback_info_event = ['Buffering', 'Paused', 'Playing', 'CollectData'];
colnames.extend(playback_info_event);

playback_info_generic = ['EpochTime', 
                'StartTime', 'PlaybackProgress', 'Length'];
colnames.extend(playback_info_generic);

playback_info_quality = ['UnlabelledQuality', 'q144p', 'q240p', 
                 'q360p', 'q480p', 'q720p', 'q1080p', 'q1440p', 'q2160p'];
colnames.extend(playback_info_quality);

playback_info_health_and_progress = ['BufferHealth',
                'BufferProgress', 'BufferValid'];
colnames.extend(playback_info_health_and_progress)


# The correct number of columns should be calculated as follows
NUM_OF_INTERVAL_INFO_COLS = 5
NUM_OF_NET_INFO_COLS = 7

NUM_OF_GENERIC_PLAYBACK_EVENT_COLS = 4
NUM_OF_GENERIC_PLAYBACK_COLS = 4
NUM_OF_PLAYBACK_QUALITY_COLS = 9
NUM_OF_PLAYBACK_HEALTH_AND_PROGRESS_COLS = 3
NUM_OF_PLAYBCAK_COLS = NUM_OF_GENERIC_PLAYBACK_EVENT_COLS + \
    NUM_OF_GENERIC_PLAYBACK_COLS + \
    NUM_OF_PLAYBACK_QUALITY_COLS + NUM_OF_PLAYBACK_HEALTH_AND_PROGRESS_COLS
    
NUM_COLS = NUM_OF_INTERVAL_INFO_COLS + \
    NUM_OF_NET_INFO_COLS*NUM_OF_NET_INFO + NUM_OF_PLAYBCAK_COLS

if (len(colnames) !=  NUM_COLS):
    raise Exception("Expected number of columns ", NUM_COLS, ", constructed ",
                   len(colnames) );

In [5]:
df.columns=colnames

In [6]:
df.head()

Unnamed: 0,RelativeTime,PacketsSent,PacketsReceived,BytesSent,BytesReceived,IPSrc0,IPDst0,Protocol0,PacketsSent0,PacketsReceived0,...,q240p,q360p,q480p,q720p,q1080p,q1440p,q2160p,BufferHealth,BufferProgress,BufferValid
0,0.0,1,0,75,0,192.168.1.15,173.194.185.106,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,-1
1,0.1,9,8,4140,3187,192.168.1.15,173.194.185.106,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,-1
2,0.2,21,19,7263,14866,192.168.1.15,173.194.185.106,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,-1
3,0.3,5,6,353,3223,192.168.1.15,173.194.185.106,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,-1
4,0.4,3,3,367,2410,192.168.1.15,173.194.185.106,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,-1


Remove the collect data event and unlabeled

In [15]:
df = df [df['CollectData']!=1]  #!why
df = df [df['UnlabelledQuality']!=1]

In [26]:
df.sample(10)

Unnamed: 0,RelativeTime,PacketsSent,PacketsReceived,BytesSent,BytesReceived,IPSrc0,IPDst0,Protocol0,PacketsSent0,PacketsReceived0,...,q480p,q720p,q1080p,q1440p,q2160p,BufferHealth,BufferProgress,BufferValid,avg_qual,KBytesReceived
66102,600.2,0,0,0,0,192.168.1.15,173.194.131.139,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,0.0
78140,1.0,1,3,70,1340,192.168.1.15,74.125.172.60,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,1.34
240439,3.9,33,233,4144,317762,192.168.1.15,173.194.131.139,QUIC,29,228,...,1,0,0,0,0,3.633641,0.005897,true,480,317.762
348582,0.2,0,0,0,0,192.168.1.15,173.194.61.76,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,0.0
306514,0.4,4,2,482,1489,192.168.1.15,74.125.0.58,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,1.489
102192,2.2,1,2,234,84,192.168.1.15,173.194.185.106,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,0.084
84166,2.6,2,4,140,592,192.168.1.15,173.194.185.106,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,0.592
324546,0.6,1,0,87,0,192.168.1.15,173.194.31.120,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,0.0
66099,599.9,0,0,0,0,192.168.1.15,173.194.131.139,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,0.0
258437,0.7,1,3,191,701,192.168.1.15,173.194.185.106,0,0,0,...,0,0,0,0,0,0.0,0.0,-1,0,0.701


In [16]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2002 entries, 0 to 360599
Data columns (total 209 columns):
 #    Column             Dtype  
---   ------             -----  
 0    RelativeTime       float64
 1    PacketsSent        int64  
 2    PacketsReceived    int64  
 3    BytesSent          int64  
 4    BytesReceived      int64  
 5    IPSrc0             object 
 6    IPDst0             object 
 7    Protocol0          object 
 8    PacketsSent0       int64  
 9    PacketsReceived0   int64  
 10   BytesSent0         int64  
 11   BytesReceived0     int64  
 12   IPSrc1             object 
 13   IPDst1             object 
 14   Protocol1          object 
 15   PacketsSent1       int64  
 16   PacketsReceived1   int64  
 17   BytesSent1         int64  
 18   BytesReceived1     int64  
 19   IPSrc2             object 
 20   IPDst2             object 
 21   Protocol2          object 
 22   PacketsSent2       int64  
 23   PacketsReceived2   int64  
 24   BytesSent2         int64  

In [17]:
df['avg_qual'] = (\
    df['q144p']*144 + df['q240p']*240 + df['q360p']*360 +\
    df['q480p']*480 + df['q720p']*720 + df['q1080p']*1080 + \
    df['q1440p']*1440 + df['q2160p']*2160 \
    ) * \
    (\
    df['q144p'] + df['q240p'] + df['q360p'] +\
    df['q480p'] + df['q720p'] + df['q1080p'] + \
    df['q1440p'] + df['q2160p']\
    );

df.head()

df['KBytesReceived'] = df['BytesReceived']/1000;


In [18]:
#df_small = df[['KBytesReceived','avg_qual']]
df_small = df[ df['KBytesReceived']>10 ]
df_small[['KBytesReceived','avg_qual']]

Unnamed: 0,KBytesReceived,avg_qual
2,14.866,0
9,21.888,0
6014,16.757,0
12030,16.544,0
12069,439.231,1080
...,...,...
348588,19.288,0
348623,173.786,1080
354593,15.507,0
354598,16.362,0


## Retrieve Labels

In [None]:
quality_columns=['UnlabelledQuality', 'q144p', 'q240p', 'q360p', 
                  'q480p', 'q720p', 'q1080p', 'q1440p', 'q2160p']

print("samples with quality info:\n", df_small[quality_columns].sum(axis=1))

df_no_quality = (df_small[quality_columns].sum(axis=1) - 1 )*(-1)
print("Samples with no quality info:\n", df_no_quality)

df_small['NoQualityInfo'] = df_no_quality



In [None]:
quality_columns.append('NoQualityInfo')


# https://stackoverflow.com/a/38334528/2110769
label = df_small[quality_columns].idxmax(axis=1)

df_small['label'] = label
df_small.head()

In [None]:
df_small = df_small.drop(columns=quality_columns, axis=1)
df_small = df_small.drop(columns=['avg_qual'], axis=1)
df_small.head()

Save pre-processed data

In [22]:
filename = 'df.'+dataset_to_use+'_1.csv'
df_small.to_csv(filename,index = None, header=True)
print(filename, " saved")


df.B1_1.csv  saved
