# Analysis


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Loaded in

In [2]:
# full 30min dataset for streaming video on vpn
yes = pd.read_csv("apristin-youtube[1440p60]-1x-vpn-windows-noisy-20201102.csv")
# full 30min dataset for web browsing on vpn
no = pd.read_csv("apristin-novideo-vpn-windows-20201109.csv")

# 5 minute chunks of streaming video on vpn
yes_0 = pd.read_csv("apristin-youtube[1440p60]-1x-vpn-windows-noisy-20201102-0.csv")
yes_1 = pd.read_csv("apristin-youtube[1440p60]-1x-vpn-windows-noisy-20201102-1.csv")
yes_2 = pd.read_csv("apristin-youtube[1440p60]-1x-vpn-windows-noisy-20201102-2.csv")
yes_3 = pd.read_csv("apristin-youtube[1440p60]-1x-vpn-windows-noisy-20201102-3.csv")
yes_4 = pd.read_csv("apristin-youtube[1440p60]-1x-vpn-windows-noisy-20201102-4.csv")
yes_5 = pd.read_csv("apristin-youtube[1440p60]-1x-vpn-windows-noisy-20201102-5.csv")
# 5 minute chuncks of web browsing on vpn
no_0 = pd.read_csv("apristin-novideo-vpn-windows-20201109-0.csv")
no_1 = pd.read_csv("apristin-novideo-vpn-windows-20201109-1.csv")
no_2 = pd.read_csv("apristin-novideo-vpn-windows-20201109-2.csv")
no_3 = pd.read_csv("apristin-novideo-vpn-windows-20201109-3.csv")
no_4 = pd.read_csv("apristin-novideo-vpn-windows-20201109-4.csv")
no_5 = pd.read_csv("apristin-novideo-vpn-windows-20201109-5.csv")

### Now that I have the function below that cleans the extended columns and creates individual dataframes for each row's extended columns in the original dataset, I can further examine the data and engineer new features.

In [3]:
# This function returns a dataframe with the packets times, sizes, and directions for a single row of data
def three_cols(row):
    time = list(map(int, row['packet_times'].split(';')[0:-1]))
    size = list(map(int, row['packet_sizes'].split(';')[0:-1]))
    dirs = list(map(int, row['packet_dirs'].split(';')[0:-1]))
    dict1 = {'packet_time': time, 'packet_size': size, 'packet_dir': dirs}
    return pd.DataFrame(dict1)

# Packet Size Analysis

### Since the majority of packet sizes range from 0-300 bytes and 1200-1500 bytes, I think that developing a feature on the counts of those ranges can be beneficial in discovering if streaming is occuring.

In [18]:
# This function takes all the counts of the 0-300bytes for the 1->2 Direction and all the counts
# of the 1200-1500bytes for the 2->1 Direction and creates new columns based off this in the dataframe
def big_byte_count_feature(dataset):        
    df = dataset.copy()
    packet_size_count1 = []
    packet_size_count2 = []
    for i in range(df.shape[0]):
        row = three_cols(df.iloc[i])
        ones = row.loc[row['packet_dir'] == 1]['packet_size']
        twos = row.loc[row['packet_dir'] == 2]['packet_size']
        one_count=0
        two_count=0
        for packet in ones:
            if (int(packet) >= 0) and (int(packet) <= 300):
                one_count += 1
        for packet in twos:
            if (int(packet) >= 1200) and (int(packet) <= 1500):
                two_count += 1
        packet_size_count1.append(one_count)
        packet_size_count2.append(two_count)
    return df.assign(Dir1_ByteCount_0to300 = packet_size_count1, Dir2_ByteCount_1200to1500 = packet_size_count2)

### The way that I would use the two features I created above in my machine learning model would be by looking at the counts of the byte ranges and training my model on this since there is a clear divide between the range count of streaming vs non-streaming, as the count of the ranges are significantly higher when streaming is occuring. This is evident in my examples below for no streaming vs streaming.

In [27]:
#Example of byte count feature extraction on the no streaming dataset:
big_byte_no_streaming = big_byte_count_feature(yes_0)
print(big_byte_no_streaming[['Dir1_ByteCount_0to300', 'Dir2_ByteCount_1200to1500']].head(5))
print("No Streaming Byte Count:")
big_byte_no_streaming[['Dir1_ByteCount_0to300', 'Dir2_ByteCount_1200to1500']].sum()

   Dir1_ByteCount_0to300  Dir2_ByteCount_1200to1500
0                   1288                       2232
1                    709                       1402
2                   1646                       3124
3                    585                        876
4                      2                          0
No Streaming Byte Count:


Dir1_ByteCount_0to300        31958
Dir2_ByteCount_1200to1500    68755
dtype: int64

In [20]:
#Example of byte count feature extraction on the streaming dataset:
big_byte_streaming = big_byte_count_feature(yes)
print(big_byte_streaming[['Dir1_ByteCount_0to300', 'Dir2_ByteCount_1200to1500']].head(5))
print("Streaming Byte Count:")
big_byte_streaming[['Dir1_ByteCount_0to300', 'Dir2_ByteCount_1200to1500']].sum()

   Dir1_ByteCount_0to300  Dir2_ByteCount_1200to1500
0                   1288                       2232
1                    709                       1402
2                   1646                       3124
3                    585                        876
4                      2                          0
Streaming Byte Count:


Dir1_ByteCount_0to300        194108
Dir2_ByteCount_1200to1500    430519
dtype: int64