In [1]:
import os
import sys
import re

import numpy as np
import pandas as pd
import scipy as sp
from scipy import signal

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Put the sibling ``../src`` directory first on the import path so the
# project-local ``utils`` module can be imported from this notebook.
sys.path.insert(0, '../src')

# NOTE(review): the wildcard import hides where names come from.
# ``convert_ms_df`` is used below and is not defined in this notebook, so it
# presumably comes from ``utils`` -- confirm and switch to explicit imports.
from utils import *

In [3]:
# Default every figure to a wide 20 x 5 canvas (matplotlib ``figure.figsize``).
sns.set(rc={'figure.figsize':(20,5)})

In [4]:
# NOTE(review): presumably a bytes -> megabits conversion factor
# (1 Mbit = 125,000 bytes); it is not used in the cells visible here -- confirm.
mbit_rate = 1/125000

# Data folders, one per streaming resolution; each path ends with '/' so a
# file name can be appended by plain string concatenation.
low_fp = '../data/240p/' 
med_fp = '../data/480p/'
high_fp = '../data/1080p/' 

In [5]:
# Read one capture CSV from each resolution folder (240p, 480p, 1080p),
# in that order, into stdoan_low / stdoan_med / stdoan_high.
stdoan_low, stdoan_med, stdoan_high = (
    pd.read_csv(folder + filename)
    for folder, filename in (
        (low_fp, 'stdoan-101-action-240p-20201127.csv'),
        (med_fp, 'stdoan-101-action-480p-20201127.csv'),
        (high_fp, 'stdoan-101-action-1080p-20201127.csv'),
    )
)

In [6]:
# Run each raw capture through convert_ms_df (second argument True), keeping
# the low / med / high order.
# NOTE(review): convert_ms_df is not defined in this notebook; presumably it
# comes from utils -- confirm what the boolean flag controls.
low_ms, med_ms, high_ms = (
    convert_ms_df(raw_capture, True)
    for raw_capture in (stdoan_low, stdoan_med, stdoan_high)
)

In [7]:
# Bucket each converted capture into 500 ms bins on its 'Time' column and
# sum every bin, keeping the low / med / high order.
low_resample, med_resample, high_resample = (
    converted.resample('500ms', on='Time').sum()
    for converted in (low_ms, med_ms, high_ms)
)


## Aggregate Features

In [8]:
## take the average of the whole chunk; download and upload
def avg_bytes(df, col):
    """Return the mean of column ``col`` of ``df``.

    Parameters
    ----------
    df : object indexable by ``col`` (the notebook passes pandas DataFrames).
    col : label of the column to average.

    Returns
    -------
    The scalar produced by ``np.mean`` over ``df[col]``.
    """
    column_values = df[col]
    return np.mean(column_values)

## take the std of whole chunk; download and upload
def std_bytes(df, col):
    """Return the standard deviation of column ``col`` of ``df``.

    The value comes from ``np.std`` called with its defaults, i.e. NumPy's
    ``ddof=0`` (population) convention rather than pandas' ``ddof=1``.

    Parameters
    ----------
    df : object indexable by ``col`` (the notebook passes pandas DataFrames).
    col : label of the column to measure.

    Returns
    -------
    The scalar produced by ``np.std`` over ``df[col]``.
    """
    column_values = df[col]
    return np.std(column_values)

## take the ratio of upload:download packets
def pkt_ratio(df):
    """Return the ratio of packets with ``pkt_src == '1'`` to ``pkt_src == '2'``.

    ``df`` is first passed through ``convert_ms_df(df, True)``; the counts are
    taken on the ``pkt_src`` column of that result, compared against the
    *strings* ``'1'`` and ``'2'``.
    NOTE(review): presumably '1' is the local (upload) side and '2' the
    server (download) side -- confirm against convert_ms_df in utils.

    Parameters
    ----------
    df : raw capture accepted by ``convert_ms_df``.

    Returns
    -------
    ``count('1') / count('2')`` as computed by NumPy division.
    """
    packet_sources = convert_ms_df(df, True)['pkt_src']
    n_from_local = np.sum(packet_sources == '1')
    n_from_server = np.sum(packet_sources == '2')
    return n_from_local / n_from_server

## take the ratio of upload:download bytes
def bytes_ratio(df):
    """Return total ``'1->2Bytes'`` divided by total ``'2->1Bytes'``.

    Parameters
    ----------
    df : frame exposing the ``'1->2Bytes'`` and ``'2->1Bytes'`` columns,
        each supporting ``.sum()``.

    Returns
    -------
    The quotient of the two column sums.
    """
    sent_total, received_total = (
        df[direction].sum() for direction in ('1->2Bytes', '2->1Bytes')
    )
    return sent_total / received_total

## Peak Related Aggregate Features

In [40]:
## finds the peaks: values above the column mean by default (mean + 1 or 2 std was also considered)
## run the above aggregate functions on the peaks only??

def get_peak_loc(df, col, invert=False, n_std=0):
    """Return a boolean mask over the rows of ``df`` marking peaks in ``col``.

    A row is a peak when its value is strictly greater than
    ``mean(col) + n_std * std(col)``.

    Parameters
    ----------
    df : pandas DataFrame containing ``col``.
    col : label of the column to threshold.
    invert : when True, return the exact complement of the peak mask, i.e.
        every row that is *not* a peak (including rows equal to the
        threshold).
    n_std : number of standard deviations (pandas ``Series.std``) added to
        the mean to form the threshold. The default 0 thresholds at the
        mean.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``df``.
    """
    values = df[col]
    threshold = values.mean()

    if n_std:
        # Only involve the std when requested: on a single-row column it is
        # NaN and would otherwise poison the default mean-only threshold.
        threshold = threshold + n_std * values.std()

    is_peak = np.array(values > threshold)

    # Complement rather than a separate '<' test, so rows sitting exactly on
    # the threshold are reported as non-peaks instead of vanishing from both
    # masks.
    return ~is_peak if invert else is_peak

def peak_time_diff(df, col='2->1Bytes', func=np.mean):
    """Aggregate the gaps between squared peak times of ``col``.

    Rows of ``df`` flagged by ``get_peak_loc(df, col)`` are selected, their
    ``'Time'`` values are rebased so the first peak is at 0, squared, and
    differenced; ``func`` is applied to the resulting array of differences.

    Experiment notes: variance seems to inflate the difference between the
    resolutions the most with little to no data manipulation. Squaring the
    times before aggregating is an attempt to exaggerate the differences
    (moderate success so far).

    Parameters
    ----------
    df : pandas DataFrame with a numeric ``'Time'`` column and ``col``.
    col : column whose peaks are located.
    func : aggregate applied to the 1-D array of differences.

    Returns
    -------
    ``func`` of the differences, or ``nan`` when fewer than two peaks exist
    (there is no gap to aggregate in that case).
    """
    # Pull the peak timestamps out as their own Series instead of assigning
    # into a filtered slice of ``df``.
    peak_times = df.loc[get_peak_loc(df, col), 'Time']

    if len(peak_times) < 2:
        return np.nan

    elapsed = peak_times - peak_times.min()
    squared_gaps = np.diff(elapsed ** 2)
    return func(squared_gaps)

## Spectral Features

In [41]:
def spectral_features(df, col, fs=2):
    """Describe the most prominent spectral peak of ``df[col]``.

    The power spectral density is estimated with Welch's method, converted
    to an amplitude-like scale with a square root, and searched for local
    maxima; the maximum with the largest prominence is reported.

    The data should be resampled to a fixed rate before being passed in.

    Parameters
    ----------
    df : pandas DataFrame containing ``col``.
    col : label of the evenly sampled signal column.
    fs : sampling frequency handed to ``scipy.signal.welch``. The default 2
        matches data resampled into 500 ms bins.

    Returns
    -------
    list ``[frequency, sqrt_psd_value, prominence]`` of the most prominent
    peak, or three ``nan`` values when the spectrum has no local maximum
    (e.g. a constant signal).
    """
    freqs, psd = sp.signal.welch(df[col], fs=fs)
    amplitude = np.sqrt(psd)

    peak_idx = sp.signal.find_peaks(amplitude)[0]
    if peak_idx.size == 0:
        # argmax on an empty prominence array would raise; report "no peak".
        return [np.nan, np.nan, np.nan]

    prominences = sp.signal.peak_prominences(amplitude, peak_idx)[0]

    best = prominences.argmax()
    best_loc = peak_idx[best]

    return [freqs[best_loc], amplitude[best_loc], prominences[best]]

## Chunking & Feature creation

In [10]:
## wip; need to decide chunk size eventually
## should we also make this chunking feature be our feature creation?

def chunk_data(df, interval=60):
    """Split ``df`` into consecutive, fixed-length time windows.

    ``'Time'`` is rebased so the earliest row is at 0, then rows are grouped
    into half-open windows ``[k * interval, (k + 1) * interval)``. Only
    complete windows are returned; rows in the trailing partial window are
    dropped. Each chunk is meant to become one observation for the
    classifier.

    The input frame is not modified: the rebasing is done on a copy.

    Parameters
    ----------
    df : pandas DataFrame with a numeric ``'Time'`` column.
    interval : window length, in the same units as ``'Time'``.

    Returns
    -------
    list of DataFrames, one per complete window, whose ``'Time'`` values are
    relative to the start of ``df``. Empty list when ``df`` has no rows or
    spans less than one full interval.
    """
    if df.empty:
        return []

    # ``assign`` builds a new frame, so the caller's timestamps stay intact
    # and re-running a cell on the raw data gives the same result.
    rebased = df.assign(Time=df['Time'] - df['Time'].min())
    elapsed = rebased['Time']

    span = elapsed.max()
    if pd.isna(span):
        return []

    total_chunks = int(np.floor(span / interval))

    return [
        rebased[(elapsed >= idx * interval) & (elapsed < (idx + 1) * interval)]
        for idx in range(total_chunks)
    ]

In [12]:
%%time
low_chunks = chunk_data_temp(stdoan_low)
med_chunks = chunk_data_temp(stdoan_med)
high_chunks = chunk_data_temp(stdoan_high)

Wall time: 234 ms


In [34]:
# One value per 240p chunk: peak_time_diff aggregated with its default
# (np.mean) and with np.std. Despite its name, low_lst_var holds standard
# deviations, not variances.
low_lst_mean = [peak_time_diff(chunk_df) for chunk_df in low_chunks]
low_lst_var = [peak_time_diff(chunk_df, func=np.std) for chunk_df in low_chunks]

In [35]:
# One value per 480p chunk: peak_time_diff aggregated with its default
# (np.mean) and with np.std. Despite its name, med_lst_var holds standard
# deviations, not variances.
med_lst_mean = [peak_time_diff(chunk_df) for chunk_df in med_chunks]
med_lst_var = [peak_time_diff(chunk_df, func=np.std) for chunk_df in med_chunks]

In [36]:
# One value per 1080p chunk: peak_time_diff aggregated with its default
# (np.mean) and with np.std. Despite its name, high_lst_var holds standard
# deviations, not variances.
high_lst_mean = [peak_time_diff(chunk_df) for chunk_df in high_chunks]
high_lst_var = [peak_time_diff(chunk_df, func=np.std) for chunk_df in high_chunks]