In [1]:
import os
import sys
import re

import numpy as np
import pandas as pd
import scipy as sp
from scipy import signal

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [10]:
sys.path.insert(0, '../src')

from utils import *
from features import *

In [3]:
sns.set(rc={'figure.figsize':(20,5)})

In [4]:
mbit_rate = 1/125000

low_fp = '../data/240p/' 
med_fp = '../data/480p/'
high_fp = '../data/1080p/' 

In [5]:
stdoan_low = pd.read_csv(low_fp + 'stdoan-101-action-240p-20201127.csv')
stdoan_med = pd.read_csv(med_fp + 'stdoan-101-action-480p-20201127.csv')
stdoan_high = pd.read_csv(high_fp + 'stdoan-101-action-1080p-20201127.csv')

In [6]:
low_ms = convert_ms_df(stdoan_low, True)
med_ms = convert_ms_df(stdoan_med, True)
high_ms = convert_ms_df(stdoan_high, True)

In [7]:
low_resample = low_ms.resample('500ms', on='Time').sum()
med_resample = med_ms.resample('500ms', on='Time').sum()
high_resample = high_ms.resample('500ms', on='Time').sum()

## Aggregate Features

In [10]:
## take the aggregate features of the whole chunk; download and upload
def agg_feat(df, col):
  return [np.mean(df[col]), np.std(df[col])]

## take the ratio of upload:download packets
def pkt_ratio(df):
  ms_df = convert_ms_df(df, True)
  local = np.sum(ms_df['pkt_src'] == '1') 
  server = np.sum(ms_df['pkt_src'] == '2') 
  return local / server

## take the ratio of upload:download bytes
def bytes_ratio(df):
  local = df['1->2Bytes'].sum()
  server = df['2->1Bytes'].sum()
  return local / server

In [11]:
os.listdir(low_fp)

['imnemato-104-action-240p-20210110.csv',
 'imnemato-105-still-240p-20210112.csv',
 'saqian-102-action-240p-20210113.csv',
 'sgs008-106-action-240p-20210112.csv',
 'sgs008-107-still-240p-20210112.csv',
 'shs214-103-still-240p-20210109.csv',
 'shs214-108-action-240p-20210112.csv',
 'stdoan-101-action-240p-20201127.csv',
 'stdoan-102-action-240p-20201206.csv']

In [None]:
# 300 bytes = small pkt
# 

## Peak Related Aggregate Features

In [12]:
## finds the peaks with mean + 2(1) std
## run the above aggregate functions on the peaks only??

def get_peak_loc(df, col, invert=False):
  'invert arg allows you to get values not considered peaks'
  df_avg = df[col].mean()
  df_std = df[col].std()
  
  threshold = df_avg + (1 * df_std)
  
  if invert:
    return np.array(df[col] < threshold)
  
  else:
    return np.array(df[col] > threshold)

## np.mean, np.var, np.std - think of more?  
def peak_time_diff(df, col):
  '''
  mess around with the different inputs for function. 
  variance seems to inflate the difference betweent the two the most with litte
  to no data manipulation. however, currently trying things like
  squaring the data before taking the aggregate function to exaggerate
  differences (moderate success??)
  '''
  peaks = df[get_peak_loc(df, col)]
  peaks['Time'] = peaks['Time'] - peaks['Time'].min()
  time_diff = np.diff(peaks['Time'] ** 2)
  return [np.mean(time_diff), np.var(time_diff), np.std(time_diff)]

## Spectral Features

In [13]:
def spectral_features(df, col):

    """
    welch implemention of spectral features
    resample the data before inputting (might change prereq depending on
    resource allocation)
    """

    f, Pxx_den = sp.signal.welch(df[col], fs=2)
    Pxx_den = np.sqrt(Pxx_den)

    peaks = sp.signal.find_peaks(Pxx_den)[0]
    prominences = sp.signal.peak_prominences(Pxx_den, peaks)[0]

    idx_max = prominences.argmax()
    loc_max = peaks[idx_max]

    return [f[loc_max], Pxx_den[loc_max], prominences[idx_max]]

## Chunking & Feature creation

In [14]:
## wip; need to decide chunk size eventually
## should we also make this chunking feature be our feature creation?

def chunk_data(df, interval=60):

    """
    takes in a filepath to the data you want to chunk and feature engineer
    chunks our data into a specified time interval
    each chunk is then turned into an observation to be fed into our classifier
    """

    df_list = []
    
    df['Time'] = df['Time'] - df['Time'].min()
    
    total_chunks = np.floor(df['Time'].max() / interval).astype(int)

    for chunk in np.arange(total_chunks):
      
        start = chunk * interval
        end = (chunk+1) * interval

        temp_df = (df[(df['Time'] >= start) & (df['Time'] < end)])
        
        df_list.append(temp_df)
        
    return df_list

In [15]:
def create_features(df, interval=60):

  features = [
    'dwl_peak_freq',
    'dwl_peak_prom',
    'dwl_max_psd',
    'dwl_bytes_avg',
    'dwl_bytes_std',
    'dwl_peak_avg',
    'dwl_peak_var',
    'dwl_peak_std',
    'upl_peak_freq',
    'upl_peak_prom',
    'upl_max_psd',
    'upl_bytes_avg',
    'upl_bytes_std',
    'upl_peak_avg',
    'upl_peak_var',
    'upl_peak_std'
  ]  

  vals = []

  df_chunks = chunk_data(df, interval)

  for chunk in df_chunks:

    preproc = convert_ms_df(chunk, True)
    upl_bytes = preproc[preproc['pkt_src'] == '1'].resample('500ms', on='Time').sum()
    dwl_bytes = preproc[preproc['pkt_src'] == '2'].resample('500ms', on='Time').sum()

    ## spectral features
    dwl_spectral = spectral_features(dwl_bytes, 'pkt_size')
    upl_spectral = spectral_features(upl_bytes, 'pkt_size')
    
    ## aggregate features
    dwl_agg = agg_feat(chunk, '2->1Bytes')
    upl_agg = agg_feat(chunk, '1->2Bytes')
    
    ## peak features
    dwl_peak = peak_time_diff(chunk, '2->1Bytes')
    upl_peak = peak_time_diff(chunk, '1->2Bytes')
    
    feat_val = np.hstack((
      dwl_spectral,
      dwl_agg,
      dwl_peak,
      upl_spectral,
      upl_agg,
      upl_peak
    ))
    
    vals.append(feat_val)
    
  return pd.DataFrame(columns=features, data=vals).fillna(0)

In [50]:
def create_features_no_split(df, interval=60):

  features = [
    'peak_freq',
    'peak_prom',
    'max_psd',
    'bytes_avg',
    'bytes_std',
    'peak_avg',
    'peak_var',
    'peak_std',
  ]  

  vals = []

  df_chunks = chunk_data(df, interval)

  for chunk in df_chunks:

    preproc = convert_ms_df(chunk, True)

    ## spectral features
    spectral_feat = spectral_features(preproc, 'pkt_size')
    
    ## aggregate features
    aggr_feat = agg_feat(chunk, '2->1Bytes')
    
    ## peak features
    peak_feat = peak_time_diff(chunk, '2->1Bytes')
    
    feat_val = np.hstack((
      spectral_feat,
      aggr_feat,
      peak_feat
    ))
    
    vals.append(feat_val)
    
  return pd.DataFrame(columns=features, data=vals).fillna(0)

In [51]:
low_feat_no_split = create_features_no_split(stdoan_low, 100)

## Dev Playground

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score

In [11]:
%%time
low_feat = create_features(stdoan_low, 100)
med_feat = create_features(stdoan_med, 100)
high_feat = create_features(stdoan_high, 100)

Wall time: 17.4 s


In [14]:
low_feat['resolution'] = np.zeros(len(low_feat))
med_feat['resolution'] = np.zeros(len(med_feat)) + 1
high_feat['resolution'] = np.zeros(len(high_feat)) + 2

In [55]:
training_split = pd.concat([low_feat, med_feat, high_feat])

In [56]:
X, y = training_split.drop(columns=['resolution']), training_split['resolution']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 8)

In [57]:
clf_split = RandomForestClassifier(n_estimators = 5, max_depth = 2, criterion = 'entropy', random_state = 42)
clf_split.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [61]:
y_pred = clf_split.predict(X_test)

In [62]:
np.abs(training_split.corr()['resolution']).sort_values(ascending=False)

resolution       1.000000
dwl_bytes_std    0.952209
upl_bytes_std    0.941935
dwl_max_psd      0.909083
upl_max_psd      0.908157
dwl_bytes_avg    0.900471
upl_peak_prom    0.900044
dwl_peak_prom    0.900041
upl_bytes_avg    0.889333
upl_peak_avg     0.517063
dwl_peak_avg     0.503176
upl_peak_std     0.498807
upl_peak_var     0.477132
dwl_peak_std     0.473051
dwl_peak_var     0.450026
dwl_peak_freq    0.442399
upl_peak_freq    0.261256
Name: resolution, dtype: float64

In [63]:
(pd.crosstab(y_test, y_pred, rownames=['Actual Resolution'], colnames=['Predicted Resolution']))

Predicted Resolution,0.0,1.0,2.0
Actual Resolution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,5,0,0
1.0,0,10,0
2.0,0,1,3


In [29]:
f1_score(y_test, y_pred, average=None)

array([1., 1., 1.])

In [64]:
%%time
low_feat_no_split = create_features_no_split(stdoan_low, 100)
med_feat_no_split = create_features_no_split(stdoan_med, 100)
high_feat_no_split = create_features_no_split(stdoan_high, 100)

Wall time: 12.5 s


In [65]:
low_feat_no_split['resolution'] = np.zeros(len(low_feat))
med_feat_no_split['resolution'] = np.zeros(len(med_feat)) + 1
high_feat_no_split['resolution'] = np.zeros(len(high_feat)) + 2

In [66]:
training_no_split = pd.concat([low_feat_no_split, med_feat_no_split, high_feat_no_split])

In [67]:
X, y = training_no_split.drop(columns=['resolution']), training_no_split['resolution']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 8)

In [89]:
clf_no_split = RandomForestClassifier(n_estimators = 2, max_depth = 2, criterion = 'entropy', random_state = 42)
clf_no_split.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [90]:
y_pred = clf_no_split.predict(X_test)

In [91]:
np.abs(training_no_split.corr()['resolution']).sort_values(ascending=False)

resolution    1.000000
bytes_std     0.952209
bytes_avg     0.900471
max_psd       0.659674
peak_avg      0.503176
peak_std      0.473051
peak_var      0.450026
peak_freq     0.352513
peak_prom     0.286279
Name: resolution, dtype: float64

In [92]:
(pd.crosstab(y_test, y_pred, rownames=['Actual Resolution'], colnames=['Predicted Resolution']))

Predicted Resolution,0.0,1.0,2.0
Actual Resolution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,4,1,0
1.0,6,4,0
2.0,1,2,1


In [93]:
f1_score(y_test, y_pred, average=None)

array([0.5       , 0.47058824, 0.4       ])