In [1]:
import os
import sys
import re

import numpy as np
import pandas as pd
import scipy as sp
from scipy import signal

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
sys.path.insert(0, '../src')
from features import *
from utils import *

In [3]:
sns.set(rc={'figure.figsize':(20,5)})

In [4]:
mbit_rate = 1/125000

low_fp = '../data/240p/' 
threesixty_fp = '../data/360p/' 
med_fp = '../data/480p/'
seventwenty_fp = '../data/720p/' 
high_fp = '../data/1080p/' 

In [5]:
low_dfs = []
for file in os.listdir(low_fp):
    if file != '.ipynb_checkpoints' and file != '.DS_Store':
        low_dfs.append(pd.read_csv(low_fp+file))
    
threesixty_dfs = []
for file in os.listdir(threesixty_fp):
    if file != '.ipynb_checkpoints' and file != '.DS_Store':
        threesixty_dfs.append(pd.read_csv(threesixty_fp+file))
        
med_dfs = []
for file in os.listdir(med_fp):
    if file != '.ipynb_checkpoints' and file != '.DS_Store':
        med_dfs.append(pd.read_csv(med_fp+file))
        
seventwenty_dfs = []
for file in os.listdir(seventwenty_fp):
    if file != '.ipynb_checkpoints' and file != '.DS_Store':
        seventwenty_dfs.append(pd.read_csv(seventwenty_fp+file))
    
high_dfs = []
for file in os.listdir(high_fp):
    if file != '.ipynb_checkpoints' and file != '.DS_Store':
        high_dfs.append(pd.read_csv(high_fp+file))

In [6]:
# stdoan_low = pd.read_csv(low_fp + 'stdoan-101-action-240p-20201127.csv')
# stdoan_med = pd.read_csv(med_fp + 'stdoan-101-action-480p-20201127.csv')
# stdoan_high = pd.read_csv(high_fp + 'stdoan-101-action-1080p-20201127.csv')

In [7]:
low_ms = []
for df in low_dfs:
    low_ms.append(convert_ms_df(df,True))
    
threesixty_ms = []
for df in threesixty_dfs:
    threesixty_ms.append(convert_ms_df(df,True))
    
med_ms = []
for df in med_dfs:
    med_ms.append(convert_ms_df(df,True))
    
seventwenty_ms = []
for df in seventwenty_dfs:
    seventwenty_ms.append(convert_ms_df(df,True))
    
high_ms = []
for df in high_dfs:
    high_ms.append(convert_ms_df(df,True))

In [8]:
# low_ms = convert_ms_df(stdoan_low, True)
# med_ms = convert_ms_df(stdoan_med, True)
# high_ms = convert_ms_df(stdoan_high, True)

In [9]:
low_resamples = []
for df in low_ms:
    low_resamples.append(df.resample('500ms', on='Time').sum())
    
threesixty_resamples = []
for df in threesixty_ms:
    threesixty_resamples.append(df.resample('500ms', on='Time').sum())
    
med_resamples = []
for df in med_ms:
    med_resamples.append(df.resample('500ms', on='Time').sum())
    
seventwenty_resamples = []
for df in seventwenty_ms:
    seventwenty_resamples.append(df.resample('500ms', on='Time').sum())
    
high_resamples = []
for df in high_ms:
    high_resamples.append(df.resample('500ms', on='Time').sum())

In [10]:
# low_resample = low_ms.resample('500ms', on='Time').sum()
# med_resample = med_ms.resample('500ms', on='Time').sum()
# high_resample = high_ms.resample('500ms', on='Time').sum()


## Aggregate Features

In [11]:
## take the aggregate features of the whole chunk; download and upload
def agg_feat(df, col):
    return [np.mean(df[col]), np.std(df[col])]

## take the ratio of upload:download packets
def pkt_ratio(df):
    ms_df = convert_ms_df(df, True)
    local = np.sum(ms_df['pkt_src'] == '1') 
    server = np.sum(ms_df['pkt_src'] == '2') 
    return local / server

## take the ratio of upload:download bytes
def bytes_ratio(df):
    local = df['1->2Bytes'].sum()
    server = df['2->1Bytes'].sum()
    return local / server

## Peak Related Aggregate Features

In [12]:
## finds the peaks with mean + 2(1) std
## run the above aggregate functions on the peaks only??

def get_peak_loc(df, col, invert=False):
  'invert arg allows you to get values not considered peaks'
  df_avg = df[col].mean()
  df_std = df[col].std()
  
  threshold = df_avg + (1 * df_std)
  
  if invert:
    return np.array(df[col] < threshold)
  
  else:
    return np.array(df[col] > threshold)

## np.mean, np.var, np.std - think of more?  
def peak_time_diff(df, col):
  '''
  mess around with the different inputs for function. 
  variance seems to inflate the difference betweent the two the most with litte
  to no data manipulation. however, currently trying things like
  squaring the data before taking the aggregate function to exaggerate
  differences (moderate success??)
  '''
  peaks = df[get_peak_loc(df, col)]
  peaks['Time'] = peaks['Time'] - peaks['Time'].min()
  time_diff = np.diff(peaks['Time'] ** 2)
  return [np.mean(time_diff), np.std(time_diff)]

In [13]:
from scipy.signal import find_peaks

def peak_times(df,col,thresh):
    x = df[col]
    peaks, _ = find_peaks(x, height=thresh)
    if list(peaks) == []:
        return [-1]
    times = df.iloc[peaks]['Time'].values
    time_between_peaks = [times[i]-times[i-1]for i in range(1,len(times))]
    #print(time_between_peaks)
    #time_between_peaks[0]=0
    if time_between_peaks == []:
        return -1
    return time_between_peaks

def num_peaks(df,col,thresh):
    x = df[col]
    peaks, _ = find_peaks(x, height=thresh)
    return len(peaks)

## Spectral Features

In [14]:
def spectral_features(df, col):

    """
    welch implemention of spectral features
    resample the data before inputting (might change prereq depending on
    resource allocation)
    """

    f, Pxx_den = sp.signal.welch(df[col], fs=2)
    Pxx_den = np.sqrt(Pxx_den)

    peaks = sp.signal.find_peaks(Pxx_den)[0]
    prominences = sp.signal.peak_prominences(Pxx_den, peaks)[0]

    idx_max = prominences.argmax()
    loc_max = peaks[idx_max]

    return [f[loc_max], Pxx_den[loc_max], prominences[idx_max]]

## Chunking & Feature creation

In [15]:
## wip; need to decide chunk size eventually
## should we also make this chunking feature be our feature creation?

def chunk_data(df, interval=60):

    """
    takes in a filepath to the data you want to chunk and feature engineer
    chunks our data into a specified time interval
    each chunk is then turned into an observation to be fed into our classifier
    """

    df_list = []
    
    df['Time'] = df['Time'] - df['Time'].min()
    
    total_chunks = np.floor(df['Time'].max() / interval).astype(int)

    for chunk in np.arange(total_chunks):
      
        start = chunk * interval
        end = (chunk+1) * interval

        temp_df = (df[(df['Time'] >= start) & (df['Time'] < end)])
        
        df_list.append(temp_df)
        
    return df_list

In [16]:
def create_features(dfs, interval=60):

  features = [
    'dwl_peak_freq',
    'dwl_peak_prom',
    'dwl_max_psd',
    'dwl_bytes_avg',
    'dwl_bytes_std',
    'dwl_peak_avg',
    'dwl_peak_std',
    'upl_peak_freq',
    'upl_peak_prom',
    'upl_max_psd',
    'upl_bytes_avg',
    'upl_bytes_std',
    'upl_peak_avg',
    'upl_peak_std',
    'dwl_time_peak',#'IMAN_up_time_peak',
      'dwl_num_peak'#,'IMAN_up_num_peak'
  ]  

  vals = []
  for df in dfs:
      df_chunks = chunk_data(df, interval)

      for chunk in df_chunks:

        preproc = convert_ms_df(chunk, True)
        upl_bytes = preproc[preproc['pkt_src'] == '1'].resample('500ms', on='Time').sum()
        dwl_bytes = preproc[preproc['pkt_src'] == '2'].resample('500ms', on='Time').sum()

        ## spectral features
        dwl_spectral = spectral_features(dwl_bytes, 'pkt_size')
        upl_spectral = spectral_features(upl_bytes, 'pkt_size')

        ## aggregate features
        dwl_agg = agg_feat(chunk, '2->1Bytes')
        upl_agg = agg_feat(chunk, '1->2Bytes')

        ## peak features
        dwl_peak = peak_time_diff(chunk, '2->1Bytes')
        upl_peak = peak_time_diff(chunk, '1->2Bytes')
        
        ## iman's time between peak 
        iman_dwn_time_peak = np.mean(peak_times(chunk,'2->1Bytes',1000000))
        #iman_up_time_peak = np.mean(peak_times(chunk,'1->2Bytes',50000))
        
        ## iman's num peak
        iman_dwn_num_peak = num_peaks(chunk,'2->1Bytes',1000000)
        #iman_up_num_peak = num_peaks(chunk,'1->2Bytes',50000)


        
        feat_val = np.hstack((
          dwl_spectral,
          dwl_agg,
          dwl_peak,
          upl_spectral,
          upl_agg,
          upl_peak,
            iman_dwn_time_peak,#iman_up_time_peak,
            iman_dwn_num_peak,#iman_up_num_peak
        ))

        vals.append(feat_val)
    
  return pd.DataFrame(columns=features, data=vals).fillna(0)

## Dev Playground

In [35]:
chunk_size = 300

## Extended Model (low/med/high)

### Create Features

In [36]:
%%time
low_feat = create_features(low_dfs, chunk_size)
threesixty_feat = create_features(threesixty_dfs, chunk_size)
med_feat = create_features(med_dfs, chunk_size)
seventwenty_feat = create_features(seventwenty_dfs, chunk_size)
high_feat = create_features(high_dfs, chunk_size)

CPU times: user 1min 14s, sys: 737 ms, total: 1min 15s
Wall time: 1min 15s


In [37]:
low_feat['resolution'] = 1
threesixty_feat['resolution'] = 1
med_feat['resolution'] = 2
seventwenty_feat['resolution'] = 3
high_feat['resolution'] = 3

In [38]:
training = pd.concat([low_feat, threesixty_feat, med_feat,seventwenty_feat, high_feat]).reset_index(drop=True)
#training = pd.concat([low_feat, med_feat, high_feat]).reset_index(drop=True)

In [39]:
## SELECT SUBSETS OF FEATURES

#training = training[['dwl_bytes_avg','dwl_peak_prom','upl_bytes_std','dwl_bytes_std','upl_peak_std','resolution']]
#training = training[['dwl_bytes_avg','upl_max_psd','dwl_max_psd','upl_peak_prom','dwl_num_peak','dwl_peak_prom','resolution']]
#training = training[['dwl_max_psd','upl_max_psd','dwl_peak_prom','upl_peak_prom','dwl_num_peak','dwl_bytes_avg','upl_bytes_std','upl_bytes_avg','resolution']]


In [40]:
training

Unnamed: 0,dwl_peak_freq,dwl_peak_prom,dwl_max_psd,dwl_bytes_avg,dwl_bytes_std,dwl_peak_avg,dwl_peak_std,upl_peak_freq,upl_peak_prom,upl_max_psd,upl_bytes_avg,upl_bytes_std,upl_peak_avg,upl_peak_std,dwl_time_peak,dwl_num_peak,resolution
0,0.203125,2.067111e+05,1.575466e+05,74919.477178,234944.058940,3313.500000,3626.211338,0.398438,10126.192987,7042.031568,5038.771784,13131.996598,3180.960000,3611.240845,13.000000,2.0,1
1,0.835938,1.695320e+05,1.190181e+05,52953.709184,189342.162055,5207.142857,4297.348233,0.101562,9795.036377,7680.135744,3749.346939,9452.349820,5207.142857,4297.348233,-1.000000,1.0,1
2,0.234375,1.458635e+05,1.273509e+05,53921.263441,187013.108307,5168.642857,4197.561139,0.101562,9209.022157,7792.282403,3889.161290,9249.333985,5168.642857,4197.561139,-1.000000,1.0,1
3,0.023438,1.582415e+05,1.249173e+05,54971.453202,177587.670312,4389.062500,5208.934614,0.328125,8731.614547,5625.941138,4382.896552,9771.021738,4009.263158,5296.348433,-1.000000,0.0,1
4,0.031250,2.183220e+05,1.934314e+05,61085.956522,184739.231965,4513.470588,4773.580591,0.031250,10758.495890,9412.021724,4099.266304,9195.293407,4262.722222,4752.317117,-1.000000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,0.187500,1.132210e+06,1.067360e+06,272979.757848,607287.699478,3024.321429,2261.033251,0.187500,54511.415466,51763.803227,14006.955157,28828.563426,2920.034483,2245.368897,10.769231,27.0,3
274,0.187500,1.172365e+06,1.107221e+06,283183.631579,643340.966516,2712.903226,1970.125759,0.187500,56393.386537,53164.228734,14486.934211,30594.990566,2712.903226,1970.125759,10.035714,29.0,3
275,0.109375,1.126711e+06,1.037700e+06,329029.936073,666420.208304,2556.125000,1817.073226,0.109375,54170.918310,50284.230909,16728.109589,31617.181082,2556.125000,1817.073226,9.862069,30.0,3
276,0.093750,1.277462e+06,1.233146e+06,259038.224576,579092.410102,2574.031250,3048.064411,0.093750,60714.980350,58427.532006,13478.610169,27556.855153,2657.064516,3061.068366,10.200000,26.0,3


### Train model

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score,accuracy_score

In [42]:
X, y = training.drop(columns=['resolution']), training['resolution']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 4,stratify=training['resolution'])

In [43]:
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42)
#classifier = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

### Test Set

In [46]:
## RUN ON TEST DATA

low_test = pd.read_csv('../data/test/sgs008-109-action-240p-20210202.csv')
med_test = pd.read_csv('../data/test/sgs008-109-action-480p-20210202.csv')
high_test = pd.read_csv('../data/test/sgs008-109-action-1080p-20210202.csv')
threesixty_test = pd.read_csv('../data/test/sgs008-109-action-360p-20210213.csv')
seventwenty_test = pd.read_csv('../data/test/sgs008-109-action-720p-20210213.csv')
#low_test = pd.read_csv('../data/test/sgs008-109-action-360p-20210213.csv')
#low_test = pd.read_csv('../data/test/stdoan-102-action-720p-20201206.csv')

low_test_feat = create_features([low_test], chunk_size)
threesixty_test_feat = create_features([threesixty_test], chunk_size)
med_test_feat = create_features([med_test], chunk_size)
seventwenty_test_feat = create_features([seventwenty_test], chunk_size)
high_test_feat = create_features([high_test], chunk_size)

low_test_feat['resolution'] = 1
threesixty_test_feat['resolution'] = 1
med_test_feat['resolution'] = 2
seventwenty_test_feat['resolution'] = 3
high_test_feat['resolution'] = 3

test = pd.concat([low_test_feat, threesixty_test_feat, med_test_feat,seventwenty_test_feat, high_test_feat])

classifier.fit(X,y)
y_pred = classifier.predict(test.drop('resolution',axis=1))
(pd.crosstab(test['resolution'], y_pred, rownames=['Actual Group'], colnames=['Predicted Group']))

Predicted Group,1,2,3
Actual Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11,1,0
2,0,5,1
3,0,0,12


In [47]:
accuracy_score(test['resolution'], y_pred)

0.9333333333333333

In [34]:
import pickle
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as f:
    pickle.dump(classifier, f)

In [32]:
# test = create_features([pd.read_csv('../data/test/testme.csv')],chunk_size)
# classifier.predict(test)

array([3, 2, 1, 2, 1])

### Validation Set

In [44]:
## FOR VALIDATION SET

y_pred = classifier.predict(X_test)

In [45]:
(pd.crosstab(y_test, y_pred, rownames=['Actual Group'], colnames=['Predicted Group']))

Predicted Group,1,2,3
Actual Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,23,6,0
2,1,10,4
3,1,2,23


In [28]:
f1_score(y_test, y_pred, average=None)

array([0.88741722, 0.69444444, 0.89051095])

In [29]:
accuracy_score(y_test, y_pred)

0.85

In [72]:
from sklearn.model_selection import cross_val_score
cross_val_score(classifier,X,y,cv=10)

array([0.76666667, 0.53571429, 0.75      , 0.75      , 0.82142857,
       0.64285714, 0.71428571, 0.67857143, 0.7037037 , 0.56      ])

### Feature Importances

In [57]:
# features = ['dwl_bytes_avg','dwl_peak_prom','upl_bytes_std','dwl_bytes_std','upl_peak_std']
# importances = classifier.feature_importances_
# indices = np.argsort(importances)[::-1]
# for i in indices:
#     print(features[i],': ',importances[i])
    

In [58]:
# features = ['dwl_bytes_avg','upl_max_psd','dwl_max_psd','upl_peak_prom','dwl_num_peak','dwl_peak_prom']
# importances = classifier.feature_importances_
# indices = np.argsort(importances)[::-1]
# for i in indices:
#     print(features[i],': ',importances[i])

In [64]:
features = ['dwl_peak_freq','dwl_peak_prom','dwl_max_psd','dwl_bytes_avg','dwl_bytes_std','dwl_peak_avg',
            'dwl_peak_std','upl_peak_freq','upl_peak_prom','upl_max_psd','upl_bytes_avg','upl_bytes_std',
            'upl_peak_avg','upl_peak_std','dwl_time_peak','dwl_num_peak']
importances = classifier.feature_importances_
indices = np.argsort(importances)[::-1]
for i in indices:
    print(features[i],': ',importances[i])
    

dwl_peak_prom :  0.13713088380289176
dwl_num_peak :  0.12245980947453192
dwl_max_psd :  0.11984258111516699
dwl_time_peak :  0.11032937896712422
upl_max_psd :  0.07179857958734272
dwl_bytes_std :  0.06556642367311044
upl_peak_prom :  0.06496325641984468
dwl_bytes_avg :  0.04632395553127444
upl_bytes_std :  0.044282375150057056
dwl_peak_avg :  0.03945016693292953
upl_bytes_avg :  0.03772954757472542
dwl_peak_std :  0.03490470282877489
upl_peak_avg :  0.030256294796682515
upl_peak_std :  0.02789525781953951
upl_peak_freq :  0.023926239489693947
dwl_peak_freq :  0.023140546836309957


In [32]:
# ## testing that feature method functions correctly

# l_start = 0 
# l_end = 60

# test_chunk = stdoan_low.copy()
# test_chunk['Time'] = test_chunk['Time'] - test_chunk['Time'].min()
# low_chunk = stdoan_low[(stdoan_low['Time'] >= 0) & (stdoan_low['Time'] < 60)]

# low_chunk_ms = convert_ms_df(low_chunk, True)

# upl_ms = low_chunk_ms[low_chunk_ms['pkt_src'] == '1']
# dwl_ms = low_chunk_ms[low_chunk_ms['pkt_src'] == '2']

# dwl_chunk_rs = dwl_ms.resample('500ms', on='Time').sum()

# f_dwl, Pxx_dwl = sp.signal.welch(dwl_chunk_rs['pkt_size'], fs=2)

## Basline Model (3 resolutions)

In [101]:
## BASELINE MODEL

baseline_training = pd.concat([low_feat, med_feat, high_feat]).reset_index(drop=True)
X, y = baseline_training.drop(columns=['resolution']), baseline_training['resolution']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 4,stratify=baseline_training['resolution'])
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
(pd.crosstab(y_test, y_pred, rownames=['Actual Group'], colnames=['Predicted Group']))

Predicted Group,1,3,5
Actual Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,13,2,0
3,1,14,0
5,0,0,13


In [102]:
accuracy_score(y_test, y_pred)

0.9302325581395349

In [103]:
## RUN ON TEST DATA

low_test = pd.read_csv('../data/test/sgs008-109-action-240p-20210202.csv')
med_test = pd.read_csv('../data/test/sgs008-109-action-480p-20210202.csv')
high_test = pd.read_csv('../data/test/sgs008-109-action-1080p-20210202.csv')
# threesixty_test = pd.read_csv('../data/test/sgs008-109-action-360p-20210213.csv')
# seventwenty_test = pd.read_csv('../data/test/sgs008-109-action-720p-20210213.csv')
#low_test = pd.read_csv('../data/test/sgs008-109-action-360p-20210213.csv')
#low_test = pd.read_csv('../data/test/stdoan-102-action-720p-20201206.csv')

low_test_feat = create_features([low_test], chunk_size)
#threesixty_test_feat = create_features([threesixty_test], chunk_size)
med_test_feat = create_features([med_test], chunk_size)
#seventwenty_test_feat = create_features([seventwenty_test], chunk_size)
high_test_feat = create_features([high_test], chunk_size)

low_test_feat['resolution'] = 1
#threesixty_test_feat['resolution'] = 2
med_test_feat['resolution'] = 3
#seventwenty_test_feat['resolution'] = 4
high_test_feat['resolution'] = 5

test = pd.concat([low_test_feat, med_test_feat, high_test_feat])

classifier.fit(X,y)
y_pred = classifier.predict(test.drop('resolution',axis=1))
(pd.crosstab(test['resolution'], y_pred, rownames=['Actual Group'], colnames=['Predicted Group']))

Predicted Group,1,3,5
Actual Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,6,0,0
3,0,6,0
5,0,1,5


In [104]:
accuracy_score(test['resolution'], y_pred)

0.9444444444444444