In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
# grabs all the data stored on the dsmlp server of all the files every student uploaded in the domain
filepath = "/teams/DSC180A_FA20_A00/b05vpnxray/data/unzipped"
data = os.listdir(filepath)

In [5]:
# This function returns a dataframe with the packets times, sizes, and directions for a single row of data
def three_cols(row):
    time = list(map(int, row['packet_times'].split(';')[0:-1]))
    size = list(map(int, row['packet_sizes'].split(';')[0:-1]))
    dirs = list(map(int, row['packet_dirs'].split(';')[0:-1]))
    dict1 = {'packet_time': time, 'packet_size': size, 'packet_dir': dirs}
    return pd.DataFrame(dict1)

# This function takes all the counts of the 0-300bytes for the 1->2 Direction and all the counts
# of the 1200-1500bytes for the 2->1 Direction and creates sum values for the two features per dataset.
# uses the three_cols function as a helper function
def big_byte_count_feature(dataset):        
    packet_size_count1 = []
    packet_size_count2 = []
    for i in range(dataset.shape[0]):
        row = three_cols(dataset.iloc[i])
        ones = row.loc[row['packet_dir'] == 1]['packet_size']
        twos = row.loc[row['packet_dir'] == 2]['packet_size']
        one_count=0
        two_count=0
        for packet in ones:
            if (int(packet) >= 0) and (int(packet) <= 300):
                one_count += 1
        for packet in twos:
            if (int(packet) >= 1200) and (int(packet) <= 1500):
                two_count += 1
        packet_size_count1.append(one_count)
        packet_size_count2.append(two_count)
    return [sum(packet_size_count1), sum(packet_size_count2)]

In [15]:
# creates the features and labels for a set list of datasets
# input: filepaths
# output: 4 lists -> associated file names, labels, feature1, feature2
# uses the big_byte_count_feature as a helper function
def features_labels(files):
    Dir1_ByteCount_0to300_feature = []
    Dir2_ByteCount_1200to1500_feature = []
    labels = []
    file_names = []
    for file in files:
        if ('novpn' in file) or (file[:2] == '._'):
            continue
        if 'novideo' in file:
            labels.append(0)
        else:
            labels.append(1)
        file_names.append(file)
        df = pd.read_csv('/teams/DSC180A_FA20_A00/b05vpnxray/data/unzipped/' + file)
        sum_values = big_byte_count_feature(df)
        Dir1_ByteCount_0to300_feature.append(sum_values[0])
        Dir2_ByteCount_1200to1500_feature.append(sum_values[1])
    feature_df = pd.DataFrame(data={'Dir1_ByteCount_0to300_feature': Dir1_ByteCount_0to300_feature,
                                    'Dir2_ByteCount_1200to1500_feature': Dir2_ByteCount_1200to1500_feature})
    return file_names, labels, feature_df 

# accesses the data file found within the data folder and creates the features and label for it
# uses the big_byte_count_feature as a helper function
def input_feature_label():
    Dir1_ByteCount_0to300_feature = []
    Dir2_ByteCount_1200to1500_feature = []
    labels = []
    file_names = []
    filepath = "input_data"
    files = os.listdir(filepath)
    for file in files:
        if ('novpn' in file) or (file[:2] == '._'):
            return "File Invalid. Must be vpn data, nor can it be empty."
        if 'novideo' in file:
            labels.append(0)
        else:
            labels.append(1)
        file_names.append(file)
        df = pd.read_csv('input_data/' + file)
        sum_values = big_byte_count_feature(df)
        Dir1_ByteCount_0to300_feature.append(sum_values[0])
        Dir2_ByteCount_1200to1500_feature.append(sum_values[1])
    feature_df = pd.DataFrame(data={'Dir1_ByteCount_0to300_feature': Dir1_ByteCount_0to300_feature,
                                    'Dir2_ByteCount_1200to1500_feature': Dir2_ByteCount_1200to1500_feature})
    return file_names, labels, feature_df 

In [7]:
#Trains, tests, and splits the data up so that RandomForestClassifier can be used 
#to train on the data and then determine how accuracte the model is
def ml_model_analysis(X, y):
    model = RandomForestClassifier()
    X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25, random_state=42)
    model = model.fit(X_tr[['Dir1_ByteCount_0to300_feature','Dir2_ByteCount_1200to1500_feature']],y_tr)
    prediction_test = model.predict(X_ts[['Dir1_ByteCount_0to300_feature','Dir2_ByteCount_1200to1500_feature']])
    prediction_train = model.predict(X_tr[['Dir1_ByteCount_0to300_feature','Dir2_ByteCount_1200to1500_feature']])
    print (("Base test accuracy", metrics.accuracy_score(y_ts, prediction_test)), 
            ("Base Train Accuracy", metrics.accuracy_score(y_tr, prediction_train)))
    print (("Base test Confusion Matrix", metrics.confusion_matrix(y_ts, prediction_test)), 
            ("Base Train Confusion Matrix", metrics.confusion_matrix(y_tr, prediction_train)))
    return prediction_test, y_ts

In [8]:
#Trains the model on all the data found within the GoodData on dsmlp, and then predicts 
#whether streaming or not for the input data chunk entered
def ml_model_train(X, y, input_X, input_y):
    model = RandomForestClassifier()
    model = model.fit(X[['Dir1_ByteCount_0to300_feature','Dir2_ByteCount_1200to1500_feature']],y)
    prediction = model.predict(input_X[['Dir1_ByteCount_0to300_feature','Dir2_ByteCount_1200to1500_feature']])
    if prediction == 1:
        val = True
    else:
        val = False
    return ("Prediction Value: " + str(prediction[0]), "True Value: " + str(input_y[0]), 'is_streaming? : ' + str(val))

In [9]:
# #This is to find the optimal paramaters for our DTC
# tree_para = {"tree__max_depth":[2,3,4,5,6,7,8,9,10,11,12,15,20,30,40,None], 
#     'tree__min_samples_split':[2,3,5,7,10,15,20],
#     'tree__min_samples_leaf':[2,3,5,7,10,15,20],
#     "tree__criterion":["gini","entropy"]}
# model = GridSearchCV(ps, param_grid=tree_para, cv=3, n_jobs=-1, verbose=1)
# model = model.fit(X_tr[["Country","DollarPerView"]],y_tr)
#Gives us values of optimal params
#model.best_params_

## Running the Code

In [10]:
# This runs the features_labels function and creates the features and the labels for the data present on the dsmlp server
file_names, file_labels, new_df = features_labels(data)

In [11]:
#call the ml_model_analysis function that inputs the features and labels, 
#trains the ml algorithm, and outputs the accuracy of the algorithm
prediction_labels, actual_labels = ml_model_analysis(new_df, file_labels)

('Base test accuracy', 0.7692307692307693) ('Base Train Accuracy', 1.0)
('Base test Confusion Matrix', array([[ 0,  4],
       [ 2, 20]])) ('Base Train Confusion Matrix', array([[15,  0],
       [ 0, 60]]))


Using RandomForestClassifier, I recieved a test accuracy of 91%. When run over multiple times, the test accuracy averages out to roughly 88%, which is still higher overall than if I were to use a DecisionTreeClassifier and recieve an accuracy of 85%.

In [16]:
file_name, input_label, input_features = input_feature_label()
ml_model_train(new_df, file_labels, input_features, input_label)

('Prediction Value: 0', 'True Value: 0', 'is_streaming? : False')

From the cell above, our prediction determined that the data file was indeed streaming data. By looking at the cell below, we can confirm that it was indeed streaming data.

In [None]:
file_name

In [18]:
print("Prediction Labels: ")
print(prediction_labels)
print("Actual Labels: ")
print(actual_labels)

Prediction Labels: 
[1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Actual Labels: 
[1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
