# In [3]:
import pandas as pd
import numpy as np
import os

# This function returns a dataframe with the packets times, sizes, and directions for a single row of data.
# This will be used within our other functions to help create the features.
def three_cols(row):
    """Expand one record's semicolon-delimited packet fields into a per-packet DataFrame.

    ``row`` must support ``row[key]`` for 'packet_times', 'packet_sizes' and
    'packet_dirs'; each value is a ';'-separated string of integers. The last
    ';'-separated token of every string is always discarded, which removes
    the empty token left behind by a trailing ';'.

    Returns a DataFrame with the columns 'packet_time', 'packet_size' and
    'packet_dir', one row per packet.
    """
    def parse(field):
        # Drop the final token, then convert everything that remains to int.
        return [int(token) for token in row[field].split(';')[:-1]]

    return pd.DataFrame({
        'packet_time': parse('packet_times'),
        'packet_size': parse('packet_sizes'),
        'packet_dir': parse('packet_dirs'),
    })

# This function takes all the counts of the 0-300bytes for the 1->2 Direction and all the counts
# of the 1200-1500bytes for the 2->1 Direction and creates sum values for the two features per dataset.
# uses the three_cols function as a helper function
def big_byte_count_feature(dataset):
    """Count small direction-1 packets and large direction-2 packets over a whole dataset.

    Every row of ``dataset`` is expanded with ``three_cols``. Across all rows
    the function tallies:

    * packets with ``packet_dir == 1`` whose size is within 0..300 (inclusive)
    * packets with ``packet_dir == 2`` whose size is within 1200..1500 (inclusive)

    Returns a two-element list ``[direction_1_total, direction_2_total]``.
    """
    small_dir1_total = 0
    large_dir2_total = 0
    for idx in range(len(dataset)):
        packets = three_cols(dataset.iloc[idx])
        sizes = packets['packet_size']
        dirs = packets['packet_dir']
        small_dir1_total += sum(1 for s in sizes[dirs == 1] if 0 <= int(s) <= 300)
        large_dir2_total += sum(1 for s in sizes[dirs == 2] if 1200 <= int(s) <= 1500)
    return [small_dir1_total, large_dir2_total]
  
  
# input: path to a directory of data files
# output: file names, labels, and a DataFrame holding the two features (one row per file)
# uses the big_byte_count_feature as a helper function
def features_labels(filepath):
    """Build the label and the two byte-count features for every usable file in a directory.

    Entries of ``os.listdir(filepath)`` whose name contains 'novpn' or starts
    with '._' are skipped. Each remaining file is labelled 0 when its name
    contains 'novideo' and 1 otherwise, read with ``pd.read_csv`` and
    summarised by ``big_byte_count_feature``.

    Returns ``(file_names, labels, feature_df)``; ``feature_df`` has the
    columns 'Dir1_ByteCount_0to300_feature' and
    'Dir2_ByteCount_1200to1500_feature', one row per kept file, in the same
    order as ``file_names`` and ``labels``.
    """
    kept_names = []
    kept_labels = []
    dir1_small_counts = []
    dir2_large_counts = []

    for name in os.listdir(filepath):
        # Guard clause: non-VPN captures and '._'-prefixed entries are ignored.
        if name.startswith('._') or 'novpn' in name:
            continue
        kept_labels.append(0 if 'novideo' in name else 1)
        kept_names.append(name)
        dir1_count, dir2_count = big_byte_count_feature(pd.read_csv(filepath + '/' + name))
        dir1_small_counts.append(dir1_count)
        dir2_large_counts.append(dir2_count)

    feature_df = pd.DataFrame(data={
        'Dir1_ByteCount_0to300_feature': dir1_small_counts,
        'Dir2_ByteCount_1200to1500_feature': dir2_large_counts,
    })
    return kept_names, kept_labels, feature_df

# accesses the data file found within the data folder and creates the features and label for it
# uses the big_byte_count_feature as a helper function
def input_feature_label(filepath):
    """Build the label and the two byte-count features for the input file(s) in a directory.

    Every entry of ``os.listdir(filepath)`` is processed. Unlike
    ``features_labels``, an unusable entry is not skipped: as soon as a name
    containing 'novpn' or starting with '._' is met, the function stops and
    returns an error message string instead of the usual tuple.

    Args:
        filepath: Directory holding the input CSV file(s). It may be given
            with or without a trailing path separator.

    Returns:
        ``(file_names, labels, feature_df)`` on success, where a label is 0
        when the file name contains 'novideo' and 1 otherwise, and
        ``feature_df`` has the columns 'Dir1_ByteCount_0to300_feature' and
        'Dir2_ByteCount_1200to1500_feature' (one row per file).
        The string "File Invalid. Must be vpn data, nor can it be empty."
        when an unusable file name is encountered.
    """
    dir1_small_counts = []
    dir2_large_counts = []
    labels = []
    file_names = []
    for file in os.listdir(filepath):
        if ('novpn' in file) or file.startswith('._'):
            return "File Invalid. Must be vpn data, nor can it be empty."
        labels.append(0 if 'novideo' in file else 1)
        file_names.append(file)
        # Plain `filepath + file` only worked when filepath ended with a
        # separator; os.path.join adds one only when it is missing.
        df = pd.read_csv(os.path.join(filepath, file))
        sum_values = big_byte_count_feature(df)
        dir1_small_counts.append(sum_values[0])
        dir2_large_counts.append(sum_values[1])
    feature_df = pd.DataFrame(data={'Dir1_ByteCount_0to300_feature': dir1_small_counts,
                                    'Dir2_ByteCount_1200to1500_feature': dir2_large_counts})
    return file_names, labels, feature_df

# In [4]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

#Splits the data into train and test sets, fits a RandomForestClassifier on the
#training set, and prints the model's accuracy on both the test and train sets
def ml_model_analysis(X, y):
    """Fit a random forest on a train split and report test/train accuracy.

    ``X`` and ``y`` are split with ``train_test_split`` (25% held out,
    ``random_state=42``). A ``RandomForestClassifier`` built with default
    arguments is fitted on the two byte-count feature columns of the training
    part, and the accuracy on the test and training parts is printed.

    Returns ``(prediction_test, y_ts)``: the predictions for the held-out
    rows and their true labels.
    """
    feature_cols = ['Dir1_ByteCount_0to300_feature', 'Dir2_ByteCount_1200to1500_feature']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
    forest = RandomForestClassifier().fit(X_train[feature_cols], y_train)
    prediction_test = forest.predict(X_test[feature_cols])
    prediction_train = forest.predict(X_train[feature_cols])
    test_report = ("Base test accuracy", metrics.accuracy_score(y_test, prediction_test))
    train_report = ("Base Train Accuracy", metrics.accuracy_score(y_train, prediction_train))
    # Printed as two tuples, exactly like the original output format.
    print(test_report, train_report)
    return prediction_test, y_test

  
#Trains the model on all the data passed in (e.g. the GoodData found on dsmlp) and saves the
#fitted model to filename; predicting whether the input data chunk is streaming or not is done by classifer
def ml_model_train(X, y, input_X, input_y, filename):
    """Fit a random forest on all of ``X``/``y`` and persist it with joblib.

    Only the two byte-count feature columns of ``X`` are used for fitting.
    The fitted model is written to ``filename`` via ``joblib.dump``.
    ``input_X`` and ``input_y`` are accepted but not used by this function.

    Returns None.
    """
    feature_cols = ['Dir1_ByteCount_0to300_feature', 'Dir2_ByteCount_1200to1500_feature']
    fitted_forest = RandomForestClassifier().fit(X[feature_cols], y)
    # Persist the fitted model so it can be loaded again later.
    joblib.dump(fitted_forest, filename)

def classifer(input_model, filename, input_X=None, input_y=None):
    """Load a saved model from ``filename`` and print each prediction beside its true label.

    Args:
        input_model: Not used by this function; retained so existing
            two-argument calls keep working.
        filename: Passed to ``joblib.load`` to restore the fitted model.
        input_X: Features to classify; indexed with the two byte-count
            feature column names. When omitted, the module-level variable
            ``input_X`` is used.
        input_y: True labels, in the same order as the rows of ``input_X``.
            When omitted, the module-level variable ``input_y`` is used.

    Raises:
        NameError: If ``input_X`` or ``input_y`` is omitted and no
            module-level variable of that name exists.

    Returns None; the per-sample comparison is printed.
    """
    feature_cols = ['Dir1_ByteCount_0to300_feature', 'Dir2_ByteCount_1200to1500_feature']

    # The body used to read `input_X` / `input_y` without receiving them, so it
    # only worked when those names happened to exist as globals. They can now
    # be passed explicitly; the global lookup remains as a fallback so that
    # existing two-argument calls behave as before.
    module_scope = globals()
    if input_X is None:
        if 'input_X' not in module_scope:
            raise NameError("name 'input_X' is not defined")
        input_X = module_scope['input_X']
    if input_y is None:
        if 'input_y' not in module_scope:
            raise NameError("name 'input_y' is not defined")
        input_y = module_scope['input_y']

    loaded_model = joblib.load(filename)
    prediction = loaded_model.predict(input_X[feature_cols])
    # Pair by position so input_y may be a list or a Series with any index.
    for predicted, actual in zip(prediction, input_y):
        val = "Yes" if bool(predicted) == bool(actual) else "No"
        print("is_streaming? Prediction Value: " + str(bool(predicted)), "is_streaming? True Value: " + str(bool(actual)), "classified correctly? : " + val)
    return