# Imports

In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from sklearn.model_selection import train_test_split
import pickle

# set up spark environment

In [None]:
import os

# Driver JVM heap size handed to spark-submit; raise it (e.g. to '100g') on a
# host with more RAM.
memory = '10g'

# The submit arguments are placed in the environment before the SparkContext
# below is constructed.
pyspark_submit_args = ' --driver-memory {} pyspark-shell'.format(memory)
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

SparkContext.setSystemProperty('spark.executor.memory', '8g')

# Local-mode context (if using locally) and the SQL entry point used by the
# loader further down.
sc = SparkContext(master='local', appName='example')
sql_sc = SQLContext(sc)

# loading dataset

In [None]:
def loading_data(dataset):
    """
    Read a CSV file into a Spark dataframe and tidy up its label column.

    The file is read with a header row and schema inference. The column named
    ' Label' (leading space) is exposed as 'Label', the 'External IP' column is
    dropped, and rows whose label is null or equals the literal text ' Label'
    are removed (presumably header lines repeated inside the data -- TODO
    confirm). The per-label row counts are printed.

    :param dataset: path of the CSV file to load
    :return: the cleaned spark dataframe
    """
    reader = sql_sc.read.format('csv').options(header='true', inferSchema='true')
    frame = reader.load(dataset)

    # Re-project every column, aliasing only the space-prefixed label header.
    projection = []
    for name in frame.columns:
        if name == ' Label':
            projection.append(col(name).alias('Label'))
        else:
            projection.append(name)
    frame = frame.select(*projection)

    frame = frame.drop('External IP')

    # Discard unlabeled rows, then rows that carry the header text as a label.
    frame = frame.filter(frame.Label.isNotNull())
    frame = frame.filter(frame.Label != ' Label')

    label_counts = frame.groupBy('Label').count().collect()
    print(label_counts)
    return frame

# invoking

In [None]:
# Load the final CSV into a Spark dataframe through loading_data().
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this notebook on another host.
ds_path_final = '/home/kamat/Documents/kamat_MA/ids_with_rbfn/dataset/final.csv'
dataset=loading_data(ds_path_final)
print("loading dataset done")

# Selecting the feature columns of the PySpark DataFrame

In [None]:
def data_preprocessing(dataset):
    """
    Restrict a dataframe to the selected flow features plus the label.

    Columns are kept in the order they appear in ``dataset.columns``; wanted
    names that the dataframe does not contain are simply not selected.

    :param dataset: a spark dataframe
    :return: a spark dataframe holding only the wanted columns
    """
    # Names are compared exactly, so the leading space carried by some of the
    # headers is significant.
    wanted = {
        ' Flow Duration',
        ' Fwd IAT Min',
        ' Bwd IAT Mean',
        ' Fwd IAT Mean',
        'Init_Win_bytes_forward',
        ' Subflow Fwd Bytes',
        'Total Length of Fwd Packets',
        ' ACK Flag Count',
        ' Active Min',
        'Active Mean',
        ' Flow IAT Std',
        'Fwd PSH Flags',
        ' SYN Flag Count',
        'Fwd Packets/s',
        ' Bwd Packet Length Std',
        ' Init_Win_bytes_backward',
        ' Fwd Packet Length Mean',
        ' Average Packet Size',
        ' Bwd Packets/s',
        ' PSH Flag Count',
        ' Flow IAT Min',
        ' Flow IAT Mean',
        'Label',
    }

    kept = []
    for name in dataset.columns:
        if name in wanted:
            kept.append(name)
    return dataset.select(kept)

# pyspark DF -> pandas DF

In [None]:
# Reduce the Spark dataframe to the wanted columns and convert it to a pandas
# DataFrame.
pd_dfr = data_preprocessing(dataset).toPandas()
# Feature columns used to build X_p.
# NOTE(review): this list has 31 entries but only 22 distinct names (e.g.
# 'Total Length of Fwd Packets' and 'Init_Win_bytes_forward' are repeated).
# Indexing a pandas DataFrame with a list returns one column per list entry,
# so the repeated names appear as repeated columns in X_p -- confirm this is
# intended before changing it, since it determines the feature width used
# downstream.
featureList=[' Flow Duration', ' Fwd IAT Min', ' Bwd IAT Mean', ' Fwd IAT Mean','Init_Win_bytes_forward',' Subflow Fwd Bytes','Total Length of Fwd Packets',
      ' ACK Flag Count', ' Active Min', 'Active Mean',' Flow IAT Std','Init_Win_bytes_forward','Fwd PSH Flags',' SYN Flag Count',
      'Fwd Packets/s',' Bwd Packet Length Std','Total Length of Fwd Packets','Init_Win_bytes_forward',' Init_Win_bytes_backward','Total Length of Fwd Packets',
      'Total Length of Fwd Packets','Active Mean','Total Length of Fwd Packets',' Fwd Packet Length Mean',' Average Packet Size','Init_Win_bytes_forward', ' Bwd Packets/s', ' PSH Flag Count', ' Flow IAT Min', ' Fwd IAT Min', ' Flow IAT Mean']

# Feature matrix (pandas DataFrame) and label column (pandas Series).
X_p = pd_dfr[featureList]
Y_p = pd_dfr['Label']

In [None]:
# One indicator column per distinct value found in the Label series.
one_hot_encoded_labels = pd.get_dummies(Y_p)

## convert pandas DF -> Numpy

In [None]:
# Replace feature cells matching the pattern below with 0, then take the
# underlying NumPy arrays.
# NOTE(review): the pattern looks like a mis-decoded character coming from the
# CSV -- confirm which values it is meant to catch.
X = X_p.replace('�', 0, regex=True).values
Y = one_hot_encoded_labels.values

In [None]:
# Cast features and one-hot labels to float arrays for the resampling step.
X_f = X.astype(float)

Y_f = Y.astype(float)

In [None]:
# Sanity check: shapes of the feature matrix and of the one-hot label matrix.
print(X_f.shape, Y_f.shape)

## running SMOTE oversampling (originally SVMSMOTE) => an error occurs in the next cell.

In [None]:
# Oversample with SMOTE using a fixed seed so the resampled set is reproducible.
# Y_f is the one-hot label matrix built earlier, not a 1-D class vector.
sm = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X_f, Y_f)

# splitting train and test data

In [None]:
# First split: hold out 20% of the resampled data as the test set, stratified
# on the labels.
X_train, X_test, Y_train, Y_test = train_test_split(X_res, Y_res, test_size=0.20, random_state=0,stratify=Y_res)

# Second split: take 10% of the remaining training data as the validation set
# (roughly 72% train / 8% validation / 20% test overall).
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.10, random_state=0,stratify=Y_train)

In [None]:
# Sanity check: shapes of the train, validation and test arrays.
print(X_train.shape,
Y_train.shape,
X_val.shape,
Y_val.shape, X_test.shape, Y_test.shape)

# storing for another notebook

In [None]:
# Persist each split with IPython's %store magic so that another notebook can
# restore it (via `%store -r <name>`).
%store X_train
%store X_test
%store Y_train
%store Y_test
%store X_val
%store Y_val