## PCA on both test and train datasets
### The test data is projected onto the principal components fitted on the train data

In [1]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import gc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### working on the training data

In [2]:
# Read the processed training CSV with dask and drop the 'Unnamed: 0'
# column (presumably a saved index -- confirm against the file).
train_data = dd.read_csv('/home/aban/somu/review-2/processed_data.csv').drop(['Unnamed: 0'], axis=1)

# Column labels: 46 flow features followed by the class label.
train_data.columns = [
    'Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts',
    'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
    'Fwd Pkt Len Mean', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
    'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std',
    'Flow IAT Max', 'Flow IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
    'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
    'Fwd URG Flags', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
    'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt',
    'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
    'Down/Up Ratio', 'Fwd Seg Size Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
    'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max',
    'Active Min', 'Idle Min',
    'Label',
]

# Cast the label to int8, then materialize the dask frame in memory.
train_data = train_data.astype({'Label': np.int8}).compute()

# Features are the first 46 columns; the target is the 'Label' column.
x_train = train_data.iloc[:, :46]
y_train = train_data['Label']


### Standardization of the train dataset (zero mean and unit variance per feature)

In [3]:
# Learn the per-feature statistics from the training features only, then
# apply that same transform to obtain the standardized training matrix.
scaler = StandardScaler().fit(x_train)
scaled_train = scaler.transform(x_train)


### Finding the 'k' principal components which can retain 95% of the variance

In [None]:
# Covariance between features of the standardized training data
# (rowvar=False: columns are variables, rows are observations).
Covar = np.cov(scaled_train, rowvar=False)

# `s` holds the singular values of the covariance matrix and `vs` their sum.
u, s, v = np.linalg.svd(Covar)
vs = s.sum()


In [4]:
def find_k(s, threshold=0.95):
    """Return how many leading entries of ``s`` are needed to reach ``threshold``.

    The entries of ``s`` are accumulated in order and divided by the total of
    ``s``; the result is the 1-based position of the first entry at which that
    cumulative share is greater than or equal to ``threshold``.

    Parameters
    ----------
    s : array-like of float
        Per-component weights, e.g. the singular values of the covariance
        matrix, in the order in which components should be kept.
    threshold : float, default 0.95
        Fraction of the total that must be retained.

    Returns
    -------
    int
        Smallest ``k`` such that ``sum(s[:k]) / sum(s) >= threshold``. If the
        threshold is never reached (for example because of floating-point
        rounding on the last entry, or a threshold above 1), ``len(s)`` is
        returned, i.e. every component is kept.

    Raises
    ------
    ValueError
        If ``s`` is empty or its total is not positive, because the retained
        fraction is undefined in that case.
    """
    weights = np.asarray(s, dtype=float).ravel()
    if weights.size == 0:
        raise ValueError("s must contain at least one value")

    # The total is derived from `s` itself rather than read from a
    # module-level variable, so the result only depends on the arguments.
    total = weights.sum()
    if not total > 0:
        raise ValueError("the sum of s must be positive")

    retained = np.cumsum(weights) / total
    reached = np.flatnonzero(retained >= threshold)
    if reached.size == 0:
        # Never reached: keeping everything is the closest valid answer.
        return int(weights.size)
    return int(reached[0]) + 1


In [5]:
# Number of principal components to keep, chosen by find_k from the
# singular values `s` of the training covariance matrix.
k = find_k(s)
print(k)

24


### working on the test data

In [6]:
# Load the test CSV and keep only the 46 feature columns, listed in the same
# order as the training feature columns.
test_data = pd.read_csv('/home/aban/somu/review-2/test_togive.csv')[[
    'Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts',
    'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
    'Fwd Pkt Len Mean', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
    'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std',
    'Flow IAT Max', 'Flow IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
    'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
    'Fwd URG Flags', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
    'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt',
    'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
    'Down/Up Ratio', 'Fwd Seg Size Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
    'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max',
    'Active Min', 'Idle Min',
]]
# Columns keep the dtypes inferred by read_csv; no downcasting via
# pd.to_numeric(..., downcast=...) is applied.



In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) per test column.
test_desc = test_data.describe()

In [9]:
# Print the summary statistics one column at a time so that no column is
# elided from the output.
for col, col_stats in test_desc.items():
    print(col_stats)

count    1000000.000000
mean        8976.166102
std        18751.140849
min            0.000000
25%           53.000000
50%           80.000000
75%         3389.000000
max        65534.000000
Name: Dst Port, dtype: float64
count    1000000.000000
mean           8.766909
std            4.928834
min            0.000000
25%            6.000000
50%            6.000000
75%           17.000000
max           17.000000
Name: Protocol, dtype: float64
count    1.000000e+06
mean     1.128385e+07
std      9.195424e+08
min     -9.190110e+11
25%      5.060000e+02
50%      2.197350e+04
75%      3.005524e+06
max      1.200000e+08
Name: Flow Duration, dtype: float64
count    1000000.000000
mean          24.694678
std         1599.647020
min            1.000000
25%            1.000000
50%            2.000000
75%            5.000000
max       243585.000000
Name: Tot Fwd Pkts, dtype: float64
count    1000000.000000
mean           6.577036
std          170.671400
min            0.000000
25%            0.00

In [10]:
# Names of the test columns that contain at least one +inf or -inf value.
col_name = test_data.columns.to_series().loc[np.isinf(test_data).any()]
print(col_name)

Flow Byts/s    Flow Byts/s
Flow Pkts/s    Flow Pkts/s
dtype: object


In [14]:
# Names of the test columns that contain at least one NaN value.
nan_col_name = test_data.columns.to_series().loc[np.isnan(test_data).any()]
print(nan_col_name)

Series([], dtype: object)


In [12]:
# Report how many infinite entries each affected column holds.
for column in col_name:
    infi_count = np.isinf(test_data[column].to_numpy()).sum()
    print("Column {} has {} infinite values".format(column, infi_count))

Column Flow Byts/s has 2323 infinite values
Column Flow Pkts/s has 2323 infinite values


In [15]:
# Turn +/-inf in the two rate columns into NaN so that they can be filled
# like ordinary missing values.
for rate_column in ('Flow Byts/s', 'Flow Pkts/s'):
    test_data[rate_column] = test_data[rate_column].replace([np.inf, -np.inf], np.nan)

In [16]:
# Largest remaining value of each rate column (max() skips NaN by default).
byts_max = test_data['Flow Byts/s'].max()
pkts_max = test_data['Flow Pkts/s'].max()

# Fill the missing entries of each rate column with twice that column's maximum.
test_data = test_data.fillna({
    'Flow Byts/s': 2 * byts_max,
    'Flow Pkts/s': 2 * pkts_max,
})

In [27]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [9]:
# Summary statistics of the training features; rows 1 and 2 of describe()
# are 'mean' and 'std' respectively.
train_desc = x_train.describe()

# Keep both the (1, n_features) row vectors and their flattened 1-D forms.
mean_vec, std_vec = (train_desc.iloc[row:row + 1, :].to_numpy() for row in (1, 2))
mean_array, std_array = mean_vec[0], std_vec[0]


### standardizing wrt train data distribution

In [20]:
# Standardize the test features with the scaler that was fitted on the
# training data, in one vectorized call.
#
# Why not the earlier per-row loop over mean_array / std_array:
#   * it performed one Python-level iloc assignment per test row;
#   * describe() reports the sample standard deviation (ddof=1), which is not
#     the statistic StandardScaler used to build `scaled_train`, so the test
#     data ended up on a slightly different scale than the PCA's input;
#   * a training column with zero variance would cause a division by zero,
#     whereas StandardScaler leaves such columns unscaled.
#
# The result is wrapped back into a DataFrame with the original labels so
# that `test_data` keeps its type, columns and index for the cells below.
# NOTE: like the loop it replaces, this cell rescales `test_data` in place
# of the raw values, so it must be run exactly once per kernel session.
test_data = pd.DataFrame(
    scaler.transform(test_data),
    columns=test_data.columns,
    index=test_data.index,
)

In [17]:
# Fit a PCA that keeps k components, using the standardized training data only.
pca = PCA(n_components=k)
pca.fit(scaled_train)


In [26]:
# Project the standardized training data onto the fitted principal components.
train_pc = pca.transform(scaled_train)


In [18]:
# Project the test data with the PCA that was fitted on the training data.
# NOTE(review): assumes test_data has already been standardized by the
# preceding cell -- confirm the cells were run in that order.
test_pc = pca.transform(test_data)



In [20]:
# Wrap the projected test data in a DataFrame with one column per component.
# The labels ('pc 1', 'pc 2', ...) are generated from the actual number of
# components in test_pc, so this cell keeps working when k is not 24.
principalDf = pd.DataFrame(
    data=test_pc,
    columns=['pc {}'.format(component) for component in range(1, test_pc.shape[1] + 1)],
)

In [None]:
# Persist the projected test data; with the default settings the row index
# is written as the first column of the CSV.
principalDf.to_csv('/home/aban/somu/review-2/pca_test.csv')