In [34]:
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder, StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load balanced_dataset.csv into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='balanced_dataset.csv')

In [4]:
# Preview the first rows, then list every column label (one print call,
# newline-separated, so the text output matches two separate prints).
print(data.head(), data.columns, sep="\n")

   Unnamed: 0  Unnamed: 0.1  SRC_ADD  DES_ADD  PKT_ID  FROM_NODE  TO_NODE  \
0           0             0      3.0    24.30  389693         21       23   
1           1             1     15.0    24.15  201196         23       24   
2           2             3     24.9     9.00  443135         23       21   
3           3             4     24.8     8.00  157335         23       21   
4           4             5     24.1     1.00  219350         21        1   

  PKT_TYPE  PKT_SIZE    FLAGS    ...        PKT_RATE  BYTE_RATE  PKT_AVG_SIZE  \
0      tcp      1540  -------    ...      328.240918   505490.0        1540.0   
1      tcp      1540  -------    ...      328.205808   505437.0        1540.0   
2      ack        55  -------    ...      328.064183    18043.5          55.0   
3      ack        55  -------    ...      328.113525    18046.2          55.0   
4      ack        55  -------    ...      328.297902    18056.4          55.0   

   UTILIZATION PKT_DELAY PKT_SEND_TIME  PKT_RESEVE

In [8]:
# Remove the two "Unnamed" columns that came in with the CSV (presumably row
# indices written by an earlier export -- confirm against the data source).
# Non-mutating drop + errors="ignore" keeps this cell re-runnable: running it a
# second time is a no-op instead of raising KeyError on the missing columns.
data = data.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")

In [10]:
# List the remaining column labels.
print(data.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
data.info()

Index(['SRC_ADD', 'DES_ADD', 'PKT_ID', 'FROM_NODE', 'TO_NODE', 'PKT_TYPE',
       'PKT_SIZE', 'FLAGS', 'FID', 'SEQ_NUMBER', 'NUMBER_OF_PKT',
       'NUMBER_OF_BYTE', 'NODE_NAME_FROM', 'NODE_NAME_TO', 'PKT_IN', 'PKT_OUT',
       'PKT_R', 'PKT_DELAY_NODE', 'PKT_RATE', 'BYTE_RATE', 'PKT_AVG_SIZE',
       'UTILIZATION', 'PKT_DELAY', 'PKT_SEND_TIME', 'PKT_RESEVED_TIME',
       'FIRST_PKT_SENT', 'LAST_PKT_RESEVED', 'PKT_CLASS'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 28 columns):
SRC_ADD             20000 non-null float64
DES_ADD             20000 non-null float64
PKT_ID              20000 non-null int64
FROM_NODE           20000 non-null int64
TO_NODE             20000 non-null int64
PKT_TYPE            20000 non-null object
PKT_SIZE            20000 non-null int64
FLAGS               20000 non-null object
FID                 20000 non-null int64
SEQ_NUMBER          20000 non-null int64
NUMBER_OF_PKT       20000 

In [15]:
# One-hot encode the two categorical columns. The indicator columns are named
# after the category values themselves (e.g. 'tcp', 'ack', '-------').
dummy_pkt_type = pd.get_dummies(data['PKT_TYPE'])
dummy_flags = pd.get_dummies(data['FLAGS'])

# Columns to strip before appending the indicators:
#   * the node-name columns, which are not used as features;
#   * any indicator columns left over from a previous run of this cell, so that
#     re-running does not append duplicates (and does not fail on a missing
#     NODE_NAME_* column).
# NOTE: this assumes no original column shares a name with a category value.
stale_columns = [
    *dummy_pkt_type.columns,
    *dummy_flags.columns,
    "NODE_NAME_FROM",
    "NODE_NAME_TO",
]
data = pd.concat(
    [data.drop(columns=stale_columns, errors="ignore"), dummy_pkt_type, dummy_flags],
    axis=1,
)



Index(['SRC_ADD', 'DES_ADD', 'PKT_ID', 'FROM_NODE', 'TO_NODE', 'PKT_TYPE',
       'PKT_SIZE', 'FLAGS', 'FID', 'SEQ_NUMBER', 'NUMBER_OF_PKT',
       'NUMBER_OF_BYTE', 'PKT_IN', 'PKT_OUT', 'PKT_R', 'PKT_DELAY_NODE',
       'PKT_RATE', 'BYTE_RATE', 'PKT_AVG_SIZE', 'UTILIZATION', 'PKT_DELAY',
       'PKT_SEND_TIME', 'PKT_RESEVED_TIME', 'FIRST_PKT_SENT',
       'LAST_PKT_RESEVED', 'PKT_CLASS', 'ack', 'cbr', 'ping', 'tcp', '-------',
       '---A---'],
      dtype='object')

In [20]:
# Show the current column labels of `data` as the cell's output.
data.columns

Index(['SRC_ADD', 'DES_ADD', 'PKT_ID', 'FROM_NODE', 'TO_NODE', 'PKT_TYPE',
       'PKT_SIZE', 'FLAGS', 'FID', 'SEQ_NUMBER', 'NUMBER_OF_PKT',
       'NUMBER_OF_BYTE', 'PKT_IN', 'PKT_OUT', 'PKT_R', 'PKT_DELAY_NODE',
       'PKT_RATE', 'BYTE_RATE', 'PKT_AVG_SIZE', 'UTILIZATION', 'PKT_DELAY',
       'PKT_SEND_TIME', 'PKT_RESEVED_TIME', 'FIRST_PKT_SENT',
       'LAST_PKT_RESEVED', 'PKT_CLASS', 'ack', 'cbr', 'ping', 'tcp', '-------',
       '---A---'],
      dtype='object')

In [26]:
# Columns fed to the model: the original measurement columns followed by the
# one-hot indicator columns. PKT_TYPE, FLAGS and PKT_CLASS are not in the list.
base_columns = [
    'SRC_ADD', 'DES_ADD', 'PKT_ID', 'FROM_NODE', 'TO_NODE',
    'PKT_SIZE', 'FID', 'SEQ_NUMBER', 'NUMBER_OF_PKT', 'NUMBER_OF_BYTE',
    'PKT_IN', 'PKT_OUT', 'PKT_R', 'PKT_DELAY_NODE', 'PKT_RATE',
    'BYTE_RATE', 'PKT_AVG_SIZE', 'UTILIZATION', 'PKT_DELAY',
    'PKT_SEND_TIME', 'PKT_RESEVED_TIME', 'FIRST_PKT_SENT', 'LAST_PKT_RESEVED',
]
pkt_type_indicators = ['ack', 'cbr', 'ping', 'tcp']
flag_indicators = ['-------', '---A---']
features = base_columns + pkt_type_indicators + flag_indicators

# X: feature matrix as a NumPy array; Y: the PKT_CLASS column as a Series.
X = data[features].values
Y = data['PKT_CLASS']

# Sanity-check the dimensions (one shape per line).
print(X.shape, Y.shape, sep="\n")


(20000, 29)
(20000,)


In [35]:
# Standardise each feature column (centre on its mean, scale by its standard
# deviation). The scaler is fitted on all of X and kept in `scalar`.
scalar = StandardScaler(copy=True, with_mean=True, with_std=True)
standardised_X = scalar.fit_transform(X)

In [27]:
# Map the PKT_CLASS labels in Y to integer codes; the fitted encoder is kept in
# `encoder` so the codes can be translated back later.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

In [36]:
# L1-penalised linear SVM used as a feature selector. dual=False is passed
# together with penalty="l1"; C=0.01 sets the regularisation strength.
# The classifier is trained on the raw labels in Y (not encoded_Y).
lsvc = LinearSVC(penalty="l1", C=0.01, dual=False)
lsvc.fit(standardised_X, Y)

# prefit=True: reuse the already-fitted estimator instead of refitting it.
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(standardised_X)

# Shape of the reduced feature matrix (cell output).
X_new.shape

(20000, 25)

In [37]:
# Compare the feature-matrix dimensions before and after selection
# (one shape per line).
print(X.shape, X_new.shape, sep="\n")

(20000, 29)
(20000, 25)
