## ADaS: Final Feature Selection Process
### Import Dataset

The dataset is first imported in a raw format. Blank entries are treated as NaN; columns in which every value is NaN are dropped, and then any row that still contains a NaN is dropped. It is also possible to tweak this so that NaN values in a column are set to a placeholder value instead, but this requires context about that particular column.


In [15]:
# Import data csv file
import pandas as pd
import numpy as np
# Alternative cleaned datasets -- keep exactly one read_csv line active.
#df3 = pd.read_csv("../data/real-world/data-cleaning/cleaned_NB15_1_sub1.csv")
#df3 = pd.read_csv("../data/real-world/data-cleaning/cleaned_iot23_RW21.csv")
#df3 = pd.read_csv("../data/real-world/data-cleaning/cleaned_kdd_sample_1.csv")
#df3 = pd.read_csv("../data/real-world/data-cleaning/cleaned_nsl_kdd_sample_1.csv")
df3 = pd.read_csv("../data/real-world/data-cleaning/cleaned_UNSW_2018_IoT_Botnet_sample1")

print("# Starting Columns:", df3.shape[1])
original_columns = list(df3.columns)

# Cleaning pipeline, applied in this order:
#   1. empty strings become NaN,
#   2. columns in which EVERY value is NaN are removed,
#   3. rows in which ANY value is NaN are removed.
# (An earlier variant instead removed every column containing a NaN:
#  df3.dropna(axis=1).)
df3 = (
    df3
    .replace("", np.nan)
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='any')
)

# Columns present at load time but missing now, i.e. removed by step 2.
dropped_columns = list(set(original_columns) - set(df3.columns))
print("Columns with NaN values:", dropped_columns)
print("# Columns with Values:", df3.shape[1])
df3.head(10)



# Starting Columns: 46
Columns with NaN values: []
# Columns with Values: 46


Unnamed: 0,pkSeqID,stime,flgs,flgs_number,proto,proto_number,saddr,sport,daddr,dport,...,AR_P_Proto_P_DstIP,N_IN_Conn_P_DstIP,N_IN_Conn_P_SrcIP,AR_P_Proto_P_Sport,AR_P_Proto_P_Dport,Pkts_P_State_P_Protocol_P_DestIP,Pkts_P_State_P_Protocol_P_SrcIP,attack,category,subcategory
0,719065,1528085000.0,e,1,udp,3,192.168.100.149,18033,192.168.100.5,80,...,0.240143,100,100,0.240143,0.240143,600,600,1,DoS,UDP
1,62293,1528081000.0,e s,2,tcp,1,192.168.100.147,33006,192.168.100.7,80,...,0.156066,100,100,0.156055,0.156066,500,500,1,DoS,TCP
2,144098,1528081000.0,e s,2,tcp,1,192.168.100.147,54158,192.168.100.7,80,...,0.159741,100,100,0.159664,0.159741,400,400,1,DoS,TCP
3,656283,1528085000.0,e,1,udp,3,192.168.100.147,28798,192.168.100.7,80,...,0.34246,100,100,0.342431,0.34246,1000,1000,1,DoS,UDP
4,417010,1528081000.0,e s,2,tcp,1,192.168.100.147,27109,192.168.100.7,80,...,0.223143,50,50,0.223144,0.219285,250,250,1,DoS,TCP
5,79142,1528081000.0,e s,2,tcp,1,192.168.100.148,25568,192.168.100.6,80,...,0.195269,100,100,0.160319,0.195269,393,393,1,DoS,TCP
6,771149,1528085000.0,e,1,udp,3,192.168.100.150,41309,192.168.100.3,80,...,0.438314,68,68,0.438314,0.389678,748,748,1,DoS,UDP
7,563045,1528081000.0,e s,2,tcp,1,192.168.100.148,58359,192.168.100.6,80,...,0.093723,97,97,0.08864,0.096041,441,441,1,DoS,TCP
8,362921,1528081000.0,e s,2,tcp,1,192.168.100.148,1246,192.168.100.6,80,...,0.259086,68,68,0.205076,0.240533,179,179,1,DoS,TCP
9,817115,1528085000.0,e,1,udp,3,192.168.100.150,56657,192.168.100.3,80,...,0.454138,90,90,0.454138,0.445182,990,990,1,DoS,UDP


### Sampling (as needed)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Draws a stratified sample from df3, the frame loaded by the import cell.

# Define the sample size
desired_sample_size = 4000

if len(df3) <= desired_sample_size:
    # train_test_split cannot hand the whole frame to the test side (it needs
    # at least one row left over), so a frame that is already small enough is
    # used as-is instead of raising a ValueError.
    sample = df3.copy()
else:
    # Perform a stratified split to get a representative sample.
    # An integer test_size requests exactly that many rows; the previous
    # float fraction (desired / len) could round up to one row more.
    _, sample = train_test_split(
        df3,
        test_size=desired_sample_size,
        stratify=df3['attack'],  # keep the attack / non-attack proportions
        random_state=42
    )

# print(sample['attack'].value_counts(normalize=True))  # verify proportions
# print(sample.shape)  # (4000, n_columns), or the full frame if it was smaller
print(sample.head(10))
# sample.to_csv('../data/real-world/raw/UNSW_2018_IoT_Botnet_sample1', index=False) # output new CSV, rename!


        pkSeqID         stime flgs  flgs_number proto  proto_number  \
719064   719065  1.528085e+09    e            1   udp             3   
62292     62293  1.528081e+09  e s            2   tcp             1   
144097   144098  1.528081e+09  e s            2   tcp             1   
656282   656283  1.528085e+09    e            1   udp             3   
417009   417010  1.528081e+09  e s            2   tcp             1   
79141     79142  1.528081e+09  e s            2   tcp             1   
771148   771149  1.528085e+09    e            1   udp             3   
563044   563045  1.528081e+09  e s            2   tcp             1   
362920   362921  1.528081e+09  e s            2   tcp             1   
817114   817115  1.528085e+09    e            1   udp             3   

                  saddr  sport          daddr dport  ...  AR_P_Proto_P_DstIP  \
719064  192.168.100.149  18033  192.168.100.5    80  ...            0.240143   
62292   192.168.100.147  33006  192.168.100.7    80  ...  

### Feature Selection Functions
These functions set up the use of Principal Component Analysis (PCA) to extract features.

- **qual_to_quant(df, col)** converts a qualitative column *col* in *df* to quantitative values.
- **get_top_n_idx(n, arr)** returns an array with the indexes of the top *n* values in *arr*
- **modified_get_features(data, top_n)** is the final feature extraction method. It fits PCA on *data* and returns the fitted PCA object together with a numpy array of the selected *top_n* feature names, chosen by each feature's loadings weighted by the explained variance of the components.

In [16]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder


def qual_to_quant(df, col):
  """Convert the qualitative column `col` of `df` to quantitative values.

  The column is overwritten in place with the codes produced by a
  LabelEncoder fitted on that same column. Nothing is returned; the caller's
  `df` is modified.
  """
  encoder = LabelEncoder()
  codes = encoder.fit_transform(df[col])
  df[col] = codes

"Returns an array with the indexes of the top n values in arr"
def get_top_n_idx(n, arr):
  """Return the indexes of the `n` largest-magnitude values in `arr`.

  Parameters:
    n: how many indexes to return. Values larger than the length of `arr`
       are clamped to that length; values <= 0 yield an empty result.
    arr: 1-D array-like of numbers. Ranking uses absolute values.

  Returns:
    1-D numpy array of integer indexes into `arr`, ordered from the largest
    absolute value to the smallest.
  """
  magnitudes = np.abs(arr)

  # Clamp so argpartition never receives an out-of-bounds kth.
  n = min(int(n), magnitudes.shape[0])
  if n <= 0:
    # Without this guard, [-0:] would select the WHOLE array.
    return np.array([], dtype=np.intp)

  # argpartition finds the top-n set without a full sort, but in no
  # particular order ...
  top = np.argpartition(magnitudes, -n)[-n:]
  # ... so sort just those n entries, largest magnitude first.
  order = np.argsort(magnitudes[top], kind="stable")[::-1]
  return top[order]

def modified_get_features(data, top_n, exclude=None):
  """Select the `top_n` most important features of `data` using PCA.

  PCA: fitted on the standardized feature columns, keeping as many
  components as are needed to explain 95% of the variance
  (n_components=0.95).
  Feature selection: each feature's importance is the sum, over all kept
  components, of its absolute loading multiplied by that component's
  explained variance ratio. The `top_n` features with the highest
  importance are selected.

  Parameters:
    data: DataFrame holding the features plus any identifier/label columns.
          It is not modified. Every column that is not excluded must be
          numeric, because it is passed to StandardScaler.
    top_n: number of features to select; clamped to the number of available
           feature columns.
    exclude: column names to leave out of the analysis (unique IDs, target
             labels, ...). Defaults to the notebook-level list `MAIN`, which
             must then be defined before this function is called.

  Returns:
    (pca, unique_feats): the fitted PCA object and a numpy array with the
    selected feature names (alphabetically sorted by np.unique).
  """
  if exclude is None:
    exclude = MAIN  # notebook-level list of identifier / label columns

  # drop() returns a new frame, so the caller's data is left untouched.
  feat_options = data.drop(columns=exclude)

  # Standardize the data
  scaled_data = StandardScaler().fit_transform(feat_options)

  pca = PCA(n_components=0.95)
  pca.fit(scaled_data)

  # Weight each component's absolute loadings by its explained variance
  # ratio, then sum per feature. (The unweighted alternative would be
  # np.sum(np.abs(pca.components_), axis=0).)
  weighted_loadings = np.abs(pca.components_) * pca.explained_variance_ratio_.reshape(-1, 1)
  feature_importance = np.sum(weighted_loadings, axis=0)

  # Never ask for more features than exist, and treat a non-positive request
  # as "select nothing" rather than passing it on to the index helper.
  top_n = min(top_n, feat_options.shape[1])
  if top_n > 0:
    top = get_top_n_idx(top_n, feature_importance)
  else:
    top = np.array([], dtype=int)

  # Get the feature names for the most important features
  most_important_names = feat_options.columns[top]
  print("Most Important Features:", most_important_names.tolist())

  # De-duplicate (and sort) the selected names
  unique_feats = np.unique(most_important_names)
  print("Unique Features:", unique_feats.tolist())

  return pca, unique_feats


### Return Selected Features
First, the unique identifiers (ex. uids, extra label details) and the target label (ex. label, Class) for the dataset must be specified. This is to ensure these columns are not considered as features, and only necessary when working with labeled data.

Qualitative (object-typed) columns are then converted to quantitative codes. After conversion there is an optional check that drops a column if every row has the same value or if more than half of the rows have distinct values; this threshold can be adjusted.

Then, the features are selected, with the dataset and number of features specified.

In [None]:
# define unique identifiers & target label(s) for dataset
MAIN = ['attack', 'category', 'subcategory', 'pkSeqID']  # replace as necessary
# NB15 = Label | IOT = label, detailed-label, uid | KDD'99 = Label | NSL-KDD = Class_Bin, Class, Difficulty Level | Bot-IoT = 'attack',	'category', 'subcategory', 'pkSeqID'

# Encode on a copy so df3 keeps its raw values. Encoding df3 in place made
# this cell non-repeatable: on a second run no column is object-typed any
# more, the cardinality check below never fires, and columns that should be
# dropped are silently kept.
encoded_df3 = df3.copy()
num_entries = encoded_df3.shape[0]
drop = []
for col in encoded_df3.columns:
  if encoded_df3[col].dtype == 'O' and col not in MAIN:  # qualitative
    qual_to_quant(encoded_df3, col)
    n_unique = encoded_df3[col].nunique()
    # if more than 1/2 of data points have a unique label OR all have same label, drop the column. can change this!
    if n_unique > (num_entries / 2) or n_unique == 1:
      drop.append(col)
print(drop)
cleaned_df3 = encoded_df3.drop(columns=drop)
# MAIN[0] is the target label; it is encoded too, while the other MAIN
# columns are left as they are.
qual_to_quant(cleaned_df3, MAIN[0])
print("Final Columns:", str(cleaned_df3.shape[1]))
print(cleaned_df3.head())
#cleaned_df3.to_csv('../data/real-world/data-cleaning/cleaned_Bot-IoT_sample1.csv', index=False) # output new CSV, rename!

# Select the 10 most important features; MAIN columns are excluded inside.
pca, features = modified_get_features(cleaned_df3, 10)
features

[]
Final Columns: 46
   pkSeqID         stime  flgs  flgs_number  proto  proto_number  saddr  \
0   719065  1.528085e+09     0            1      1             3      2   
1    62293  1.528081e+09     2            2      0             1      0   
2   144098  1.528081e+09     2            2      0             1      0   
3   656283  1.528085e+09     0            1      1             3      0   
4   417010  1.528081e+09     2            2      0             1      0   

   sport  daddr  dport  ...  AR_P_Proto_P_DstIP  N_IN_Conn_P_DstIP  \
0  18033      3     80  ...            0.240143                100   
1  33006      5     80  ...            0.156066                100   
2  54158      5     80  ...            0.159741                100   
3  28798      5     80  ...            0.342460                100   
4  27109      5     80  ...            0.223143                 50   

   N_IN_Conn_P_SrcIP  AR_P_Proto_P_Sport  AR_P_Proto_P_Dport  \
0                100            0.240143   

array(['AR_P_Proto_P_DstIP', 'AR_P_Proto_P_Sport', 'AR_P_Proto_P_SrcIP',
       'flgs', 'ltime', 'mean', 'pkts', 'proto_number', 'rate', 'stime'],
      dtype=object)

In [None]:
# Show the distinct values of one column of the cleaned frame.
# NOTE(review): 'trans_depth' is not among the Bot-IoT columns visible in the
# outputs above; this cell may be left over from another dataset. Confirm the
# column exists in the currently loaded data, otherwise this raises KeyError.
set(cleaned_df3['trans_depth'])

{0, 1, 2}

In [60]:
# Number of principal components kept by the fitted PCA
# (components_ has one row per component).
pca.components_.shape[0]
# Optional diagnostic: cumulative explained variance across the components.
# cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# cumulative_variance

27

In [41]:
# Explained variance ratio of each component of the fitted PCA returned by
# modified_get_features.
pca.explained_variance_ratio_

array([0.14562332, 0.10364392, 0.09611259, 0.0753618 , 0.06893709,
       0.06201735, 0.05355498, 0.04326523, 0.03887541, 0.03474895,
       0.02723667, 0.0226414 , 0.02090036, 0.01754601, 0.01627307,
       0.01480839, 0.0141595 , 0.01313107, 0.01260091, 0.0113274 ,
       0.01030094, 0.00966422, 0.00954413, 0.00894161, 0.00864408,
       0.00761874, 0.00627568])