# Data Cleaning and Feature Selection

### Constants

In [110]:
# Total number of columns in `df`; the data-cleaning loop uses `DF_COL_COUNT - 2`
# to stop before the last two columns.
# NOTE(review): in the visible notebook `df` is only assigned in the
# "Reading Dataset" section below, so this cell raises NameError on a fresh
# kernel (Restart & Run All) — TODO move it after the read_csv cell.
DF_COL_COUNT = len(df.columns)

### Reading the Dataset & Dataset Description

In [111]:
import pandas as pd

In [112]:
# Load the interim CSV export into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='../data/interim/ton-iot-windows-10.csv')

In [113]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,ts,Processor_DPC_Rate,Processor_pct_ Idle_Time,Processor_pct_ C3_Time,Processor_pct_ Interrupt_Time,Processor_pct_ C2_Time,Processor_pct_ User_Time,Processor_pct_ C1_Time,Processor_pct_ Processor_Time,Processor_C1_ransitions_sec,...,LogicalDisk(_Total) Avg Disk Write Queue Length,LogicalDisk(_Total) Avg Disk Queue Length,LogicalDisk(_Total) pct_ Disk Read Time,LogicalDisk(_Total) Disk Write Bytes sec,LogicalDisk(_Total) Disk Transfers sec,LogicalDisk(_Total) Avg Disk Bytes Transfer,LogicalDisk(_Total) pct_ Disk Write Time,LogicalDisk(_Total) Avg Disk sec Transfer,label,type
0,1554206309,4,29.90817156,0,0.078240397,0,61.02750947,29.90817156,66.2001486,480.0949912,...,0.134876119,0.163098103,2.822198461,1924988.236,402.5827836,9100.481592,13.48761188,0.000405118,0,normal
1,1554206319,9,31.75168186,0,0.312520973,0,59.14459419,31.75168186,66.24773489,427.0412837,...,0.157564294,0.189279353,3.17150584,1497232.743,355.1343322,11153.55449,15.75642941,0.000532995,0,normal
2,1554206329,5,29.49516707,0,1.16822183,0,46.02794011,29.49516707,66.90038148,1159.426821,...,0.401773775,0.782471877,38.06981014,1854228.536,760.4317685,26994.26595,40.17737752,0.001028323,0,normal
3,1554206339,12,18.22437505,0,1.097191902,0,47.80621859,18.22437505,79.54520811,736.622011,...,0.259062331,0.768934004,50.98716725,15912390.29,699.4999538,53141.6638,25.90623311,0.001099429,0,normal
4,1554206349,12,14.86118688,0,1.562431019,0,41.8731513,14.86118688,82.73513724,896.1085985,...,0.103836186,1.081729712,97.7893526,23609299.7,1184.550425,40988.68444,10.38361856,0.000912738,0,normal


In [114]:
# Inspect the dtypes as loaded, before any cleaning is applied.
df.dtypes

ts                                               int64
Processor_DPC_Rate                               int64
Processor_pct_ Idle_Time                        object
Processor_pct_ C3_Time                          object
Processor_pct_ Interrupt_Time                   object
                                                 ...  
LogicalDisk(_Total) Avg  Disk Bytes Transfer    object
LogicalDisk(_Total) pct_ Disk Write Time        object
LogicalDisk(_Total) Avg  Disk sec Transfer      object
label                                            int64
type                                            object
Length: 127, dtype: object

### Data Cleaning

In [115]:
# Coerce every feature column (all but the trailing `label` and `type` columns)
# to a numeric dtype and replace cells that could not be parsed with the
# column mean. The column list is derived from `df` itself so this cell does
# not depend on a constant computed from an earlier state of the frame.
feature_columns = df.columns[:-2]

total_NaN_count = 0
for column in feature_columns:
    # errors="coerce" turns unparseable entries into NaN instead of raising.
    numeric_column = pd.to_numeric(df[column], errors="coerce")
    total_NaN_count += numeric_column.isna().sum()

    # Assign the filled Series back to the frame. The previous
    # `df[column].fillna(..., inplace=True)` form is chained in-place
    # assignment, which operates on a temporary object and is a no-op under
    # pandas Copy-on-Write (and emits a FutureWarning on pandas 2.2).
    df[column] = numeric_column.fillna(numeric_column.mean())
print("Total cells replaced: {}".format(total_NaN_count))


Total cells replaced: 3360


In [116]:
# Re-check the dtypes after the numeric coercion performed in the cleaning cell.
df.dtypes

ts                                                int64
Processor_DPC_Rate                                int64
Processor_pct_ Idle_Time                        float64
Processor_pct_ C3_Time                          float64
Processor_pct_ Interrupt_Time                   float64
                                                 ...   
LogicalDisk(_Total) Avg  Disk Bytes Transfer    float64
LogicalDisk(_Total) pct_ Disk Write Time        float64
LogicalDisk(_Total) Avg  Disk sec Transfer      float64
label                                             int64
type                                             object
Length: 127, dtype: object

### Feature Selection

In [117]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [118]:
# Feature matrix: every column except the last two (`label`, `type`).
# A relative slice replaces the hard-coded 125 so the split stays correct if
# the number of feature columns changes.
# NOTE(review): this keeps `ts` (the first column) as a candidate feature —
# confirm that a raw timestamp is meant to be scored.
X = df.iloc[:, :-2]
# Target: the second-to-last column (`label`).
Y = df.iloc[:, -2]

In [119]:
# Score every column of X against Y with the chi-squared statistic.
selector = SelectKBest(score_func=chi2, k=10)
fit = selector.fit(X, Y)

# One-column frames: feature names and their chi-squared scores, in X's column order.
df_columns = pd.DataFrame(X.columns)
df_scores = pd.DataFrame(fit.scores_)

# Place names and scores side by side, then label the two columns.
featureScores = pd.concat((df_columns, df_scores), axis="columns")
featureScores.columns = ["Features", "Score"]

# Show the 30 highest-scoring features.
featureScores.nlargest(n=30, columns="Score")


Unnamed: 0,Features,Score
29,Process_Virtual_Bytes,1.854902e+17
23,Process_Virtual_Bytes Peak,1.854822e+17
88,Memory System Driver Resident Bytes,1.424477e+16
93,Memory Free System Page Table Entries,2954608000000.0
67,Memory Free & Zero Page List Bytes,1606858000000.0
33,Process_Working Set,1290024000000.0
24,Process_Page_File Bytes Peak,1149956000000.0
71,Memory Commit Limit,928913400000.0
19,Process_Working_Set_Peak,392958200000.0
84,Memory Standby Cache Reserve Bytes,352559100000.0
