# Applying TabTranformers to OS fingerprinting task using nmap dataset

### Installing Python dependencies 

In [5]:
# !python -m pip install --upgrade pip 
# !pip install pandas numpy tensorflow matplotlib seaborn optuna scikit-learn tabtransformertf

In [6]:
import numpy as np
import pandas as pd

In [7]:
seed = 2024
np.random.seed(seed)

### Read dataset from disk

In [8]:
df = pd.read_csv("../dataset/dataset_no_encoded_4397.csv")

In [9]:
df.head()

Unnamed: 0,Class.vendor_0,Class.OSfamily_0,Class.OSgen_0,Class.device_0,SEQ.SP_0,SEQ.GCD_0,SEQ.ISR_0,SEQ.TI_0,SEQ.TI_1,SEQ.CI_0,...,U1.RIPCK_0,U1.RUCK_0,U1.RUCK_1,U1.RUD_0,IE.R_0,IE.DFI_0,IE.T_0,IE.TG_0,IE.CD_0,IE.CD_1
0,Linux,Linux,,WAP,255,3,261,I,-1,NO,...,NO,NO,-1,NO,NO,N,128,128,S,-1
1,Linux,Linux,2.4.X,specialized,207,5,207,Z,-1,Z,...,G,G,-1,G,NO,N,63,64,S,-1
2,Linux,Linux,2.6.X,specialized,0,64001,155,I,-1,I,...,G,G,-1,G,NO,S,67,64,Z,-1
3,Linux,Linux,2.6.X,specialized,1,128002,151,I,-1,I,...,G,G,-1,G,NO,S,65,64,Z,-1
4,Linux,Linux,2.6.X,specialized,5,192003,158,I,-1,I,...,G,G,-1,G,NO,S,61,64,Z,-1


In [10]:
df["Class.vendor_0"].value_counts()

Class.vendor_0
Linux        1510
Microsoft    1314
Apple         473
FreeBSD       421
Cisco         224
OpenBSD       180
Google        121
Sun           108
NetBSD         28
Oracle         18
Name: count, dtype: int64

In [11]:
df["Class.OSfamily_0"].value_counts()

Class.OSfamily_0
Linux      1510
Windows    1314
BSD         629
iOS         452
macOS       245
Solaris     126
Android     121
Name: count, dtype: int64

In [12]:
df["Class.OSgen_0"].value_counts()

Class.OSgen_0
2.6.X               906
3.X                 403
XP                  374
4.X                 277
2003                220
12.X                206
2.4.X               144
5.X                 144
7                   143
6.X                 119
2008                112
9.X                 104
2000                103
10                  101
7.X                  96
8.X                  89
98                   85
NT                   57
2012                 56
11.X                 54
10.7.X               49
2.X                  44
Vista                35
8                    33
10.4.X               31
1.X                  31
10.X                 30
10.6.X               28
10.5.X               26
15.X                 25
11                   23
8.1                  20
10.11.X              15
10.9.X               15
10.10.X              15
95                   12
10.8.X               12
9                    11
4.1.X                11
10.3.X               11
Me                    9
10

In [13]:
pair_counts = df.groupby(['Class.OSfamily_0', 'Class.OSgen_0']).size().reset_index(name='Count')
print(pair_counts)

   Class.OSfamily_0 Class.OSgen_0  Count
0           Android           1.X     10
1           Android           2.X     15
2           Android           3.X      1
3           Android         4.0.X      3
4           Android         4.1.X     11
..              ...           ...    ...
83            macOS        10.7.X     49
84            macOS        10.8.X     12
85            macOS        10.9.X     15
86            macOS           8.X     10
87            macOS           9.X      1

[88 rows x 3 columns]


In [14]:
df["Class.device_0"].value_counts()

Class.device_0
generalpurpose      3326
phone                308
mediadevice          137
WAP                  133
switch               113
storage-misc         107
router               102
broadbandrouter       46
firewall              41
specialized           30
webcam                20
PBX                    7
terminal               7
remotemanagement       5
VoIPphone              4
terminalserver         2
loadbalancer           2
proxyserver            2
security-misc          2
PDA                    1
telecom-misc           1
printer                1
Name: count, dtype: int64

In [15]:
pair_counts = df.groupby(['Class.OSfamily_0', 'Class.OSgen_0', "Class.device_0"]).size().reset_index(name='Count')
print(pair_counts)

    Class.OSfamily_0 Class.OSgen_0  Class.device_0  Count
0            Android           1.X           phone     10
1            Android           2.X           phone     15
2            Android           3.X           phone      1
3            Android         4.0.X           phone      3
4            Android         4.1.X       VoIPphone      1
..               ...           ...             ...    ...
167            macOS        10.7.X     mediadevice      8
168            macOS        10.8.X  generalpurpose     12
169            macOS        10.9.X  generalpurpose     15
170            macOS           8.X  generalpurpose     10
171            macOS           9.X  generalpurpose      1

[172 rows x 4 columns]


In [16]:
pair_counts = df.groupby(['Class.OSfamily_0', "Class.device_0"]).size().reset_index(name='Count')
print(pair_counts)

   Class.OSfamily_0    Class.device_0  Count
0           Android         VoIPphone      2
1           Android       mediadevice     11
2           Android             phone    108
3               BSD          firewall     10
4               BSD    generalpurpose    563
5               BSD      storage-misc     56
6             Linux               PBX      7
7             Linux               PDA      1
8             Linux         VoIPphone      2
9             Linux               WAP    110
10            Linux   broadbandrouter     43
11            Linux          firewall     31
12            Linux    generalpurpose   1097
13            Linux      loadbalancer      2
14            Linux       mediadevice     32
15            Linux             phone     61
16            Linux           printer      1
17            Linux       proxyserver      2
18            Linux  remotemanagement      5
19            Linux            router     14
20            Linux     security-misc      1
21        

In [17]:
df.pop('Class.vendor_0')
df.pop('Class.OSgen_0')
df.pop('Class.device_0')

df.reset_index(drop=True, inplace=True)

# header = names of columns
print(list(df.columns))

['Class.OSfamily_0', 'SEQ.SP_0', 'SEQ.GCD_0', 'SEQ.ISR_0', 'SEQ.TI_0', 'SEQ.TI_1', 'SEQ.CI_0', 'SEQ.CI_1', 'SEQ.II_0', 'SEQ.II_1', 'SEQ.SS_0', 'SEQ.TS_0', 'SEQ.TS_1', 'OPS.O1_0', 'OPS.O1_1', 'OPS.O1_2', 'OPS.O1_3', 'OPS.O1_4', 'OPS.O1_5', 'OPS.O1_6', 'OPS.O1_7', 'OPS.O1_8', 'OPS.O2_0', 'OPS.O2_1', 'OPS.O2_2', 'OPS.O2_3', 'OPS.O2_4', 'OPS.O2_5', 'OPS.O2_6', 'OPS.O2_7', 'OPS.O2_8', 'OPS.O3_0', 'OPS.O3_1', 'OPS.O3_2', 'OPS.O3_3', 'OPS.O3_4', 'OPS.O3_5', 'OPS.O3_6', 'OPS.O3_7', 'OPS.O3_8', 'OPS.O3_9', 'OPS.O3_10', 'OPS.O3_11', 'OPS.O4_0', 'OPS.O4_1', 'OPS.O4_2', 'OPS.O4_3', 'OPS.O4_4', 'OPS.O4_5', 'OPS.O4_6', 'OPS.O4_7', 'OPS.O4_8', 'OPS.O5_0', 'OPS.O5_1', 'OPS.O5_2', 'OPS.O5_3', 'OPS.O5_4', 'OPS.O5_5', 'OPS.O5_6', 'OPS.O5_7', 'OPS.O5_8', 'OPS.O6_0', 'OPS.O6_1', 'OPS.O6_2', 'OPS.O6_3', 'OPS.O6_4', 'OPS.O6_5', 'OPS.O6_6', 'WIN.W1_0', 'WIN.W2_0', 'WIN.W3_0', 'WIN.W4_0', 'WIN.W5_0', 'WIN.W6_0', 'ECN.R_0', 'ECN.DF_0', 'ECN.T_0', 'ECN.TG_0', 'ECN.W_0', 'ECN.O_0', 'ECN.O_1', 'ECN.O_2', 'ECN.O_3'

In [18]:
# no of features (X)
print("Nº features=", len(list(df.columns))-1)

Nº features= 259


In [19]:
# output name
OutVar = list(df.columns)[0]
print("Output=", OutVar)

Output= Class.OSfamily_0


### Checking data

In [20]:
def DataCheckings(df):
    # Check the number of data points in the data set
    print("\nData points =", len(df))
    
    # Check the number of columns in the data set
    print("\nColumns (output + features)=",len(df.columns))
    
    # Check the data types
    print("\nData types =", df.dtypes.unique())
    
    # Dataset statistics
    print('\n')
    df.describe()
    
    # print names of columns
    print('Column Names:\n', df.columns)
    
    # see if there are categorical data
    print("\nCategorical features:", df.select_dtypes(include=['O']).columns.tolist())
    
    # Check NA values
    # Check any number of columns with NaN
    print("\nColumns with NaN: ", df.isnull().any().sum(), ' / ', len(df.columns))

    # Check any number of data points with NaN
    print("\nNo of data points with NaN:", df.isnull().any(axis=1).sum(), ' / ', len(df))

In [21]:
DataCheckings(df)


Data points = 4397

Columns (output + features)= 260

Data types = [dtype('O') dtype('int64')]


Column Names:
 Index(['Class.OSfamily_0', 'SEQ.SP_0', 'SEQ.GCD_0', 'SEQ.ISR_0', 'SEQ.TI_0',
       'SEQ.TI_1', 'SEQ.CI_0', 'SEQ.CI_1', 'SEQ.II_0', 'SEQ.II_1',
       ...
       'U1.RIPCK_0', 'U1.RUCK_0', 'U1.RUCK_1', 'U1.RUD_0', 'IE.R_0',
       'IE.DFI_0', 'IE.T_0', 'IE.TG_0', 'IE.CD_0', 'IE.CD_1'],
      dtype='object', length=260)

Categorical features: ['Class.OSfamily_0', 'SEQ.TI_0', 'SEQ.CI_0', 'SEQ.II_0', 'SEQ.SS_0', 'SEQ.TS_0', 'OPS.O1_0', 'OPS.O1_1', 'OPS.O1_2', 'OPS.O1_3', 'OPS.O1_4', 'OPS.O1_5', 'OPS.O1_6', 'OPS.O1_7', 'OPS.O1_8', 'OPS.O2_0', 'OPS.O2_1', 'OPS.O2_2', 'OPS.O2_3', 'OPS.O2_4', 'OPS.O2_5', 'OPS.O2_6', 'OPS.O2_7', 'OPS.O2_8', 'OPS.O3_0', 'OPS.O3_1', 'OPS.O3_2', 'OPS.O3_3', 'OPS.O3_4', 'OPS.O3_5', 'OPS.O3_6', 'OPS.O3_7', 'OPS.O3_8', 'OPS.O3_9', 'OPS.O3_10', 'OPS.O3_11', 'OPS.O4_0', 'OPS.O4_1', 'OPS.O4_2', 'OPS.O4_3', 'OPS.O4_4', 'OPS.O4_5', 'OPS.O4_6', 'OPS.O4_7', 'OPS

In [22]:
print('Shape before removing duplicates=', df.shape)

Shape before removing duplicates= (4397, 260)


In [23]:
# remove duplicates!
df.drop_duplicates(keep=False, inplace=True)

In [24]:
print('Shape after removing duplicates=', df.shape)

Shape after removing duplicates= (4397, 260)


### Remove near zero variance features

In [25]:
# from sklearn.utils import class_weight
# from sklearn.feature_selection import VarianceThreshold

In [26]:
# def getDataFromDataFrame(df, OutVar):
#     # get X, Y data and column names from df
#     print('\n-> Get X & Y data, Features list')
#     print('Shape', df.shape)
    
#     # select X and Y
#     ds_y = df[OutVar]
#     ds_X = df.drop(OutVar,axis = 1)
#     Xdata = ds_X.values # get values of features
#     Ydata = ds_y.values # get output values

#     print('Shape X data:', Xdata.shape)
#     print('Shape Y data:', Ydata.shape)
    
#     # return data for X and Y, feature names as list
#     print('Done!')
#     return (Xdata, Ydata, list(ds_X.columns))

# def Remove0VarCols(df, OutVar):
#     Xdata, Ydata, Features = getDataFromDataFrame(df,OutVar=OutVar)# out var = Class 
#     print('\n-> Remove zero variance features')
#     # print('Initial features:', Features)
#     selector= VarianceThreshold()
#     Xdata = selector.fit_transform(Xdata)
#     # Selected features
#     SelFeatures = []
#     for i in selector.get_support(indices=True):
#         SelFeatures.append(Features[i])
#     print('Removed features:',list(set(Features) - set(SelFeatures)))
    
#     # create the resulted dataframe
#     df = pd.DataFrame(Xdata,columns=SelFeatures)
#     df[OutVar] = Ydata # add class column
#     # print('Final columns:', list(df.columns))
#     print('Done!')
#     return df

In [27]:
# df = Remove0VarCols(df, OutVar)

In [28]:
# # print dimension AFTER removing features
# print("Dataset dimension AFTER removing near zero variance features=",df.shape)

In [29]:
# df.columns

### Verify the classes ballance

In [30]:
df[OutVar].value_counts()

Class.OSfamily_0
Linux      1510
Windows    1314
BSD         629
iOS         452
macOS       245
Solaris     126
Android     121
Name: count, dtype: int64

In [31]:
df.replace(['BSD', 'iOS', 'macOS', 'Solaris', 'Android'], 'Other', inplace=True)
df.reset_index(drop=True, inplace=True)

In [32]:
df[OutVar].value_counts()

Class.OSfamily_0
Other      1573
Linux      1510
Windows    1314
Name: count, dtype: int64

### TabTransformers

#### Libraries Import

In [33]:
import tensorflow as tf
from keras.optimizers import AdamW
from keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import gc

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
import absl.logging
import warnings
import logging

from tabtransformertf.models.tabtransformer import TabTransformer
from tabtransformertf.utils.preprocessing import df_to_dataset, build_categorical_prep

logging.captureWarnings(True)
warnings.filterwarnings('ignore')
absl.logging.set_verbosity(absl.logging.ERROR)

#### Preprocessing

In [35]:
train_data, test_data = train_test_split(df, stratify=df[OutVar], test_size=0.20, random_state=seed)

In [36]:
train_data.shape, test_data.shape

((3517, 260), (880, 260))

In [37]:
LABEL = OutVar

NUMERIC_FEATURES = df.select_dtypes(include=['int64']).columns.tolist()
CATEGORICAL_FEATURES = df.select_dtypes(include=['object']).columns.tolist()
CATEGORICAL_FEATURES.remove(LABEL)

FEATURES = list(NUMERIC_FEATURES) + list(CATEGORICAL_FEATURES)

In [38]:
print(len(NUMERIC_FEATURES), len(CATEGORICAL_FEATURES), len(FEATURES))

53 206 259


#### Numeric Cleaning

In [39]:
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

numeric_pipe = Pipeline([
    ('impute', imputer),
    ('scale', scaler),
])

numeric_pipe.fit(train_data[NUMERIC_FEATURES])

In [40]:
train_data[NUMERIC_FEATURES] = numeric_pipe.transform(train_data[NUMERIC_FEATURES])
test_data[NUMERIC_FEATURES] = numeric_pipe.transform(test_data[NUMERIC_FEATURES])

#### Dtypes

In [41]:
train_data[CATEGORICAL_FEATURES] = train_data[CATEGORICAL_FEATURES].astype(str)
test_data[CATEGORICAL_FEATURES] = test_data[CATEGORICAL_FEATURES].astype(str)

train_data[NUMERIC_FEATURES] = train_data[NUMERIC_FEATURES].astype(int)
test_data[NUMERIC_FEATURES] = test_data[NUMERIC_FEATURES].astype(int)

#### Category Lookup Layers

In [42]:
category_prep_layers = build_categorical_prep(train_data, CATEGORICAL_FEATURES)
category_prep_layers

100%|██████████| 206/206 [00:00<00:00, 327.38it/s]


{'SEQ.TI_0': <StringLookup name=string_lookup, built=False>,
 'SEQ.CI_0': <StringLookup name=string_lookup_1, built=False>,
 'SEQ.II_0': <StringLookup name=string_lookup_2, built=False>,
 'SEQ.SS_0': <StringLookup name=string_lookup_3, built=False>,
 'SEQ.TS_0': <StringLookup name=string_lookup_4, built=False>,
 'OPS.O1_0': <StringLookup name=string_lookup_5, built=False>,
 'OPS.O1_1': <StringLookup name=string_lookup_6, built=False>,
 'OPS.O1_2': <StringLookup name=string_lookup_7, built=False>,
 'OPS.O1_3': <StringLookup name=string_lookup_8, built=False>,
 'OPS.O1_4': <StringLookup name=string_lookup_9, built=False>,
 'OPS.O1_5': <StringLookup name=string_lookup_10, built=False>,
 'OPS.O1_6': <StringLookup name=string_lookup_11, built=False>,
 'OPS.O1_7': <StringLookup name=string_lookup_12, built=False>,
 'OPS.O1_8': <StringLookup name=string_lookup_13, built=False>,
 'OPS.O2_0': <StringLookup name=string_lookup_14, built=False>,
 'OPS.O2_1': <StringLookup name=string_lookup_15, bu

#### To TF Dataset

In [43]:
train_dataset = df_to_dataset(train_data[FEATURES + [LABEL]], LABEL)
test_dataset = df_to_dataset(test_data[FEATURES], None, shuffle=False)

ValueError: Multi-dimensional indexing (e.g. `obj[:, None]`) is no longer supported. Convert to a numpy array before indexing instead.

#### Hyperparameter Tuning

In [None]:
def objective(trial):
    embedding_dim = trial.suggest_categorical('embedding_dim',[8, 16, 32, 64])
    depth = trial.suggest_int('depth',1,6,1)
    heads = trial.suggest_int('heads',2,8,1)
    attn_dropout = trial.suggest_float("attn_dropout", 0.05, 0.5)
    ff_dropout = trial.suggest_float("ff_dropout", 0.05, 0.5)
    mlp_hidden_factor1 = trial.suggest_int("mlp_hidden_factor1", 1, 3, 0.5)
    mlp_hidden_factor2 = trial.suggest_int("mlp_hidden_factor2", 1, 3, 0.5)
    use_column_embedding = trial.suggest_categorical('use_column_embedding', [True, False])
    
    category_prep_layers = build_categorical_prep(train_data, CATEGORICAL_FEATURES)
    
    tabtransformer = TabTransformer(
        numerical_features = NUMERIC_FEATURES,
        categorical_features = CATEGORICAL_FEATURES,
        categorical_lookup=category_prep_layers,
        numerical_discretisers=None, # simply passing the numeric features
        embedding_dim=embedding_dim,
        out_dim=1,
        out_activation='sigmoid',
        depth=depth,
        heads=heads,
        attn_dropout=attn_dropout,
        ff_dropout=ff_dropout,
        mlp_hidden_factors=[mlp_hidden_factor1, mlp_hidden_factor2],
        use_column_embedding=use_column_embedding,
    )
    
    LEARNING_RATE = 0.001
    WEIGHT_DECAY = 0.0001
    NUM_EPOCHS = 1000

    optimizer = AdamW(
            learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
        )

    tabtransformer.compile(
        optimizer = optimizer,
        loss = tf.keras.losses.BinaryCrossentropy(),
        metrics= [tf.keras.metrics.AUC(name="AUC", curve='ROC')],
    )
    
    early = EarlyStopping(monitor="val_loss", mode="min", patience=20, restore_best_weights=True)
    callback_list = [early]

    history = tabtransformer.fit(
        train_dataset, 
        epochs=NUM_EPOCHS, 
        validation_data=test_dataset,
        callbacks=callback_list,
        verbose=0
    )
    
    val_preds = tabtransformer.predict(test_dataset)
    roc = roc_auc_score(test_dataset[LABEL], val_preds.ravel())
    
    gc.collect()
    
    return roc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

#### Training

In [None]:
tabtransformer = TabTransformer(
        numerical_features = NUMERIC_FEATURES,
        categorical_features = CATEGORICAL_FEATURES,
        categorical_lookup=category_prep_layers,
        numerical_discretisers=None, # simply passing the numeric features
        embedding_dim=32,
        out_dim=1,
        out_activation='sigmoid',
        depth=6,
        heads=5,
        attn_dropout=0.087687,
        ff_dropout=0.429539,
        mlp_hidden_factors=[1, 1],
        use_column_embedding=False,
    )

LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 1000

optimizer = tfa.optimizers.AdamW(
        learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )

tabtransformer.compile(
    optimizer = optimizer,
    loss = tf.keras.losses.BinaryCrossentropy(),
    metrics= [tf.keras.metrics.AUC(name="AUC", curve='ROC')],
)

early = EarlyStopping(monitor="val_loss", mode="min", patience=20, restore_best_weights=True)
callback_list = [early]

history = tabtransformer.fit(
    train_dataset, 
    epochs=NUM_EPOCHS, 
    validation_data=val_dataset,
    callbacks=callback_list,
    verbose=1,
)

### Classic ML

In [None]:
import time
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [None]:
def set_weights(y_data, option='balanced'):
    """Estimate class weights for umbalanced dataset
       If ‘balanced’, class weights will be given by n_samples / (n_classes * np.bincount(y)). 
       If a dictionary is given, keys are classes and values are corresponding class weights. 
       If None is given, the class weights will be uniform """
    cw = class_weight.compute_class_weight(option, np.unique(y_data), y_data)
    w = {i:j for i,j in zip(np.unique(y_data), cw)}
    return w

In [None]:
class_weights = set_weights(Ydata)

In [None]:
print("Classes=",class_weights)

In [None]:
# define a list of classifiers to train as baseline classifiers
classifiers = [
    GaussianNB(),
    LinearDiscriminantAnalysis(), # No random_state
    LogisticRegression(n_jobs=-1,solver='lbfgs',random_state=seed,class_weight=class_weights),
    MLPClassifier(hidden_layer_sizes= (30), random_state = seed, shuffle=False, solver='adam',activation='relu',batch_size=500, max_iter=5000),
    DecisionTreeClassifier(random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_jobs=-1,random_state=seed,class_weight=class_weights),
    BaggingClassifier(n_jobs=-1,random_state=seed),
]

In [None]:
# training and metrics (ACC, precision, recall, f1score) for a classifier
def ML_baseline(cls, X_tr, y_tr, X_ts, y_ts, seed=42, classes=['0','1']):
    ACC = 0
    AUROC = 0
    precision = 0 
    recall = 0
    f1score = 0
    
    cls_name = type(cls).__name__
    
    start_time = time.time()
    cls.fit(X_tr, y_tr) # TRAINING!
    print('\n---->', "training: %0.2f mins \n\n" % ((time.time() - start_time)/60))
    
    # predictions
    y_pred  = cls.predict(X_ts)             # predict classes
    y_probs = cls.predict_proba(X_ts)[:, 1] # predict probabilities of classes
    cls_rep = classification_report(y_ts, y_pred, target_names=classes,
                                    output_dict=True, digits=3)
    # print classification report
    #print(cls_rep)
    
    ACC       = accuracy_score(y_ts, y_pred)
    #AUROC     = roc_auc_score(y_ts, y_probs) # this is working for 2-classes classification only!!!
    precision = cls_rep['weighted avg']['precision']
    recall    = cls_rep['weighted avg']['recall']
    f1score   = cls_rep['weighted avg']['f1-score']  
    
    # print metrics
    print("\n", "ACC=", ACC, "precision=", precision, "recall=", recall, "f1score=",f1score)
    
    return cls, ACC, precision, recall, f1score

In [None]:
# create a dataframe for ML baseline
df_ML = pd.DataFrame(columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])

for cls in classifiers:
    print("\n**********************************\n", cls_name = type(cls).__name__)
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train, y_train, X_test, y_test, seed=seed,classes=['1','2','3','4','5','6','7'])
    df_ML = df_ML.append({'Method': str(type(cls).__name__),
                          'ACC': float(ACC),
                          #'AUROC': float(AUROC),
                          'precision': float(precision),
                          'recall': float(recall),
                          'f1-score': float(f1score)}, ignore_index=True)

df_ML

In [None]:
df_ML.to_csv('ML_results.csv', index=False) # write to file the results

### Try a better classifier for the best ML method

We are using the best methods from baseline to find better hyperparameters for a better model.

In [None]:
# out best model was RF:
cls=RandomForestClassifier(n_jobs=-1,random_state=seed,class_weight=class_weights)

In [None]:
# check all the parameters
cls.get_params()

In [None]:
# define a list of classifiers to train with different params
classifiers = [
    RandomForestClassifier(n_estimators=10, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=20, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=50, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=100, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=200, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=300, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=500, n_jobs=-1,random_state=seed,class_weight=class_weights),
]

In [None]:
# create a dataframe for ML baseline
df_ML2 = pd.DataFrame(columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])
df_ML2

for cls in classifiers:
    print("\n**********************************\n", cls)
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train, y_train, X_test, y_test, seed=seed,classes=['Android', 'BSD', 'Linux', 'Solaris', 'Windows', 'iOS', 'macOS'])
    df_ML2 = df_ML2.append({'Method': str(type(cls).__name__)+'-NoTrees='+str(cls.get_params()['n_estimators']),
                            'ACC': float(ACC),
                            #'AUROC': float(AUROC),
                            'precision': float(precision),
                            'recall': float(recall),
                            'f1-score': float(f1score)}, ignore_index=True)


In [None]:
df_ML2

In [None]:
df_ML2.to_csv('ML_results_best1.csv', index=False)

In [None]:
# list(cls.classes_)

### Grid search - search for the best params

In [None]:
paramsx = {'bootstrap': [True, False],
           'max_depth': [10, 20, 30, 40, 50, None],
           'max_features': ['auto', 'sqrt'],
           'min_samples_leaf': [1, 2, 4],
           'min_samples_split': [2, 5, 10],
           'n_estimators': [50]
          }

In [None]:
forest= RandomForestClassifier(random_state=seed,class_weight=class_weights)

In [None]:
gridF = GridSearchCV(forest, paramsx, cv = 3, verbose = 2, n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

In [None]:
bestF.best_params_ # params of the best model

In [None]:
from sklearn import metrics

def evaluate(model, test_features, test_labels):
    y_pred = model.predict(test_features)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print (accuracy)
    print(confusion_matrix(y_test,y_pred))

In [None]:
best_grid = bestF.best_estimator_ # the best model from grid search

evaluate(best_grid,X_test,y_test)

### Feature importance

In [None]:
# calculate ACC
y_pred=clf.predict(X_test)
print(list(clf.classes_))
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

feature_imp = pd.Series(clf.feature_importances_,index=df.columns[:-1]).sort_values(ascending=False)
feature_imp[:30]