# Prediction of OS type using Machine Learning based on OS fingerprints

In [None]:
import numpy as np
import pandas as pd
import p0f_db_parser as parser

In [None]:
seed = 2022
np.random.seed(seed)

### Read dataset from disk

In [None]:
#df = pd.read_csv('./dataset.csv')

### Parse database and import dataset

In [None]:
dataset,column_names = parser.parse_database("p0f.fp")
df = pd.DataFrame(dataset,columns=column_names)
del dataset
del column_names

### Explore dataset

In [None]:
df.head()

In [None]:
# header = names of columns
print(list(df.columns))

In [None]:
# no of features (X)
print("Number of features = ", len(list(df.columns))-1)

In [None]:
# output name
print("Output=", list(df.columns)[1])

In [None]:
# define output variable name
# NOTE(review): picks the label column by position (second column of the
# parsed frame) — confirm this is the OS column. OutVar is reassigned by name
# after the encoding step further down, so this value is only used until then.
OutVar = list(df.columns)[1]
print(OutVar)

### Checking data

In [None]:
def DataCheckings(df):
    """Print a quick diagnostic report for a DataFrame.

    The report covers: number of rows and columns, the distinct dtypes,
    the value counts of every column, summary statistics, the column names,
    the object-typed (categorical) columns and how many columns / rows
    contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Check the number of data points in the data set
    print("\nData points =", len(df))

    # Check the number of columns in the data set
    print("\nColumns (output + features)=", len(df.columns))

    # Check the data types
    print("\nData types =", df.dtypes.unique())

    # List of values per column
    print()
    for column in df.columns:
        # f-string so that non-string column labels do not raise TypeError
        print(f"{column} -> ")
        print(df[column].value_counts())
        print()

    # Dataset statistics
    print('\n')
    # describe() only returns the summary; inside a function it must be
    # printed explicitly or the result is silently discarded.
    print(df.describe())

    # print names of columns
    print('Column Names:\n', df.columns)

    # see if there are categorical data
    print("\nCategorical features:", df.select_dtypes(include=['O']).columns.tolist())

    # Check NA values
    # Check any number of columns with NaN
    print("\nColumns with NaN: ", df.isnull().any().sum(), ' / ', len(df.columns))

    # Check any number of data points with NaN
    print("\nNumber of data points with NaN:", df.isnull().any(axis=1).sum(), ' / ', len(df))

In [None]:
DataCheckings(df)

In [None]:
print('Shape before removing duplicates = ', df.shape)

In [None]:
# Remove duplicates!
# NOTE(review): keep=False discards *every* row that has a duplicate,
# including its first occurrence, so repeated fingerprints vanish entirely.
# Use keep='first' if one copy of each row should survive — confirm intent.

df.drop_duplicates(keep=False, inplace=True)
print('Shape after removing duplicates=', df.shape)

### Encoding of the dataset

In [None]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Filter OSes
# Keep only the OS families to classify. The filter, index reset and column
# drop are chained without inplace=True so that no in-place mutation is done
# on a boolean-filtered slice (which can raise SettingWithCopyWarning).
target_oses = ['Linux', 'Windows', 'Mac OS X', 'Solaris', 'OpenBSD', 'FreeBSD']
df = df[df.os.isin(target_oses)].reset_index(drop=True)

# Drop version column
df = df.drop('version', axis=1)

# Drop signature direction column
# df.drop('sig_direction', inplace=True, axis=1)

In [None]:
# Data Augmentation
# TTL, MSS & Windows size
# For every existing row, add `ttl_factor` copies whose 'initial_ttl' is
# decremented by 1, 2, ..., ttl_factor. All other columns are left untouched.

ttl_factor = 10

array = df.to_numpy()
ttl_i = df.columns.get_loc('initial_ttl')

# Collect the new rows in a list and stack once at the end: calling
# np.vstack inside the loop re-copies the whole (growing) array for every
# single new row, which is quadratic in the dataset size.
augmented_rows = [array]
for row in array:
    for i in range(1, ttl_factor + 1):
        new_row = row.copy()
        new_row[ttl_i] = row[ttl_i] - i
        augmented_rows.append(new_row)

array = np.vstack(augmented_rows)

df = pd.DataFrame(array, columns=df.columns)

In [None]:
# TTL
# Numeric value (¿Normalization?)

In [None]:
# MSS
# Categorical encoding
# One-hot encoder for the 'mss' column; the '*' category is dropped and
# categories unseen during fit are ignored at transform time.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4 — switch the keyword when upgrading scikit-learn.

encoder_mss = OneHotEncoder(drop=['*'], sparse=False, handle_unknown='ignore')

In [None]:
# Window Size
# ??????????

df.drop('window_size', inplace=True, axis=1)

In [None]:
# Window Scaling 
# Categorical encoding
# One-hot encoder for the 'window_scaling' column; the '*' category is
# dropped and categories unseen during fit are ignored at transform time.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4 — switch the keyword when upgrading scikit-learn.

encoder_window_scaling = OneHotEncoder(drop=['*'], sparse=False, handle_unknown='ignore')

# encoder_window_scaling.fit(df[['window_scaling']])

In [None]:
# TCP Options
# Custom transformer

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class TCPOptionsTransformer(BaseEstimator, TransformerMixin):
    """Split a comma-separated string column into fixed positional columns.

    ``fit`` scans the column ``feature_name`` to learn the largest number of
    comma-separated items in any row (``max_options``) and the distinct items
    in first-seen order (``classes``). ``transform`` replaces that column by
    ``max_options`` new columns named ``<feature_name>0``,
    ``<feature_name>1``, ...; rows with fewer items are padded with ``'*'``.

    Parameters
    ----------
    feature_name : str
        Name of the DataFrame column holding the comma-separated strings.

    Attributes
    ----------
    max_options : int
        Largest number of items seen in a single row during ``fit``.
    classes : list of str
        Distinct items seen during ``fit``, in order of first appearance.
    headers : dict
        Maps each generated column name to the padding value ``'*'``.
    """

    def __init__(self, feature_name):
        self.feature_name = feature_name
        self.max_options = 0
        self.classes = []
        self.headers = {}

    def fit(self, X, y=None):
        """Learn ``max_options``, ``classes`` and ``headers`` from ``X``.

        ``y`` is accepted (and ignored) so the transformer can be used where
        scikit-learn passes a target to ``fit``. Learned state is rebuilt
        from scratch on every call instead of accumulating across fits.
        """
        max_options = 0
        # A dict keeps insertion order and gives O(1) membership tests,
        # unlike the repeated `v not in list` scan.
        seen = {}
        for row in X[self.feature_name]:
            values = row.split(',')
            max_options = max(max_options, len(values))
            for v in values:
                seen.setdefault(v, None)

        self.max_options = max_options
        self.classes = list(seen)
        self.headers = {self.feature_name + str(i): '*' for i in range(self.max_options)}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the feature column expanded.

        The new columns are built as whole lists aligned by position, so the
        result is correct for any index (filtered, shuffled or duplicated
        labels), not only for a default 0..n-1 RangeIndex. Items beyond
        ``max_options`` are discarded.
        """
        split_rows = [row.split(',') for row in X[self.feature_name]]

        # drop() returns a new frame, so the caller's data is not modified.
        X_2 = X.drop(self.feature_name, axis=1)
        for i, header in enumerate(self.headers):
            X_2[header] = [
                values[i] if i < len(values) else '*'
                for values in split_rows
            ]
        return X_2

    def get_feature_names_out(self, names=None):
        """Return the generated column names; ``names`` is ignored."""
        return [self.feature_name + str(i) for i in range(self.max_options)]

encoder_tcp_options = TCPOptionsTransformer('tcp_options')
df = encoder_tcp_options.fit_transform(df)

encoder2_tcp_options = OneHotEncoder(categories=[encoder_tcp_options.classes]*encoder_tcp_options.max_options,sparse=False,handle_unknown='ignore')

# encoder2_tcp_options.fit(df[list(encoder_tcp_options.headers.keys())])
# encoder2_tcp_options.transform(df[list(encoder_tcp_options.headers.keys())])

In [None]:
# Quirks
# Categorical encoding (already encoded)

In [None]:
# Apply encodings

from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Make sure the frame has a clean 0..n-1 index before encoding.
df.reset_index(inplace=True, drop=True) 

# One column transformer applying each encoder to its own column(s):
#   - 'mss' and 'window_scaling' get their one-hot encoders,
#   - every column whose name matches 'tcp_options' gets the TCP-options
#     one-hot encoder,
#   - all remaining columns are passed through unchanged.
encoders = make_column_transformer(
    (encoder_mss, ['mss']),
    (encoder_window_scaling, ['window_scaling']),
    (encoder2_tcp_options, make_column_selector(pattern='tcp_options')),
    remainder='passthrough',
    verbose_feature_names_out=False,
    n_jobs=1)

# Rebuild a DataFrame from the encoded array, using the feature names
# reported by the column transformer as headers.
transformed = encoders.fit_transform(df)
transformed_df = pd.DataFrame(
    transformed,
    columns=encoders.get_feature_names_out()
)

df = transformed_df

# define output variable name
# (the name of the 'os' column, i.e. the string 'os')
OutVar = df.os.name

In [None]:
df.head()

# Keep only the rows whose signature direction is 'request', then drop the
# now-constant column. Chained without inplace=True so that no in-place drop
# is performed on a boolean-filtered slice (SettingWithCopyWarning hazard).
df = df[df.sig_direction.isin(['request'])].drop('sig_direction', axis=1)

### Remove near zero variance features

In [None]:
from sklearn.utils import class_weight
from sklearn.feature_selection import VarianceThreshold

In [None]:
def getDataFromDataFrame(df, OutVar):
    """Split a DataFrame into feature values, target values and feature names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features plus the target column.
    OutVar : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(Xdata, Ydata, features)``: the values of every column except
        ``OutVar``, the values of ``OutVar``, and the list of feature
        column names (same order as the columns of ``Xdata``).
    """
    print('\n-> Get X & Y data, Features list')
    print('Shape', df.shape)

    # Target first, then everything else as the feature block.
    target_values = df[OutVar].values
    feature_frame = df.drop(OutVar, axis=1)
    feature_values = feature_frame.values
    feature_names = list(feature_frame.columns)

    print('Shape X data:', feature_values.shape)
    print('Shape Y data:', target_values.shape)

    print('Done!')
    return (feature_values, target_values, feature_names)

def Remove0VarCols(df, OutVar):
    """Return a new frame without the features rejected by VarianceThreshold.

    The features are extracted with ``getDataFromDataFrame``, filtered with
    a default-constructed ``VarianceThreshold`` and reassembled into a new
    DataFrame whose last column is the target ``OutVar``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features plus the target column.
    OutVar : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        The retained feature columns followed by the ``OutVar`` column.
    """
    Xdata, Ydata, Features = getDataFromDataFrame(df, OutVar=OutVar)
    print('\n-> Remove zero variance features')

    selector = VarianceThreshold()
    reduced = selector.fit_transform(Xdata)

    # Names of the columns that survived the filter, in their original order.
    SelFeatures = [Features[i] for i in selector.get_support(indices=True)]
    print('Removed features:', list(set(Features) - set(SelFeatures)))

    # Rebuild the frame and put the target back as the last column.
    result = pd.DataFrame(reduced, columns=SelFeatures)
    result[OutVar] = Ydata
    print('Done!')
    return result

In [None]:
# df = Remove0VarCols(df, OutVar)

In [None]:
# print dimension AFTER removing features
print("Dataset dimension AFTER removing near zero variance features=",df.shape)

In [None]:
df.columns

### Verify the class balance

In [None]:
df[OutVar].value_counts()

### Get data as arrays

In [None]:
# select X and Y
Ydata = df[OutVar].values                  # get output values
Xdata = df.drop(OutVar,axis = 1).values    # get values of features

print('Shape X data:', Xdata.shape)
print('Shape Y data:',Ydata.shape)

In [None]:
Ydata

### Data split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata,
                                                    stratify=Ydata, 
                                                    test_size=0.10,
                                                    random_state=seed)

In [None]:
# verify dimensions of data for training and test
print('Shape X_train:', X_train.shape)
print('Shape X_test:' , X_test.shape)
print('Shape y_train:', y_train.shape)
print('Shape y_test:' , y_test.shape)

### ML

In [None]:
import time
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [None]:
def set_weights(y_data, option='balanced'):
    """Estimate class weights for an unbalanced dataset.

    If 'balanced', class weights will be given by
    n_samples / (n_classes * np.bincount(y)).
    If a dictionary is given, keys are classes and values are the
    corresponding class weights.
    If None is given, the class weights will be uniform.

    Returns a dict mapping each distinct label in ``y_data`` to its weight.
    """
    labels = np.unique(y_data)
    weights = class_weight.compute_class_weight(class_weight=option, classes=labels, y=y_data)
    return dict(zip(labels, weights))

In [None]:
class_weights = set_weights(Ydata)

In [None]:
print("Classes=",class_weights)

In [None]:
# define a list of classifiers to train as baseline classifiers
classifiers = [
    GaussianNB(),
    LinearDiscriminantAnalysis(), # No random_state
    LogisticRegression(n_jobs=-1,solver='lbfgs',random_state=seed,class_weight=class_weights),
    # hidden_layer_sizes is a tuple of layer widths: (30,) is one hidden layer
    # of 30 units. The previous `(30)` was just the integer 30 in parentheses.
    MLPClassifier(hidden_layer_sizes=(30,), random_state = seed, shuffle=False, solver='adam',activation='relu',batch_size=500, max_iter=5000),
    DecisionTreeClassifier(random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_jobs=-1,random_state=seed,class_weight=class_weights),
    # BaggingClassifier(n_jobs=-1,random_state=seed)
]

In [None]:
# training and metrics (ACC, precision, recall, f1score) for a classifier
def ML_baseline(cls, X_tr, y_tr, X_ts, y_ts, seed=42, classes=('0', '1')):
    """Fit a classifier and report its weighted metrics on a test set.

    Parameters
    ----------
    cls : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    X_tr, y_tr : array-like
        Training features and labels.
    X_ts, y_ts : array-like
        Test features and labels.
    seed : int, optional
        Unused; kept so existing callers that pass it keep working.
    classes : sequence of str, optional
        Display names handed to ``classification_report`` as
        ``target_names``.

    Returns
    -------
    tuple
        ``(cls, ACC, precision, recall, f1score)`` where the last three are
        the 'weighted avg' entries of the classification report.
    """
    start_time = time.time()
    cls.fit(X_tr, y_tr) # TRAINING!
    print('\n---->', "training: %0.2f mins \n\n" % ((time.time() - start_time)/60))

    # Only class predictions are needed. The former
    # `cls.predict_proba(X_ts)[:, 1]` call was never used (AUROC is disabled)
    # and raises for estimators that do not implement predict_proba.
    y_pred = cls.predict(X_ts)
    cls_rep = classification_report(y_ts, y_pred, target_names=classes,
                                    output_dict=True, digits=3)

    ACC = accuracy_score(y_ts, y_pred)
    #AUROC     = roc_auc_score(y_ts, y_probs) # this is working for 2-classes classification only!!!
    weighted = cls_rep['weighted avg']
    precision = weighted['precision']
    recall = weighted['recall']
    f1score = weighted['f1-score']

    # print metrics
    print("\n", "ACC=", ACC, "precision=", precision, "recall=", recall, "f1score=",f1score)

    return cls, ACC, precision, recall, f1score

In [None]:
# create a dataframe for ML baseline
classes_names = np.unique(Ydata)

# Collect one result row per classifier and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
ml_rows = []
for cls in classifiers:
    print("\n**********************************"+type(cls).__name__+"**********************************")
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train, y_train, X_test, y_test, seed=seed,classes=classes_names)
    ml_rows.append({'Method': str(type(cls).__name__),
                    'ACC': float(ACC),
                    #'AUROC': float(AUROC),
                    'precision': float(precision),
                    'recall': float(recall),
                    'f1-score': float(f1score)})

df_ML = pd.DataFrame(ml_rows, columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])

df_ML

In [None]:
df_ML.to_csv('ML_results.csv', index=False) # write to file the results

### Try a better classifier for the best ML method

We are using the best methods from baseline to find better hyperparameters for a better model.

In [None]:
# our best model was RF:
cls=RandomForestClassifier(n_jobs=-1,random_state=seed,class_weight=class_weights)

In [None]:
# check all the parameters
cls.get_params()

In [None]:
# define a list of classifiers to train with different params
classifiers = [
    RandomForestClassifier(n_estimators=10, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=20, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=50, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=100, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=200, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=300, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=500, n_jobs=-1,random_state=seed,class_weight=class_weights),
]

In [None]:
# create a dataframe for the random-forest variants
# Collect one result row per forest and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
ml2_rows = []
for cls in classifiers:
    print("\n**********************************\n", cls)
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train, y_train, X_test, y_test, seed=seed,classes=classes_names)
    ml2_rows.append({'Method': str(type(cls).__name__)+'-NoTrees='+str(cls.get_params()['n_estimators']),
                     'ACC': float(ACC),
                     #'AUROC': float(AUROC),
                     'precision': float(precision),
                     'recall': float(recall),
                     'f1-score': float(f1score)})

df_ML2 = pd.DataFrame(ml2_rows, columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])


In [None]:
df_ML2

In [None]:
df_ML2.to_csv('ML_results_best1.csv', index=False)

In [None]:
# list(cls.classes_)

### Grid search - search for the best params

In [None]:
# Hyper-parameter grid for the random forest.
# 'max_features' lists only 'sqrt': for RandomForestClassifier 'auto' was an
# alias of 'sqrt' (and was removed in scikit-learn 1.3), so listing both only
# fitted every candidate twice.
paramsx = {'bootstrap': [True, False],
           'max_depth': [10, 20, 30, 40, 50, None],
           'max_features': ['sqrt'],
           'min_samples_leaf': [1, 2, 4],
           'min_samples_split': [2, 5, 10],
           'n_estimators': [50]
          }

In [None]:
forest= RandomForestClassifier(random_state=seed,class_weight=class_weights)

In [None]:
gridF = GridSearchCV(forest, paramsx, cv = 3, verbose = 2, n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

In [None]:
bestF.best_params_ # params of the best model

In [None]:
from sklearn import metrics

def evaluate(model, test_features, test_labels):
    """Print the accuracy and confusion matrix of ``model`` on a test set.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    test_features : array-like
        Features to predict on.
    test_labels : array-like
        True labels matching ``test_features``.
    """
    y_pred = model.predict(test_features)
    # Score against the labels that were passed in; the previous version read
    # the global y_test and silently ignored the test_labels argument.
    accuracy = metrics.accuracy_score(test_labels, y_pred)
    print (accuracy)
    print(confusion_matrix(test_labels, y_pred))
In [None]:
best_grid = bestF.best_estimator_ # the best model from grid search

evaluate(best_grid,X_test,y_test)

Classes= {'Android': 14.665116279069768, 'BSD': 2.649950973525704, 'Linux': 0.2654263826921458, 'Solaris': 7.6808769792935445, 'Windows': 0.5636396138720057, 'iOS': 2.195682451253482, 'macOS': 2.3413366336633663}

Linux      142548
Windows     67128
iOS         17232
macOS       16160
BSD         14278
Solaris      4926
Android      2580

### Feature importance

In [None]:
# calculate ACC
# `clf` was never defined in this notebook; use the tuned forest selected by
# the grid search above.
clf = best_grid
y_pred=clf.predict(X_test)
print(list(clf.classes_))
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Label the importances with exactly the columns that were fed to the model
# (every column except the target). `df.columns[:-1]` is only right when the
# target happens to be the last column.
feature_imp = pd.Series(clf.feature_importances_,index=df.columns.drop(OutVar)).sort_values(ascending=False)
feature_imp[:30]