# OS Fingerprinting based on ML and p0f dataset

In [280]:
import numpy as np
import pandas as pd
import p0f_db_parser as parser

In [281]:
# Fixed seed: seeds NumPy's global RNG here and is reused further down as
# `random_state` for the train/test splits and the classifiers.
seed = 2022
np.random.seed(seed)

### Parse database and import dataset

In [282]:
# Load the p0f signature database into a DataFrame; the raw parser output is
# released immediately because only `df` is used afterwards.
dataset, column_names = parser.parse_database("p0f.fp")
df = pd.DataFrame(data=dataset, columns=column_names)
del dataset, column_names

### Explore dataset

In [283]:
# Preview the first rows of the freshly parsed dataset.
print("Initial dataset")
df.head()

Initial dataset


Unnamed: 0,sig_direction,os,version,initial_ttl,mss,window_size,window_scaling,tcp_options,quirk_df,quirk_id,quirk_ts
0,request,Linux,3.11 and newer,64,*,mss*20,10,"mss,sok,ts,nop,ws",1,1,0
1,request,Linux,3.11 and newer,64,*,mss*20,7,"mss,sok,ts,nop,ws",1,1,0
2,request,Linux,3.1-3.10,64,*,mss*10,4,"mss,sok,ts,nop,ws",1,1,0
3,request,Linux,3.1-3.10,64,*,mss*10,5,"mss,sok,ts,nop,ws",1,1,0
4,request,Linux,3.1-3.10,64,*,mss*10,6,"mss,sok,ts,nop,ws",1,1,0


In [284]:
# Name of the output (label) column, taken by position from the parsed frame.
# NOTE(review): position 1 appears to be `os` in the preview above - confirm against the parser.
OutVar = df.columns[1]

### Encoding of the dataset

In [285]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

#### Filter Operating Systems

In [286]:
# Keep only the operating systems of interest, renumber the rows and discard
# the `version` column (one method chain instead of in-place edits).
df = (
    df[df.os.isin(['Linux', 'Windows', 'Mac OS X', 'Solaris', 'OpenBSD', 'FreeBSD'])]
    .reset_index(drop=True)
    .drop('version', axis=1)
)

# Snapshot of the filtered, un-augmented data; it is encoded again with the
# reloaded encoders in the export section.
df_test = df.copy()

#### Data Augmentation

In [287]:
# Data Augmentation
# For every original row, add `ttl_factor` copies whose initial TTL is lowered
# by 1..ttl_factor. Only the TTL column is modified; MSS and window size are
# left untouched.
# NOTE(review): presumably this emulates packets observed a few hops away - confirm.

ttl_factor = 10

array = df.to_numpy()
ttl_i = df.columns.get_loc('initial_ttl')

# Collect the new rows in a list and stack once at the end: calling np.vstack
# inside the loop re-copies the whole growing array on every iteration.
new_rows = []
for row in array:
    for i in range(1, ttl_factor + 1):
        new_row = row.copy()
        new_row[ttl_i] = row[ttl_i] - i
        new_rows.append(new_row)

# Same row order as before: the original rows first, then the copies grouped
# by source row and ordered by increasing TTL decrement.
if new_rows:
    array = np.vstack([array] + new_rows)

df = pd.DataFrame(array, columns=df.columns)

#### TTL

In [288]:
# TTL
# Numeric value (standardization or normalization still to be decided)

#### MSS

In [289]:
# MSS
# Categorical encoding

encoder_mss = OneHotEncoder(drop=['*'], sparse=False, handle_unknown='ignore')

#### Window Size

In [290]:
# Window Size
# Drop column

class WindowSizeTransformer(BaseEstimator, TransformerMixin):
    """Transformer that removes the column routed to it from the feature set.

    Nothing is learned: ``transform`` returns its input without the first
    column, so the window-size column contributes no output features.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit.

        ``y`` is accepted and ignored so the transformer also works when labels
        are passed through ``fit`` / ``fit_transform``.
        """
        return self

    def transform(self, X):
        """Return ``X`` (expected to be a DataFrame) without its first column."""
        return X.drop(X.columns[0], axis=1)

    def get_feature_names_out(self, names='window_size'):
        """Return the output feature names: none, because the column is dropped."""
        return []

encoder_window_size = WindowSizeTransformer()

#### Window Scaling

In [291]:
# Window Scaling 
# Categorical encoding

encoder_window_scaling = OneHotEncoder(drop=['*'], sparse=False, handle_unknown='ignore')

# encoder_window_scaling.fit(df[['window_scaling']])

#### TCP options

In [292]:
# TCP Options
# Custom transformer

class TCPOptionsTransformer(BaseEstimator, TransformerMixin):
    """Positional one-hot encoding of a comma-separated TCP options column.

    ``fit`` learns the longest option list and the distinct option names.
    ``transform`` splits every value into ``max_options`` positional columns
    (``<feature_name>0``, ``<feature_name>1``, ...), pads missing positions
    with ``'*'`` and one-hot encodes each position against the learned names.
    Padding and option names not seen during ``fit`` are encoded as all zeros
    because the inner encoder ignores unknown categories (as long as ``'*'`` is
    not itself a fitted option name).
    """

    def __init__(self):
        self.max_options = 0     # longest option list seen by fit
        self.feature_name = ''   # name of the input column
        self.classes = []        # distinct option names, in first-seen order
        self.headers = {}        # positional column name -> '*' default
        self.names_out = []      # output feature names, filled in by transform

    def fit(self, X, y=None):
        """Learn the option vocabulary and the maximum number of options per row.

        ``y`` is accepted and ignored for scikit-learn API compatibility.
        All learned state is rebuilt from scratch, so fitting the same instance
        again does not accumulate names or width from a previous fit.
        """
        X_2 = pd.DataFrame(X).reset_index(drop=True)
        self.feature_name = X_2.columns[0]

        max_options = 0
        seen = {}  # dict used as an insertion-ordered set
        for row in X_2[self.feature_name]:
            values = row.split(',')
            max_options = max(max_options, len(values))
            for v in values:
                seen.setdefault(v, None)

        self.max_options = max_options
        self.classes = list(seen)
        self.headers = {self.feature_name+str(i): '*' for i in range(self.max_options)}
        self.names_out = []
        return self

    def transform(self, X):
        """Return the positional one-hot matrix for ``X`` as a dense array.

        Option lists longer than ``max_options`` are truncated; shorter ones
        are padded with ``'*'``. Also refreshes ``names_out``.
        """
        X_2 = pd.DataFrame(X).reset_index(drop=True)
        columns = list(self.headers.keys())
        width = self.max_options

        # Build every padded/truncated row in one pass instead of writing the
        # frame cell by cell.
        padding = ['*'] * width
        rows = [(row.split(',') + padding)[:width] for row in X_2[self.feature_name]]
        positional = pd.DataFrame(rows, columns=columns, dtype=object)

        # The categories are fully determined by the fitted state, so building
        # the inner encoder on every call always yields the same encoding.
        encoder2_tcp_options = OneHotEncoder(categories=[self.classes]*width,sparse=False,handle_unknown='ignore')
        result = encoder2_tcp_options.fit_transform(positional)

        self.names_out = encoder2_tcp_options.get_feature_names_out()

        return result

    def get_feature_names_out(self,names='tcp_options'):
        """Return the feature names produced by the most recent ``transform`` call."""
        return self.names_out

encoder_tcp_options = TCPOptionsTransformer()

#### Quirks

In [293]:
# Quirks
# Categorical encoding (already encoded)

#### Applying encodings

In [None]:
# Apply encodings

from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

df = df.reset_index(drop=True)

# Route each listed column to its encoder; every other column passes through
# unchanged and output names are not prefixed with the transformer name.
encoders = make_column_transformer(
    (encoder_mss, ['mss']),
    (encoder_window_size, ['window_size']),
    (encoder_window_scaling, ['window_scaling']),
    (encoder_tcp_options, ['tcp_options']),
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Fit on the augmented data, then rebuild a DataFrame labelled with the
# encoded feature names (names are only available after fitting).
transformed = encoders.fit_transform(df)
transformed_df = pd.DataFrame(data=transformed, columns=encoders.get_feature_names_out())

df = transformed_df

# define output variable name
OutVar = df.os.name

In [None]:
# Remove duplicates: drop rows that are identical across all columns,
# keeping the first occurrence of each.

df = df.drop_duplicates()

#### Dataset split

In [None]:
# Create two datasets: requests and responses
# (rows are selected by `sig_direction`, which is then dropped as a feature)

df_request = (
    df[df.sig_direction.isin(['request'])]
    .drop('sig_direction', axis=1)
    .reset_index(drop=True)
)
df_response = (
    df[df.sig_direction.isin(['response'])]
    .drop('sig_direction', axis=1)
    .reset_index(drop=True)
)

# The combined frame is not needed once both halves exist.
del df

In [None]:
# Preview the first rows of the encoded request dataset.
df_request.head()

In [None]:
# Preview the first rows of the encoded response dataset.
df_response.head()

### Verify the class balance

In [None]:
# Number of request samples per value of the output column.
df_request[OutVar].value_counts()

In [None]:
# Number of response samples per value of the output column.
df_response[OutVar].value_counts()

### Get data as arrays

In [None]:
# Request

# Label vector (the `OutVar` column) and feature matrix (every other column)
Ydata_request = df_request[OutVar].to_numpy()
Xdata_request = df_request.drop(columns=OutVar).to_numpy()

print('Shape X data:', Xdata_request.shape)
print('Shape Y data:',Ydata_request.shape)

In [None]:
# Response

# Label vector (the `OutVar` column) and feature matrix (every other column)
Ydata_response = df_response[OutVar].to_numpy()
Xdata_response = df_response.drop(columns=OutVar).to_numpy()

print('Shape X data:', Xdata_response.shape)
print('Shape Y data:',Ydata_response.shape)

### Data split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Request

# Stratified split: 10% of the samples are held out for testing and the
# labels are used to stratify both parts.
(X_train_request, X_test_request,
 y_train_request, y_test_request) = train_test_split(
    Xdata_request,
    Ydata_request,
    test_size=0.10,
    stratify=Ydata_request,
    random_state=seed,
)

# verify dimensions of data for training and test
for caption, part in (
    ('Shape X_train:', X_train_request),
    ('Shape X_test:', X_test_request),
    ('Shape y_train:', y_train_request),
    ('Shape y_test:', y_test_request),
):
    print(caption, part.shape)

In [None]:
# Response

# Stratified split: 10% of the samples are held out for testing and the
# labels are used to stratify both parts.
(X_train_response, X_test_response,
 y_train_response, y_test_response) = train_test_split(
    Xdata_response,
    Ydata_response,
    test_size=0.10,
    stratify=Ydata_response,
    random_state=seed,
)

# verify dimensions of data for training and test
for caption, part in (
    ('Shape X_train:', X_train_response),
    ('Shape X_test:', X_test_response),
    ('Shape y_train:', y_train_response),
    ('Shape y_test:', y_test_response),
):
    print(caption, part.shape)

### ML

In [None]:
import time
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

##### Classes balance

In [None]:
def set_weights(y_data, option='balanced'):
    """Compute per-class weights for an unbalanced label vector.

    `option` is forwarded as `class_weight` to `compute_class_weight`:
    'balanced' gives n_samples / (n_classes * np.bincount(y)), a dictionary
    maps classes to explicit weights, and None gives uniform weights.

    Returns a dict mapping each distinct label in `y_data` to its weight.
    """
    labels = np.unique(y_data)
    weights = class_weight.compute_class_weight(class_weight=option, classes=labels, y=y_data)
    return dict(zip(labels, weights))

# Class weights for each direction, computed from the full label vectors
# (i.e. before the train/test split).
class_weights_request = set_weights(Ydata_request)
class_weights_response = set_weights(Ydata_response)

print("Request balance => ",class_weights_request)
print("Response balance => ",class_weights_response)

##### Classifiers definition

In [None]:
# define a list of classifiers to train as baseline classifiers
def _baseline_classifiers(class_weights):
    """Return a fresh list of untrained baseline classifiers.

    `class_weights` is passed as `class_weight` to the estimators that take it
    here (logistic regression, decision tree, random forest); the remaining
    estimators are built without class weights.
    """
    return [
        GaussianNB(),
        LinearDiscriminantAnalysis(), # No random_state
        LogisticRegression(n_jobs=-1,solver='lbfgs',random_state=seed,class_weight=class_weights),
        # One hidden layer of 30 units; written as a real tuple, `(30)` is just the int 30.
        MLPClassifier(hidden_layer_sizes=(30,), random_state = seed, shuffle=False, solver='adam',activation='relu',batch_size=500, max_iter=5000),
        DecisionTreeClassifier(random_state=seed,class_weight=class_weights),
        RandomForestClassifier(n_jobs=-1,random_state=seed,class_weight=class_weights),
        BaggingClassifier(n_jobs=-1,random_state=seed)
    ]

# Same set of models for both directions; only the class weights differ.
classifiers_request = _baseline_classifiers(class_weights_request)
classifiers_response = _baseline_classifiers(class_weights_response)

##### Training

In [None]:
# training and metrics (ACC, precision, recall, f1score) for a classifier
def ML_baseline(cls, X_tr, y_tr, X_ts, y_ts, seed=42, classes=['0','1']):
    """Fit `cls` on the training split and score it on the test split.

    Parameters
    ----------
    cls : estimator exposing `fit` and `predict`; it is fitted in place.
    X_tr, y_tr : training features and labels.
    X_ts, y_ts : test features and labels.
    seed : unused; kept so existing calls keep working.
    classes : display names for the labels, passed as `target_names` to
        `classification_report` (must follow the sorted label order).

    Returns
    -------
    tuple
        (cls, ACC, precision, recall, f1score): the fitted estimator, the test
        accuracy, and the weighted-average precision, recall and F1 score
        taken from the classification report.
    """
    cls.fit(X_tr, y_tr) # TRAINING!

    # Only hard predictions are needed: no probability-based metric (AUROC) is
    # computed, so estimators without `predict_proba` can be scored as well.
    y_pred = cls.predict(X_ts)

    cls_rep = classification_report(y_ts, y_pred, target_names=classes,
                                    output_dict=True, digits=3)
    weighted = cls_rep['weighted avg']

    ACC = accuracy_score(y_ts, y_pred)

    return cls, ACC, weighted['precision'], weighted['recall'], weighted['f1-score']

In [None]:
# Request

classes_names = np.unique(Ydata_request)

# Collect one record per classifier and build the results table once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
records_request = []
for cls in classifiers_request:
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train_request, y_train_request, X_test_request, y_test_request, seed=seed,classes=classes_names)
    records_request.append({'Method': str(type(cls).__name__),
                            'ACC': float(ACC),
                            'precision': float(precision),
                            'recall': float(recall),
                            'f1-score': float(f1score)})

# create a dataframe for ML baseline
statistics_ML_request = pd.DataFrame(records_request, columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])

statistics_ML_request

In [None]:
# Response

classes_names = np.unique(Ydata_response)

# Collect one record per classifier and build the results table once.
# (DataFrame has no `concat` method, and `DataFrame.append` was removed in
# pandas 2.0, so the records are gathered in a plain list instead.)
records_response = []
for cls in classifiers_response:
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train_response, y_train_response, X_test_response, y_test_response, seed=seed,classes=classes_names)
    records_response.append({'Method': str(type(cls).__name__),
                             'ACC': float(ACC),
                             'precision': float(precision),
                             'recall': float(recall),
                             'f1-score': float(f1score)})

# create a dataframe for ML baseline
statistics_ML_response = pd.DataFrame(records_response, columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])

statistics_ML_response

In [None]:
# df_ML.to_csv('ML_results.csv', index=False) # write to file the results

### Export model

In [None]:
from joblib import dump, load

# Persist the fitted column transformer, reload it, and apply the reloaded
# copy to the un-augmented snapshot `df_test`.
# NOTE(review): the custom transformer classes are defined in this notebook, so
# they presumably must be importable wherever the file is loaded - confirm.
encoders_path = '../persistance/encoders.joblib'
dump(encoders, encoders_path)
encoders2 = load(encoders_path)

transformed2 = encoders2.transform(df_test)
transformed_df_test = pd.DataFrame(data=transformed2, columns=encoders2.get_feature_names_out())

### Try a better classifier for the best ML method

We are using the best methods from baseline to find better hyperparameters for a better model.

In [None]:
# our best model was RF.
# The baseline section only defines request-/response-specific variables, so
# the generic names used by the tuning cells below are bound here explicitly.
# NOTE(review): the request dataset is assumed; swap in the *_response
# variables to tune the response model instead.
X_train, X_test = X_train_request, X_test_request
y_train, y_test = y_train_request, y_test_request
class_weights = class_weights_request
classes_names = np.unique(Ydata_request)

cls=RandomForestClassifier(n_jobs=-1,random_state=seed,class_weight=class_weights)

In [None]:
# check all the parameters (displays the estimator's current parameter settings)
cls.get_params()

In [None]:
# define a list of classifiers to train with different params:
# random forests that are identical except for the number of trees
classifiers = [
    RandomForestClassifier(n_estimators=n_trees, n_jobs=-1, random_state=seed, class_weight=class_weights)
    for n_trees in (10, 20, 50, 100, 200, 300, 500)
]

In [None]:
# Collect one record per forest size and build the results table once
# (DataFrame.append was removed in pandas 2.0).
records_ML2 = []
for cls in classifiers:
    print("\n**********************************\n", cls)
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train, y_train, X_test, y_test, seed=seed,classes=classes_names)
    records_ML2.append({'Method': str(type(cls).__name__)+'-NoTrees='+str(cls.get_params()['n_estimators']),
                        'ACC': float(ACC),
                        'precision': float(precision),
                        'recall': float(recall),
                        'f1-score': float(f1score)})

# create a dataframe for ML baseline
df_ML2 = pd.DataFrame(records_ML2, columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])


In [None]:
# Show the metrics obtained for each forest size.
df_ML2

In [None]:
# Save the forest-size comparison table to CSV, without the index column.
df_ML2.to_csv('ML_results_best1.csv', index=False)

### Grid search - search for the best params

In [None]:
# Hyper-parameter grid for the random forest search.
# 'auto' is no longer listed under max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated every grid point) and it is rejected
# by scikit-learn >= 1.3.
paramsx = {'bootstrap': [True, False],
           'max_depth': [10, 20, 30, 40, 50, None],
           'max_features': ['sqrt'],
           'min_samples_leaf': [1, 2, 4],
           'min_samples_split': [2, 5, 10],
           'n_estimators': [50]
          }

In [None]:
# Base estimator for the grid search; the grid supplies the remaining hyper-parameters.
forest= RandomForestClassifier(random_state=seed,class_weight=class_weights)

In [None]:
# Exhaustive search over `paramsx` with 3-fold cross-validation on all cores.
# `fit` returns the fitted search object, so `bestF` refers to `gridF` itself.
gridF = GridSearchCV(forest, paramsx, cv = 3, verbose = 2, n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
bestF.best_params_ # params of the best model

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

def evaluate(model, x_test, y_test):
    """Print the test accuracy of `model` and plot its confusion matrix.

    The matrix covers every label that occurs in `y_test` or in the
    predictions, so predictions of a class that is absent from `y_test` are
    shown instead of being silently left out. Returns None.
    """
    y_pred = model.predict(x_test)
    labels = np.union1d(y_test, y_pred)  # sorted union of true and predicted labels
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print (accuracy)
    cm = confusion_matrix(y_test,y_pred, labels=labels)
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(xticks_rotation='vertical')

In [None]:
# Take the estimator refitted with the best parameters and score it on the held-out test split.
best_grid = bestF.best_estimator_ # the best model from grid search

evaluate(best_grid,X_test,y_test)

### Feature importance

In [None]:
# calculate ACC
# NOTE(review): `clf` was never defined in this notebook; the best estimator
# found by the grid search is assumed to be the model meant here.
clf = best_grid
y_pred=clf.predict(X_test)
print(list(clf.classes_))
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Feature names are every encoded column except the output column. `df` was
# deleted after the request/response split and the output column is not the
# last one, so `df.columns[:-1]` cannot be used; the request and response
# frames share the same columns.
feature_names = df_request.drop(OutVar, axis=1).columns
feature_imp = pd.Series(clf.feature_importances_,index=feature_names).sort_values(ascending=False)
feature_imp[:30]