# OS Fingerprinting based on ML and p0f dataset

In [None]:
import numpy as np
import pandas as pd
import p0f_db_parser as parser

In [None]:
# Single seed: it initialises NumPy's global RNG here and is also passed as
# `random_state` to the estimators defined further down.
seed = 42
np.random.seed(seed)

### Parse database and import dataset

In [None]:
# Parse the p0f database file; the parser returns the rows together with
# their column names, and both are loaded into a DataFrame.
dataset,column_names = parser.parse_database("p0f.fp")
df = pd.DataFrame(dataset,columns=column_names)

### Explore dataset

In [None]:
# Preview the first rows of the freshly parsed dataset.
print("Initial dataset")
df.head()

In [None]:
def DataCheckings(df):
    """Print a quick exploratory summary of a DataFrame.

    Reports, in order: number of rows, number of columns, distinct dtypes,
    number of duplicated rows, ``describe()`` statistics, column names,
    object-typed (categorical) columns, NaN counts per column and per row,
    and the value counts of every column.

    Args:
        df: pandas DataFrame to inspect. It is not modified.

    Returns:
        None. Everything is written to standard output.
    """
    # Check the number of data points in the data set
    print("\nData points =", len(df))

    # Check the number of columns in the data set
    print("\nColumns (output + features)=", len(df.columns))

    # Check the data types
    print("\nData types =", df.dtypes.unique())

    # Check the number of duplicates
    print("Number of duplicates: ", df.duplicated().sum())

    # Dataset statistics. The result has to be printed explicitly: inside a
    # function a bare `df.describe()` expression is evaluated and discarded,
    # so nothing would be shown.
    print('\n')
    print(df.describe())

    # Print names of columns
    print('Column Names:\n', df.columns)

    # See if there are categorical (object-typed) columns
    print("\nCategorical features:", df.select_dtypes(include=['O']).columns.tolist())

    # Check NA values
    # Number of columns containing at least one NaN
    print("\nColumns with NaN: ", df.isnull().any().sum(), ' / ', len(df.columns))

    # Number of data points (rows) containing at least one NaN
    print("\nNumber of data points with NaN:", df.isnull().any(axis=1).sum(), ' / ', len(df))

    # Frequency of each distinct value, column by column
    print()
    for column in df.columns:
        print(column + " -> ")
        print(df[column].value_counts())
        print()

In [None]:
# Summarise the raw dataset before any filtering or encoding.
DataCheckings(df)

In [None]:
# Count the rows of the raw dataset that are exact duplicates of an earlier row.

print("Number of duplicates: ", df.duplicated().sum())

### Encoding of the dataset

In [None]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import transformers as tr

#### Filter Operating Systems

In [None]:
# Filter OSes: keep only the operating systems of interest. `.copy()` makes
# the result an independent frame, so the steps below never edit a filtered
# selection in place (the pattern behind pandas' SettingWithCopyWarning).
df = df[df.os.isin(['Linux', 'Windows', 'Mac OS X', 'Solaris', 'OpenBSD', 'FreeBSD'])].copy()

# Merge both BSD flavours into a single 'BSD' class.
df = df.replace({'os': {'FreeBSD': 'BSD', 'OpenBSD': 'BSD'}})

# Re-number the rows 0..n-1 after filtering.
df = df.reset_index(drop=True)

# Drop the version and signature-direction columns.
df = df.drop(columns=["version", "sig_direction"])

In [None]:
# Column dtypes and non-null counts after filtering.
df.info()

#### Data Augmentation

In [None]:
# Data Augmentation
# TTL, MSS & Windows size

# ttl_factor = 10

# array = df.to_numpy()
# ttl_i = df.columns.get_loc('initial_ttl')

# for row in array:
#     for i in range(1,ttl_factor+1):
#         new_row = row.copy()
#         new_row[ttl_i] = row[ttl_i] - i
        
#         array = np.vstack((array, new_row))
        
# df = pd.DataFrame(array, columns = df.columns)

#### TTL

In [None]:
# TTL
# Numeric value, rescaled with min-max scaling (applied to `initial_ttl`
# when the column transformer is built further down).

encoder_ttl = MinMaxScaler()

#### MSS

In [None]:
# MSS
# Currently excluded from the feature set: the column is dropped rather than
# one-hot encoded (the encoder is kept commented out for reference).

# encoder_mss = OneHotEncoder(drop=['*'], sparse=False, handle_unknown='ignore')
df.drop('mss', inplace=True, axis=1)

#### Window Size

In [None]:
# Window Size
# Custom transformer taken from the `transformers` module imported as `tr`.
# NOTE(review): this binds `tr.WindowSizeTransformer` itself without calling
# it. If that name is a class, make_column_transformer needs an instance
# (`tr.WindowSizeTransformer()`) — confirm against the module.

encoder_window_size = tr.WindowSizeTransformer

#### Window Scaling

In [None]:
# Window Scaling
# Categorical encoding: one-hot, dropping the '*' category and ignoring
# categories that were not seen during fit.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4 — update this argument when upgrading.

encoder_window_scaling = OneHotEncoder(drop=['*'], sparse=False, handle_unknown='ignore')

#### TCP options

In [None]:
# TCP Options
# Custom transformer taken from the `transformers` module imported as `tr`.
# NOTE(review): this binds `tr.TCPOptionsTransformer` itself without calling
# it. If that name is a class, make_column_transformer needs an instance
# (`tr.TCPOptionsTransformer()`) — confirm against the module.

encoder_tcp_options = tr.TCPOptionsTransformer

#### Quirks

In [None]:
# Quirks
# Categorical encoding (already encoded)

#### Applying encodings

In [None]:
# Apply encodings: run every per-column encoder in a single ColumnTransformer
# and rebuild the DataFrame from its output.

from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Re-number the rows 0..n-1 before transforming.
df.reset_index(inplace=True, drop=True)

# One (encoder, columns) pair per encoded feature. Columns that are not
# listed are kept untouched (`remainder='passthrough'`), and output names are
# not prefixed with the transformer name (`verbose_feature_names_out=False`).
encoders = make_column_transformer(
    (encoder_ttl, ['initial_ttl']),
    (encoder_window_size, ['window_size']),
    (encoder_window_scaling, ['window_scaling']),
    (encoder_tcp_options, ['tcp_options']),
    remainder='passthrough',
    verbose_feature_names_out=False)

# Fit and transform in one step, then label the resulting array with the
# feature names reported by the transformer.
# NOTE(review): the passthrough columns include the string `os` label, so the
# combined array (and every column of the new frame) is presumably of object
# dtype — confirm, and convert the numeric columns if needed.
transformed = encoders.fit_transform(df)
transformed_df = pd.DataFrame(
    transformed,
    columns=encoders.get_feature_names_out()
)

# From here on `df` is the encoded dataset.
df = transformed_df

# define output variable name: the name of the `os` column, i.e. the label
OutVar = df.os.name

In [None]:
# Summarise the dataset again, now that it has been encoded.
DataCheckings(df)

In [None]:
# Remove rows that are exact duplicates after encoding.
df = df.drop_duplicates()

#### Dataset split

In [None]:
# Create two datasets: requests and responses

# df = df[df.sig_direction.isin(['request'])].drop('sig_direction', axis=1)
# df_response = df[df.sig_direction.isin(['response'])].drop('sig_direction', axis=1)

# df.reset_index(inplace=True, drop=True)
# df_response.reset_index(inplace=True, drop=True)

# del df

### Verify the class balance

In [None]:
# Number of samples per class of the output variable.
df[OutVar].value_counts()

### Get data as arrays

In [None]:
# Split the dataset into a feature matrix and a target vector (NumPy arrays).

# Feature matrix: every column except the output variable.
Xdata = df.drop(columns=OutVar).values
# Target vector: the values of the output variable.
Ydata = df[OutVar].values

print('Shape X data:', Xdata.shape)
print('Shape Y data:',Ydata.shape)

### Data split

In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
# # Request

# X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata,
#                                                                     stratify=Ydata, 
#                                                                     test_size=0.10,
#                                                                     random_state=seed)

# # verify dimentions of data for training and test
# print('Shape X_train:', X_train.shape)
# print('Shape X_test:' , X_test.shape)
# print('Shape y_train:', y_train.shape)
# print('Shape y_test:' , y_test.shape)

### ML

In [None]:
import time
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

##### Classes balance

In [None]:
def set_weights(y_data, option='balanced'):
    """Estimate per-class weights for an unbalanced label vector.

    `option` is forwarded to scikit-learn's `compute_class_weight`:
      * 'balanced' -> n_samples / (n_classes * np.bincount(y))
      * a dict     -> keys are classes, values are the corresponding weights
      * None       -> uniform weights

    Returns a dict mapping each distinct label in `y_data` to its weight.
    """
    labels = np.unique(y_data)
    weights = class_weight.compute_class_weight(class_weight=option, classes=labels, y=y_data)
    return dict(zip(labels, weights))

# Class weights computed over the full label vector; they are passed as
# `class_weight` to the classifiers defined below.
class_weights = set_weights(Ydata)

print("Request balance => ",class_weights)

##### Classifiers definition

In [None]:
# Baseline classifiers. Every estimator that accepts them is given the shared
# `seed` and the precomputed `class_weights`.
classifiers = [
    GaussianNB(),
    LinearDiscriminantAnalysis(),  # No random_state
    LogisticRegression(n_jobs=-1, solver='lbfgs', random_state=seed, class_weight=class_weights),
    # One hidden layer of 30 units. Written as a real one-element tuple:
    # `(30)` is just the integer 30 in parentheses.
    MLPClassifier(hidden_layer_sizes=(30,), random_state=seed, shuffle=False, solver='adam',
                  activation='relu', batch_size=500, max_iter=5000),
    DecisionTreeClassifier(random_state=seed, class_weight=class_weights),
    RandomForestClassifier(n_jobs=-1, random_state=seed, class_weight=class_weights),
    BaggingClassifier(n_jobs=-1, random_state=seed),
]

##### Training

In [None]:
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold

hiper_k_rango = range(1,30)
scores=[]
RKFold_stratified = RepeatedStratifiedKFold(n_splits=10,n_repeats=50)

for k in hiper_k_rango:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    resultados_kfold = cross_val_score(knn_model,Xdata,Ydata,cv=RKFold_stratified,scoring='accuracy')
    scores.append(resultados_kfold.mean())
    
scores_media_Kfold = scores

print(scores)
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(hiper_k_rango,scores)
plt.xlabel("Valores de k")
plt.ylabel("Accuracy en 50 repeticiones 10-fold cross-validation")

### Export transformers & model

In [None]:
# from joblib import dump, load

# # Transformers
# dump(encoders, '../../persistence/p0f/p0f_encoders.joblib')

# # Models
# dump(models_ML.Model.values[2],'../../persistence/p0f/p0f_classifier.joblib')