# Prediction of OS type using Machine Learning based on OS fingerprints

In [1]:
import numpy as np
import pandas as pd
import feather

In [2]:
seed = 2021
np.random.seed(seed)

### Read dataset from disk

In [3]:
df = pd.read_feather("./dataset.feather", use_threads=True);

In [4]:
df.head()

Unnamed: 0,Class.OSfamily_0,SEQ.SP_0,SEQ.GCD_0,SEQ.ISR_0,SEQ.TI_0,SEQ.TI_1,SEQ.CI_0,SEQ.CI_1,SEQ.II_0,SEQ.II_1,...,U1.RIPCK_1,U1.RUCK_0,U1.RUCK_1,U1.RUD_0,IE.R_0,IE.DFI_0,IE.T_0,IE.TG_0,IE.CD_0,IE.CD_1
0,Linux,257,1,261,5,-1,-1,-1,5,-1,...,-1,-1,-1,-1,-1,1,124,128,2,-1
1,Linux,187,5,195,1,-1,1,-1,5,-1,...,-1,1,-1,1,-1,1,65,64,2,-1
2,Linux,2,64001,150,5,-1,5,-1,5,-1,...,-1,1,-1,1,-1,2,64,64,1,-1
3,Linux,0,128002,155,5,-1,5,-1,5,-1,...,-1,1,-1,1,-1,2,59,64,1,-1
4,Linux,2,192003,152,5,-1,5,-1,5,-1,...,-1,1,-1,1,-1,2,65,64,1,-1


In [5]:
# header = names of columns
print(list(df.columns))

['Class.OSfamily_0', 'SEQ.SP_0', 'SEQ.GCD_0', 'SEQ.ISR_0', 'SEQ.TI_0', 'SEQ.TI_1', 'SEQ.CI_0', 'SEQ.CI_1', 'SEQ.II_0', 'SEQ.II_1', 'SEQ.SS_0', 'SEQ.TS_0', 'SEQ.TS_1', 'OPS.O1_0', 'OPS.O1_1', 'OPS.O1_2', 'OPS.O1_3', 'OPS.O1_4', 'OPS.O1_5', 'OPS.O1_6', 'OPS.O1_7', 'OPS.O1_8', 'OPS.O1_9', 'OPS.O2_0', 'OPS.O2_1', 'OPS.O2_2', 'OPS.O2_3', 'OPS.O2_4', 'OPS.O2_5', 'OPS.O2_6', 'OPS.O2_7', 'OPS.O2_8', 'OPS.O2_9', 'OPS.O3_0', 'OPS.O3_1', 'OPS.O3_2', 'OPS.O3_3', 'OPS.O3_4', 'OPS.O3_5', 'OPS.O3_6', 'OPS.O3_7', 'OPS.O3_8', 'OPS.O3_9', 'OPS.O3_10', 'OPS.O3_11', 'OPS.O4_0', 'OPS.O4_1', 'OPS.O4_2', 'OPS.O4_3', 'OPS.O4_4', 'OPS.O4_5', 'OPS.O4_6', 'OPS.O4_7', 'OPS.O4_8', 'OPS.O4_9', 'OPS.O5_0', 'OPS.O5_1', 'OPS.O5_2', 'OPS.O5_3', 'OPS.O5_4', 'OPS.O5_5', 'OPS.O5_6', 'OPS.O5_7', 'OPS.O5_8', 'OPS.O5_9', 'OPS.O6_0', 'OPS.O6_1', 'OPS.O6_2', 'OPS.O6_3', 'OPS.O6_4', 'OPS.O6_5', 'OPS.O6_6', 'WIN.W1_0', 'WIN.W2_0', 'WIN.W3_0', 'WIN.W4_0', 'WIN.W5_0', 'WIN.W6_0', 'ECN.R_0', 'ECN.DF_0', 'ECN.T_0', 'ECN.TG_0', 'ECN.

In [6]:
# no of features (X)
print("No features=", len(list(df.columns))-1)

No features= 268


In [7]:
# output name
print("Output=", list(df.columns)[0])

Output= Class.OSfamily_0


In [8]:
# define output variable name
OutVar = list(df.columns)[0]
print(OutVar)

Class.OSfamily_0


### Checking data

In [9]:
def DataCheckings(df):
    # Check the number of data points in the data set
    print("\nData points =", len(df))
    
    # Check the number of columns in the data set
    print("\nColumns (output + features)=",len(df.columns))
    
    # Check the data types
    print("\nData types =", df.dtypes.unique())
    
    # Dataset statistics
    print('\n')
    df.describe()
    
    # print names of columns
    print('Column Names:\n', df.columns)
    
    # see if there are categorical data
    print("\nCategorical features:", df.select_dtypes(include=['O']).columns.tolist())
    
    # Check NA values
    # Check any number of columns with NaN
    print("\nColumns with NaN: ", df.isnull().any().sum(), ' / ', len(df.columns))

    # Check any number of data points with NaN
    print("\nNo of data points with NaN:", df.isnull().any(axis=1).sum(), ' / ', len(df))

In [10]:
DataCheckings(df)


Data points = 264854

Columns (output + features)= 269

Data types = [dtype('O') dtype('int64')]


Column Names:
 Index(['Class.OSfamily_0', 'SEQ.SP_0', 'SEQ.GCD_0', 'SEQ.ISR_0', 'SEQ.TI_0',
       'SEQ.TI_1', 'SEQ.CI_0', 'SEQ.CI_1', 'SEQ.II_0', 'SEQ.II_1',
       ...
       'U1.RIPCK_1', 'U1.RUCK_0', 'U1.RUCK_1', 'U1.RUD_0', 'IE.R_0',
       'IE.DFI_0', 'IE.T_0', 'IE.TG_0', 'IE.CD_0', 'IE.CD_1'],
      dtype='object', length=269)

Categorical features: ['Class.OSfamily_0']

Columns with NaN:  0  /  269

No of data points with NaN: 0  /  264854


In [11]:
print('Shape before removing duplicates=', df.shape)

Shape before removing duplicates= (264854, 269)


In [12]:
# remove duplicates!
df.drop_duplicates(keep=False, inplace=True)

In [13]:
print('Shape after removing duplicates=', df.shape)

Shape after removing duplicates= (264852, 269)


### Remove near zero variance features

In [14]:
from sklearn.utils import class_weight
from sklearn.feature_selection import VarianceThreshold

In [15]:
def getDataFromDataFrame(df, OutVar):
    # get X, Y data and column names from df
    print('\n-> Get X & Y data, Features list')
    print('Shape', df.shape)
    
    # select X and Y
    ds_y = df[OutVar]
    ds_X = df.drop(OutVar,axis = 1)
    Xdata = ds_X.values # get values of features
    Ydata = ds_y.values # get output values

    print('Shape X data:', Xdata.shape)
    print('Shape Y data:', Ydata.shape)
    
    # return data for X and Y, feature names as list
    print('Done!')
    return (Xdata, Ydata, list(ds_X.columns))

def Remove0VarCols(df, OutVar):
    Xdata, Ydata, Features = getDataFromDataFrame(df,OutVar=OutVar)# out var = Class 
    print('\n-> Remove zero variance features')
    # print('Initial features:', Features)
    selector= VarianceThreshold()
    Xdata = selector.fit_transform(Xdata)
    # Selected features
    SelFeatures = []
    for i in selector.get_support(indices=True):
        SelFeatures.append(Features[i])
    print('Removed features:',list(set(Features) - set(SelFeatures)))
    
    # create the resulted dataframe
    df = pd.DataFrame(Xdata,columns=SelFeatures)
    df[OutVar] = Ydata # add class column
    # print('Final columns:', list(df.columns))
    print('Done!')
    return df

In [16]:
df = Remove0VarCols(df, OutVar)


-> Get X & Y data, Features list
Shape (264852, 269)
Shape X data: (264852, 268)
Shape Y data: (264852,)
Done!

-> Remove zero variance features
Removed features: ['T4.F_3', 'T2.Q_0', 'T5.F_0', 'SEQ.CI_1', 'T6.F_0', 'T2.F_1', 'T7.F_5', 'ECN.Q_0', 'T4.Q_0', 'T5.F_1', 'SEQ.TI_1', 'T6.F_3', 'T5.Q_0', 'T4.F_1', 'T4.F_0', 'T1.Q_0', 'T7.F_0', 'T6.F_1', 'T7.F_3', 'T1.F_0', 'T3.F_1', 'T1.F_1', 'T2.F_0', 'T6.Q_0', 'T1.F_6', 'IE.CD_1', 'U1.RIPCK_1', 'T3.Q_0', 'T7.F_1', 'T3.F_3', 'T3.F_0', 'T5.F_3', 'T4.F_5', 'T7.Q_0', 'SEQ.II_1']
Done!


In [17]:
# print dimension AFTER removing features
print("Dataset dimension AFTER removing near zero variance features=",df.shape)

Dataset dimension AFTER removing near zero variance features= (264852, 234)


In [18]:
df.columns

Index(['SEQ.SP_0', 'SEQ.GCD_0', 'SEQ.ISR_0', 'SEQ.TI_0', 'SEQ.CI_0',
       'SEQ.II_0', 'SEQ.SS_0', 'SEQ.TS_0', 'SEQ.TS_1', 'OPS.O1_0',
       ...
       'U1.RIPCK_0', 'U1.RUCK_0', 'U1.RUCK_1', 'U1.RUD_0', 'IE.R_0',
       'IE.DFI_0', 'IE.T_0', 'IE.TG_0', 'IE.CD_0', 'Class.OSfamily_0'],
      dtype='object', length=234)

### Verify the classes ballance

In [19]:
df[OutVar].value_counts()

Linux      142548
Windows     67128
iOS         17232
macOS       16160
BSD         14278
Solaris      4926
Android      2580
Name: Class.OSfamily_0, dtype: int64

### Get data as arrays

In [20]:
# select X and Y
Ydata = df[OutVar].values                  # get values of features
Xdata = df.drop(OutVar,axis = 1).values    # get output values

print('Shape X data:', Xdata.shape)
print('Shape Y data:',Ydata.shape)

Shape X data: (264852, 233)
Shape Y data: (264852,)


In [21]:
Ydata

array(['Linux', 'Linux', 'Linux', ..., 'Linux', 'Linux', 'Linux'],
      dtype=object)

### Data split

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata,
                                                    stratify=Ydata, 
                                                    test_size=0.10,
                                                    random_state=seed)

In [24]:
# verify dimentions of data for training and test
print('Shape X_train:', X_train.shape)
print('Shape X_test:' , X_test.shape)
print('Shape y_train:', y_train.shape)
print('Shape y_test:' , y_test.shape)

Shape X_train: (238366, 233)
Shape X_test: (26486, 233)
Shape y_train: (238366,)
Shape y_test: (26486,)


### ML

In [25]:
import time
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [26]:
def set_weights(y_data, option='balanced'):
    """Estimate class weights for umbalanced dataset
       If ‘balanced’, class weights will be given by n_samples / (n_classes * np.bincount(y)). 
       If a dictionary is given, keys are classes and values are corresponding class weights. 
       If None is given, the class weights will be uniform """
    cw = class_weight.compute_class_weight(option, np.unique(y_data), y_data)
    w = {i:j for i,j in zip(np.unique(y_data), cw)}
    return w

In [None]:
class_weights = set_weights(Ydata)

In [28]:
print("Classes=",class_weights)

Classes= {'Android': 14.665116279069768, 'BSD': 2.649950973525704, 'Linux': 0.2654263826921458, 'Solaris': 7.6808769792935445, 'Windows': 0.5636396138720057, 'iOS': 2.195682451253482, 'macOS': 2.3413366336633663}


In [29]:
# define a list of classifiers to train as baseline classifiers
classifiers = [
    GaussianNB(),
    LinearDiscriminantAnalysis(), # No random_state
    LogisticRegression(n_jobs=-1,solver='lbfgs',random_state=seed,class_weight=class_weights),
    MLPClassifier(hidden_layer_sizes= (30), random_state = seed, shuffle=False, solver='adam',activation='relu',batch_size=500, max_iter=5000),
    DecisionTreeClassifier(random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_jobs=-1,random_state=seed,class_weight=class_weights),
    BaggingClassifier(n_jobs=-1,random_state=seed),
]

In [29]:
# training and metrics (ACC, precision, recall, f1score) for a classifier
def ML_baseline(cls, X_tr, y_tr, X_ts, y_ts, seed=42, classes=['0','1']):
    ACC = 0
    AUROC = 0
    precision = 0 
    recall = 0
    f1score = 0
    
    cls_name = type(cls).__name__
    
    start_time = time.time()
    cls.fit(X_tr, y_tr) # TRAINING!
    print('\n---->', "training: %0.2f mins \n\n" % ((time.time() - start_time)/60))
    
    # predictions
    y_pred  = cls.predict(X_ts)             # predict classes
    y_probs = cls.predict_proba(X_ts)[:, 1] # predict probabilities of classes
    cls_rep = classification_report(y_ts, y_pred, target_names=classes,
                                    output_dict=True, digits=3)
    # print classification report
    #print(cls_rep)
    
    ACC       = accuracy_score(y_ts, y_pred)
    #AUROC     = roc_auc_score(y_ts, y_probs) # this is working for 2-classes classification only!!!
    precision = cls_rep['weighted avg']['precision']
    recall    = cls_rep['weighted avg']['recall']
    f1score   = cls_rep['weighted avg']['f1-score']  
    
    # print metrics
    print("\n", "ACC=", ACC, "precision=", precision, "recall=", recall, "f1score=",f1score)
    
    return cls, ACC, precision, recall, f1score

In [31]:
# create a dataframe for ML baseline
df_ML = pd.DataFrame(columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])

for cls in classifiers:
    print("\n**********************************\n", cls_name = type(cls).__name__)
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train, y_train, X_test, y_test, seed=seed,classes=['1','2','3','4','5','6','7'])
    df_ML = df_ML.append({'Method': str(type(cls).__name__),
                          'ACC': float(ACC),
                          #'AUROC': float(AUROC),
                          'precision': float(precision),
                          'recall': float(recall),
                          'f1-score': float(f1score)}, ignore_index=True)

df_ML


**********************************
 GaussianNB()
> GaussianNB training: 0.01 mins 


 GaussianNB() ACC= 0.10764177301215737 precision= 0.6290698591863442 recall= 0.10764177301215737 f1score= 0.05813684642520353

**********************************
 LinearDiscriminantAnalysis()
> LinearDiscriminantAnalysis training: 0.15 mins 


 LinearDiscriminantAnalysis() ACC= 0.957335951068489 precision= 0.9627511448657574 recall= 0.957335951068489 f1score= 0.9585745382704255

**********************************
 LogisticRegression(class_weight={'Android': 14.665116279069768,
                                 'BSD': 2.649950973525704,
                                 'Linux': 0.2654263826921458,
                                 'Solaris': 7.6808769792935445,
                                 'Windows': 0.5636396138720057,
                                 'iOS': 2.195682451253482,
                                 'macOS': 2.3413366336633663},
                   n_jobs=-1, random_state=2021)
> LogisticRe

Unnamed: 0,Method,ACC,precision,recall,f1-score
0,GaussianNB,0.107642,0.62907,0.107642,0.058137
1,LinearDiscriminantAnalysis,0.957336,0.962751,0.957336,0.958575
2,LogisticRegression,0.110134,0.781292,0.110134,0.079551
3,MLPClassifier,0.931473,0.946398,0.931473,0.917231
4,DecisionTreeClassifier,0.952163,0.951682,0.952163,0.951853
5,RandomForestClassifier,0.960545,0.963604,0.960545,0.958437
6,BaggingClassifier,0.95794,0.96102,0.95794,0.957819


In [34]:
df_ML.to_csv('ML_results.csv', index=False) # write to file the results

### Try a better classifier for the best ML method

We are using the best methods from baseline to find better hyperparameters for a better model.

In [36]:
# out best model was RF:
cls=RandomForestClassifier(n_jobs=-1,random_state=seed,class_weight=class_weights)

In [37]:
# check all the parameters
cls.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': {'Android': 14.665116279069768,
  'BSD': 2.649950973525704,
  'Linux': 0.2654263826921458,
  'Solaris': 7.6808769792935445,
  'Windows': 0.5636396138720057,
  'iOS': 2.195682451253482,
  'macOS': 2.3413366336633663},
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [48]:
# define a list of classifiers to train with different params
classifiers = [
    RandomForestClassifier(n_estimators=10, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=20, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=50, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=100, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=200, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=300, n_jobs=-1,random_state=seed,class_weight=class_weights),
    RandomForestClassifier(n_estimators=500, n_jobs=-1,random_state=seed,class_weight=class_weights),
]

In [49]:
# create a dataframe for ML baseline
df_ML2 = pd.DataFrame(columns=['Method', 'ACC','precision' ,'recall' ,'f1-score' ])
df_ML2

for cls in classifiers:
    print("\n**********************************\n", cls)
    cls_fit, ACC, precision,recall,f1score=ML_baseline(cls, X_train, y_train, X_test, y_test, seed=seed,classes=['Android', 'BSD', 'Linux', 'Solaris', 'Windows', 'iOS', 'macOS'])
    df_ML2 = df_ML2.append({'Method': str(type(cls).__name__)+'-NoTrees='+str(cls.get_params()['n_estimators']),
                            'ACC': float(ACC),
                            #'AUROC': float(AUROC),
                            'precision': float(precision),
                            'recall': float(recall),
                            'f1-score': float(f1score)}, ignore_index=True)



**********************************
 RandomForestClassifier(class_weight={'Android': 14.665116279069768,
                                     'BSD': 2.649950973525704,
                                     'Linux': 0.2654263826921458,
                                     'Solaris': 7.6808769792935445,
                                     'Windows': 0.5636396138720057,
                                     'iOS': 2.195682451253482,
                                     'macOS': 2.3413366336633663},
                       n_estimators=10, n_jobs=-1, random_state=2021)

----> training: 0.03 mins 



 ACC= 0.959185985048705 precision= 0.9613321153073082 recall= 0.959185985048705 f1score= 0.9585854966813839

**********************************
 RandomForestClassifier(class_weight={'Android': 14.665116279069768,
                                     'BSD': 2.649950973525704,
                                     'Linux': 0.2654263826921458,
                                     'Solaris': 7.6808769

In [50]:
df_ML2

Unnamed: 0,Method,ACC,precision,recall,f1-score
0,RandomForestClassifier-NoTrees=10,0.959186,0.961332,0.959186,0.958585
1,RandomForestClassifier-NoTrees=20,0.959413,0.961778,0.959413,0.95849
2,RandomForestClassifier-NoTrees=50,0.960961,0.964005,0.960961,0.959371
3,RandomForestClassifier-NoTrees=100,0.960545,0.963604,0.960545,0.958437
4,RandomForestClassifier-NoTrees=200,0.960168,0.962912,0.960168,0.957765
5,RandomForestClassifier-NoTrees=300,0.960243,0.963075,0.960243,0.95784
6,RandomForestClassifier-NoTrees=500,0.960394,0.963364,0.960394,0.957887


In [51]:
df_ML2.to_csv('ML_results_best1.csv', index=False)

In [46]:
# list(cls.classes_)

### Grid search - search for the best params

In [30]:
paramsx = {'bootstrap': [True, False],
           'max_depth': [10, 20, 30, 40, 50, None],
           'max_features': ['auto', 'sqrt'],
           'min_samples_leaf': [1, 2, 4],
           'min_samples_split': [2, 5, 10],
           'n_estimators': [50]
          }

In [31]:
forest= RandomForestClassifier(random_state=seed,class_weight=class_weights)

In [32]:
gridF = GridSearchCV(forest, paramsx, cv = 3, verbose = 2, n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [33]:
bestF.best_params_ # params of the best model

{'bootstrap': False,
 'max_depth': 20,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 50}

In [34]:
from sklearn import metrics

def evaluate(model, test_features, test_labels):
    y_pred = model.predict(test_features)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print (accuracy)
    print(confusion_matrix(y_test,y_pred))

In [35]:
best_grid = bestF.best_estimator_ # the best model from grid search

evaluate(best_grid,X_test,y_test)

0.9609605074378917
[[  248     0    10     0     0     0     0]
 [    0  1428     0     0     0     0     0]
 [  255     0 14000     0     0     0     0]
 [    0     0     0   493     0     0     0]
 [    0     0     0     0  6713     0     0]
 [    0     2     0     0     0  1632    89]
 [    0     0     1     0     0   677   938]]


Classes= {'Android': 14.665116279069768, 'BSD': 2.649950973525704, 'Linux': 0.2654263826921458, 'Solaris': 7.6808769792935445, 'Windows': 0.5636396138720057, 'iOS': 2.195682451253482, 'macOS': 2.3413366336633663}

Linux      142548
Windows     67128
iOS         17232
macOS       16160
BSD         14278
Solaris      4926
Android      2580

### Feature importance

In [None]:
# calculate ACC
y_pred=clf.predict(X_test)
print(list(clf.classes_))
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

feature_imp = pd.Series(clf.feature_importances_,index=df.columns[:-1]).sort_values(ascending=False)
feature_imp[:30]