**Install Required Packages**

In [1]:
pip install pandas numpy scikit-learn xgboost skl2onnx onnx onnxmltools

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


Restart kernal to use updated packages

***Insert code to restart kernal to use updated packages***

In [2]:
#restart session
exit()

In [1]:
import pandas as pd  # For DataFrame operations (pd.read_csv, pd.cut, pd.merge, etc.)
import numpy as np  # For numerical operations and handling NaN (implied by fillna and array operations)
from sklearn.model_selection import train_test_split  # For splitting data into training and validation sets
from sklearn.model_selection import GridSearchCV  # For hyperparameter tuning with grid search
from sklearn.metrics import f1_score, make_scorer, accuracy_score # For F1 score and custom scoring in GridSearchCV
import xgboost as xgb  # For XGBoost classifier and DMatrix operations
from onnxmltools.convert import convert_xgboost # For converting XGBoost model to ONNX format
from skl2onnx.common.data_types import FloatTensorType  # For defining ONNX input types
import onnx  # For saving the ONNX model

**Import URL Phishing Data**

In [2]:
# this cell is not executed from MLTK and should only be used for staging data into the notebook environment
def stage(name):
    with open("data/"+name+".csv", 'r') as f:
        df = pd.read_csv(f)
    return df

In [3]:
# THIS CELL IS NOT EXPORTED - free notebook cell for testing purposes
df = stage("train")
print(df.head())
print(df.shape)

       FILENAME                                                URL  URLLength  \
0  mw205674.txt                        http://www.danangluxury.com         26   
1    712147.txt                    https://www.leedstownhall.co.uk         30   
2    806061.txt                      https://www.mexicancafe.co.nz         28   
3    164934.txt                         https://www.usglassmag.com         25   
4   8131216.txt  https://cloudflare-ipfs.com/ipfs/bafybeicivf4l...         93   

                    Domain  DomainLength  IsDomainIP  TLD  URLSimilarityIndex  \
0     www.danangluxury.com          20.0         0.0  com           75.000000   
1  www.leedstownhall.co.uk          23.0         0.0   uk          100.000000   
2    www.mexicancafe.co.nz          21.0         0.0   nz          100.000000   
3       www.usglassmag.com          18.0         0.0  com          100.000000   
4      cloudflare-ipfs.com          19.0         0.0  com           23.030879   

   CharContinuationRate  T

**Data Preparation of Training and Test Dataset**

In [4]:
#Create bins for URL length
labels=['0-25', '25-30', '30-35', '35-40', '40-45', '45-50', '50-100']
bins=[0,25,30,35,40,45,50,100]
df['URL_LengthRange']=pd.cut(df['URLLength'], bins=bins, labels=labels, right=True)
range_count=df.groupby('URL_LengthRange').apply(lambda x:(x['label']==0).mean())*100

#Drop isHTTP to avoid overfitting
df=df.drop(columns=['IsHTTPS'])

#Create feature buckets for the TLD URL Phishing percentage. 4 buckets of 25% each
#Count of URLs with each TLD and number of URLs flagged as Phishing
tld_dist=df.groupby('TLD')['label'].agg(['count','sum'])

#Perc of URLs that are phishing
tld_dist['phishing_perc']=tld_dist['sum']/tld_dist['count']*100
def categorize(phish_perc):
  if phish_perc<=0:
    return 1
  elif phish_perc<=25:
    return 2
  elif phish_perc<=50:
    return 3
  elif phish_perc<=75:
    return 4
  elif phish_perc<100:
    return 5
  else:
    return 6

#Apply function to add phish percent number to tld df and add that to the main df.
#Also remove 'TLD' as no longer needed with new feature
tld_dist['percent group']=tld_dist['phishing_perc'].apply(categorize)

#left outer join (original df) to preseve order
df=df.merge(tld_dist[['percent group']], on='TLD', how='left')
df=df.drop(columns=['TLD'])

#Now we need to assure that any features added to our training set are also reflected in our test set

#Import test data
#df_test=pd.read_csv('test.csv')
df_test = stage("test")

#repeat feature engineering for test set
#df_test['URL_LengthRange']=pd.cut(df_test['URLLength'], bins=bins, labels=labels, right=True)
df_test=df_test.drop(columns=['IsHTTPS'])

#Apply function to add TLD percent group to test dataframe and remove 'TLD' column
df_test=df_test.merge(tld_dist[['percent group']], on='TLD', how='left')
df_test=df_test.drop(columns=['TLD'])

#Features associated with characters in the URL
df['ObfuscationRatio']=df['ObfuscationRatio']*100
df['LetterRatioInURL']=df['LetterRatioInURL']*100
df['DigitRatioInURL']=df['DegitRatioInURL']*100
df['SpecialCharRatioInURL']=df['SpacialCharRatioInURL']*100

#digit and special char fields were spelled wrong so I made a new field for them
df=df.drop(columns=['DegitRatioInURL','SpacialCharRatioInURL'])

#Do the same fix for Test set to avoid issues
df_test['SpecialCharRatioInURL']=df_test['SpacialCharRatioInURL']
df_test['DigitRatioInURL']=df_test['DegitRatioInURL']
df_test=df_test.drop(columns=['DegitRatioInURL','SpacialCharRatioInURL'])

#drop columns with potential leakage
X_raw=df.drop(columns=['label','URL','Domain','Title','FILENAME', 'URL_LengthRange'])

#y labels
y=df['label']

#reserve original test df and remove leakage from test df
df_test_OG=df_test.copy()

#drop columns from test to avoid leakage
df_test=df_test.drop(columns=['URL','Domain','Title','FILENAME'])
columns_wNANs=X_raw.columns[X_raw.isna().any()].tolist()

#fill null columns with nan
X_raw=X_raw.fillna(0)
df_test=df_test.fillna(0)
X=X_raw

  range_count=df.groupby('URL_LengthRange').apply(lambda x:(x['label']==0).mean())*100
  range_count=df.groupby('URL_LengthRange').apply(lambda x:(x['label']==0).mean())*100


All of the cells above up until this point represent the following:


*   Importing of the data
*   Importing the necessary libraries and functions
*   Data Analysis
*   Feature Engineering

Data Analysis: Taught us that the length of the URL was extremely indicative of whether the URL was safe or not. We also found that all URLs with HTTP protocal was 100% classified as phishing. We chose to drop this field to avoid overfitting.

Feature Engineering: We created a feature to present the likelihood a URLs TLD (.com, .org, etc.) was phishing or not. This allowed each data point to be placed in a phishing percent bucket based on their TLD. We know that characters within a URL have a strong correlation to it's safety, so we scaled our URL character features (those obfuscated, digits, letter and special characters). We did that by mutliplying them by 100 and making them more impactful and prominent in the algorithm. We dropped characters with potential data leakage and null values and we made sure to mimic all feature engineering done to the training set also on our test set for later use.

**Create Cross Validation Set**

We split our training data into a training and a cross validation set. This will help us evaluate the model and parameters during the training set and also help prevent overfitting.

In [5]:
X_train, X_val, y_train, y_val= train_test_split(X, y, test_size=.2, random_state=14)
X_train.head()
#reorder columns of test to ensure they match order or X_train
df_test=df_test[X_train.columns]

**Grid Search for Optimal Parameters**

We are going to search for our model's most optimal parameters via a cross validated grid search over our specified grid.

Some notes on our parameters:
* loss: Default='log_loss' logistic regression good for classification
* learning_rate: shrinks contribution of each tree by rate designated
* n_estimators: Number of boosting stages to perform. Gradient boosting tends to be more. As mentioned this algoritgm is quire resilient to overfitting so large number of stages may lead to better performance.
* min_samples_split: Minimum number of samples required to split an internal node
* max_depth: Max depth of individual regression estimators which limits the number of nodes in a single tree. Value of this is dependent on number of input variables. If none, than all nodes expanded until all leaves are pure.
* subsample: Fraction of samples to be used for fitting the individual base learners.
* For scoring we will use F1 as a measure of our models predictive power. F1 being a harmonic mean between accurate positive predicted cases out of all positive predicted cases (Precision) and the positive predicted cases out of all positive cases
* CV: Number of folds used for cross validation
* n-jobs: Number of jobs to run in parallel. 1 for none and -1 for all processors

In [6]:
#Testing various parameters grids to assure the model is not overfitting
param_grid={'n_estimators':[5, 30, 50], 'max_depth':[1, 5, 12], 'learning_rate': [0.03, 0.02, 0.05]}

# Fill NaN with the most frequent value in y_train
y_train = y_train.fillna(y_train.mode()[0])

grid_search=GridSearchCV(estimator=xgb.XGBClassifier(eval_metric='logloss', enable_categorical=True),
                         param_grid=param_grid,
                         scoring= make_scorer(f1_score),
                         cv=5,
                         n_jobs=-1,
                         verbose=3,
                         error_score="raise"
                         )

#Start grid search
grid_search.fit(X_train, y_train)
#get most optimal model params
best_params=grid_search.best_estimator_

print(f"Optimal parameters for model: {grid_search.best_params_} ")
print(f"Best CV F1 score on training set: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Optimal parameters for model: {'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 30} 
Best CV F1 score on training set: 0.9999


**Validate Optimal Parameters**

Next we will assess how our validation set performs using our optimal parameters

In [7]:
y_val_predictions=best_params.predict(X_val)
val_score=f1_score(y_val, y_val_predictions)

print(f"f1 score for Validation set leveraging best model found: {val_score:.4f}")

f1 score for Validation set leveraging best model found: 0.9999


As you can see the model performed very well so if we are comfortable with it we can now test the model and parameters on our real world data in Splunk.



**Train XGBoost Model**

In order to convert the model to XGboost we need it to be in a specific format.

We will leverage the best parameters we found earlier and we will set the number of rounds to an arbitrary 100. We need to convert our dataframe to a Dmatrix. We can then train XGboost and then convert the model to ONNX format and save

In [8]:
# Rename columns in X_train and X_val (need f%d format for convert xgboost)
X_train.columns = [f'f{i}' for i in range(X_train.shape[1])]
X_val.columns = [f'f{i}' for i in range(X_val.shape[1])]
#set paramters found earlier and number of rounds
param=grid_search.best_params_
num_rounds=100

#convert df to Dmatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_val, label=y_val)

#train XGboost
bst=xgb.train(param, dtrain, num_rounds)

#Make predictions
preds=bst.predict(dtest)
predictions=[round(value) for value in preds]

#Calculate Accuracy
accuracy=accuracy_score(y_val, predictions)
print("Accuracy on val set: %.2f%%" % (accuracy*100.0))

Parameters: { "n_estimators" } are not used.



Accuracy on val set: 99.97%


**Convert XGBoost Model to ONNX**

In [9]:
#Covert to XGBoost model to ONNX
initial_types = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
onx = convert_xgboost(bst, initial_types=initial_types, target_opset=12)

# Save the ONNX model
#import onnx #Adding this line imports the onnx module
onnx.save(onx, 'data/xgboost_model.onnx')

[CV 1/5] END learning_rate=0.03, max_depth=1, n_estimators=5;, score=0.737 total time=   0.5s
[CV 5/5] END learning_rate=0.03, max_depth=1, n_estimators=30;, score=0.997 total time=   0.6s
[CV 3/5] END learning_rate=0.03, max_depth=5, n_estimators=5;, score=0.737 total time=   0.5s
[CV 1/5] END learning_rate=0.03, max_depth=5, n_estimators=50;, score=1.000 total time=   0.8s
[CV 4/5] END learning_rate=0.03, max_depth=12, n_estimators=5;, score=0.737 total time=   0.5s
[CV 2/5] END learning_rate=0.03, max_depth=12, n_estimators=50;, score=1.000 total time=   1.3s
[CV 3/5] END learning_rate=0.02, max_depth=1, n_estimators=30;, score=0.996 total time=   0.9s
[CV 1/5] END learning_rate=0.02, max_depth=5, n_estimators=5;, score=0.737 total time=   0.6s
[CV 2/5] END learning_rate=0.02, max_depth=5, n_estimators=30;, score=1.000 total time=   0.9s
[CV 5/5] END learning_rate=0.02, max_depth=5, n_estimators=50;, score=1.000 total time=   1.0s
[CV 3/5] END learning_rate=0.02, max_depth=12, n_est