# XGBoost + RAPIDS model training

This notebook is a simple example of how to train an XGBoost classification model with RAPIDS, so that the model can be saved and later used for FIL (Forest Inference Library) inference.
We will be using this labeled [dataset](https://www.stratosphereips.org/datasets-iot23) of malicious and benign IoT network traffic from Stratosphere Labs.

## Imports

In [1]:
import xgboost as xgb
import cudf
from cuml.preprocessing.model_selection import train_test_split

import s3fs
from os import path

## Data Download

In [2]:
# Name of the sample IoT traffic file (JSON) and the S3 prefix it is stored under
IOT_MALWARE_JSON = "iot_malware_1_1.json"
S3_BASE_PATH = "rapidsai-data/cyber/clx"

# Fetch the file with an anonymous S3 connection only when there is no local
# copy, so re-running this cell does not download it again
if not path.exists(IOT_MALWARE_JSON):
    remote_path = f"{S3_BASE_PATH}/{IOT_MALWARE_JSON}"
    fs = s3fs.S3FileSystem(anon=True)
    fs.get(remote_path, IOT_MALWARE_JSON)
    

In [3]:
# Column name -> dtype mapping handed to the JSON reader in the next cell
data_types = {
    "ts": "float64",
    "uid": "str",
    "id.orig_h": "str",
    "id.orig_p": "int64",
    "id.resp_h": "str",
    "id.resp_p": "int64",
    "proto": "str",
    "service": "str",
    "duration": "str",
    "orig_bytes": "str",
    "resp_bytes": "str",
    "conn_state": "str",
    "local_orig": "str",
    "local_resp": "str",
    "missed_bytes": "int64",
    "history": "str",
    "orig_pkts": "int64",
    "orig_ip_bytes": "int64",
    "resp_pkts": "int64",
    "resp_ip_bytes": "int64",
    "tunnel_parents": "str",
    "label": "str",
    "detailed-label": "str",
}

In [4]:
# Read the line-delimited JSON file into a cuDF DataFrame, using the
# column dtypes declared in `data_types`
df = cudf.io.json.read_json(
    IOT_MALWARE_JSON,
    lines=True,
    orient='columns',
    dtype=data_types,
)

In [5]:
# Preview the first rows of the loaded frame as a quick sanity check
df.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1525898000.0,CIsut41qu1NHSzSuu1,192.168.100.103,43763,96.71.155.35,37917,udp,-,,,...,-,0,D,1,40,0,0,(empty),Benign,-
1,1525898000.0,CdN5fg3EHVGF295pKe,192.168.100.103,40392,180.178.132.131,2323,tcp,-,2.998791,0.0,...,-,0,S,3,180,0,0,(empty),Malicious,PartOfAHorizontalPortScan
2,1525898000.0,CpGDCZ3rP0cggJIRIc,192.168.100.103,43763,56.245.21.79,20678,udp,-,,,...,-,0,D,1,40,0,0,(empty),Benign,-
3,1525898000.0,CUdjLI3MRrS8hiutEd,192.168.100.103,51177,163.35.213.20,23,tcp,-,2.998538,0.0,...,-,0,S,3,180,0,0,(empty),Malicious,PartOfAHorizontalPortScan
4,1525898000.0,CnvQbA2sSGxXX2MPqg,192.168.100.103,41327,102.72.255.101,23,tcp,-,,,...,-,0,S,1,60,0,0,(empty),Malicious,PartOfAHorizontalPortScan


## Convert categories to integers

In [6]:
# Replace the string labels with integer codes; `codes` holds the original
# label values so the integers can be mapped back to names
encoded_labels, codes = df['label'].factorize()
df['label'] = encoded_labels

In [7]:
# Show the label values returned by factorize(); each value's position is its integer code
codes

0       Benign
1    Malicious
Name: label, dtype: object

## Split training and testing data

In [8]:
# Columns used as model features: the four packet / IP-byte count columns
FEATURE_COLUMNS = ["orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes"]

# Fixed seed so the shuffled split (and therefore the reported AUC values)
# is reproducible on Restart Kernel -> Run All
SPLIT_SEED = 42

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURE_COLUMNS],
    df['label'],
    train_size=0.8,
    random_state=SPLIT_SEED,
)

## Move to DMatrix

In [9]:
# Wrap each split in an XGBoost DMatrix with its integer labels attached;
# the held-out test split is used as the validation set during training
dmatrix_train = xgb.DMatrix(data=X_train, label=y_train)
dmatrix_validation = xgb.DMatrix(data=X_test, label=y_test)

## Set Parameters

In [10]:
# Booster and learning-task parameters passed to xgb.train
params = {
    'tree_method': 'gpu_hist',        # GPU histogram tree construction
    'eval_metric': 'auc',             # metric reported for each evaluation set
    'objective': 'binary:logistic',   # binary classification objective
    'max_depth': 6,
    'learning_rate': 0.1,
}

## Train Model

In [11]:
# Number of boosting rounds to run
num_round = 10

# Datasets scored after every round, each paired with the name that
# prefixes its metric in the training log
evallist = [
    (dmatrix_validation, 'validation'),
    (dmatrix_train, 'train'),
]

In [12]:
# Train the booster on the training DMatrix for `num_round` rounds,
# logging the metric for every dataset in `evallist` after each round
bst = xgb.train(
    params,
    dmatrix_train,
    num_boost_round=num_round,
    evals=evallist,
)

[0]	validation-auc:0.95333	train-auc:0.95472
[1]	validation-auc:0.95333	train-auc:0.95472
[2]	validation-auc:0.95333	train-auc:0.95472
[3]	validation-auc:0.95333	train-auc:0.95472
[4]	validation-auc:0.95333	train-auc:0.95472
[5]	validation-auc:0.95336	train-auc:0.95473
[6]	validation-auc:0.95336	train-auc:0.95473
[7]	validation-auc:0.95336	train-auc:0.95473
[8]	validation-auc:0.95336	train-auc:0.95473
[9]	validation-auc:0.95336	train-auc:0.95473


## Save model

In [13]:
# Write the trained booster to disk so it can be loaded later for inference
MODEL_PATH = "iot_xgboost_model.bst"
bst.save_model(MODEL_PATH)