#  Fraud Detection use XGBoost 

This notebook will demonstate using cuDF for ETL/data cleaning and XGBoost for training a fraud predection model.  
The processing will not use an additional graph processing or GNNs

In [1]:
# requiered imports
import cudf
import cuml
import xgboost as xgb
import math

  from pandas import MultiIndex, Int64Index


### Data Loading

In [2]:
# base directoty
base_dir = "./elliptic_bitcoin_dataset/"

In [3]:
# read the data files
df_features = cudf.read_csv(base_dir + 'elliptic_txs_features.csv', header=None)
df_classes  = cudf.read_csv(base_dir + "elliptic_txs_classes.csv")

# we do not need the edge dataset for this workflow
# df_edges    = cudf.read_csv(base_dir + "elliptic_txs_edgelist.csv")

### Let's look at the Class dataset

In [4]:
df_classes.head(5)

Unnamed: 0,txId,class
0,230425980,unknown
1,5530458,unknown
2,232022460,unknown
3,232438397,2
4,230460314,unknown


The documentation list class values as:</br>
1 = illicit</br>
2 = licit</br>
unknown = unknown </br>
</br>
It would be nice if the "class" column as an integer value and not string, so let's convert "unlnown" to 0


In [5]:
# replace the value and set the type to int32
df_classes['class'] = df_classes['class'].replace("unknown", "0").astype('int32')

In [6]:
df_classes.head(5)

Unnamed: 0,txId,class
0,230425980,0
1,5530458,0
2,232022460,0
3,232438397,2
4,230460314,0


### merge the classes into the feature dataset
but we might need to adjust the dataframe some

In [7]:
df_features.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [8]:
# change the column 0 name to be txId to match the classes dataframe
df_features.rename(columns={'0' : 'txId'}, inplace=True)

In [9]:
# drop "aggregated features" 
df_features = df_features.iloc[:, 0:95]

In [10]:
df_features.head(2)

Unnamed: 0,txId,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.255111,-0.259194,1.12559,1.128038,-0.293773,-0.159732,0.034039,-0.183816,1.135523,1.135279
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,-0.255168,-0.259251,-0.187191,-0.185274,-0.293692,-0.7607,-0.692777,-0.719789,-1.084907,-1.084845


In [11]:
# merging dataframes
df_merge = df_features.merge(df_classes, how='left', on='txId')

### Pull out the labeled data into groups for training, validating, and testing

In [12]:
classified   = df_merge.loc[df_merge['class'] != 0]
unclassified = df_merge.loc[df_merge['class'] == 0]

In [13]:
classified.head(5)

Unnamed: 0,txId,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,class
6,8988238,6,-0.170733,-0.184668,-1.201369,0.103143,-0.063725,0.138585,-0.061584,-0.163342,...,0.125159,-0.187191,-0.185274,-0.156403,-0.711518,-0.694235,-0.630983,0.025308,0.025217,2
18,230424162,1,-0.169956,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.160548,...,1.354381,1.12559,1.128038,3.114411,2.700735,1.980603,3.483066,1.135523,1.135279,2
19,232354200,1,-0.172157,-0.184479,1.018602,-0.046932,-0.063725,-0.02914,-0.061584,-0.163645,...,-0.255907,-0.187191,-0.185274,0.335654,-0.530923,-0.694235,-0.30884,0.025308,0.025217,2
22,230473471,1,-0.171958,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162597,...,-0.256506,1.12559,1.128038,-0.293649,3.179307,4.072051,2.794295,1.135523,1.135279,2
23,234472313,1,-0.171507,-0.184668,-1.201369,0.028105,-0.024025,0.054722,-0.061584,-0.163638,...,-0.259251,-0.187191,-0.185274,-0.292015,-0.730377,-0.66421,-0.70077,-0.007457,-0.529814,2


In [14]:
# reset the index 
classified.reset_index(inplace=True, drop=True)

In [15]:
classified.head(5)

Unnamed: 0,txId,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,class
0,8988238,6,-0.170733,-0.184668,-1.201369,0.103143,-0.063725,0.138585,-0.061584,-0.163342,...,0.125159,-0.187191,-0.185274,-0.156403,-0.711518,-0.694235,-0.630983,0.025308,0.025217,2
1,230424162,1,-0.169956,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.160548,...,1.354381,1.12559,1.128038,3.114411,2.700735,1.980603,3.483066,1.135523,1.135279,2
2,232354200,1,-0.172157,-0.184479,1.018602,-0.046932,-0.063725,-0.02914,-0.061584,-0.163645,...,-0.255907,-0.187191,-0.185274,0.335654,-0.530923,-0.694235,-0.30884,0.025308,0.025217,2
3,230473471,1,-0.171958,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162597,...,-0.256506,1.12559,1.128038,-0.293649,3.179307,4.072051,2.794295,1.135523,1.135279,2
4,234472313,1,-0.171507,-0.184668,-1.201369,0.028105,-0.024025,0.054722,-0.061584,-0.163638,...,-0.259251,-0.187191,-0.185274,-0.292015,-0.730377,-0.66421,-0.70077,-0.007457,-0.529814,2


### Split data into training and validation sets
cuML has a nice function for doing this

In [16]:
X_train, X_test = cuml.model_selection.train_test_split(classified, test_size=0.3, random_state=0)

In [17]:
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)

In [18]:
X_train['class'].value_counts()

2    29405
1     3190
Name: class, dtype: int32

In [19]:
X_test['class'].value_counts()

2    12614
1     1355
Name: class, dtype: int32

In [20]:
# Pull out the class column and then drop from th etraining set
Y_train = X_train[['class']]
X_train.drop(columns=['class'], inplace=True)

In [21]:
Y_test = X_test[['class']]
X_test.drop(columns=['class'], inplace=True)

### Use XGBoost

In [22]:
# Create a DMatrix
dtrain = xgb.DMatrix(X_train, Y_train)

In [23]:
# Train XGBoost
params = {
    'learning_rate'  : 0.3,
    'max_depth'      : 8,
    'objective'      : 'reg:squarederror',
    'subsample'      : 0.6,
    'gamma'          : 1,
    'silent'         : True,
    'verbose_eval'   : True,
    'tree_method'    :'gpu_hist'
}


In [24]:
trained_model = xgb.train(params, dtrain)

Parameters: { "silent", "verbose_eval" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [25]:
# test
dtest = xgb.DMatrix(X_test, Y_test)

In [26]:
Y_test['prediction'] = trained_model.predict(dtest)

In [27]:
Y_test['squared_error'] = (Y_test['prediction'] - Y_test['class'])**2

In [28]:
Y_test.head()

Unnamed: 0,class,prediction,squared_error
0,2,1.954282,0.00209
1,2,1.950591,0.002441
2,2,1.912535,0.00765
3,2,1.933244,0.004456
4,2,1.955077,0.002018


In [29]:
Y_test[Y_test['class'] == 1].head()

Unnamed: 0,class,prediction,squared_error
10,1,1.861766,0.74264
19,1,1.113765,0.012942
27,1,1.217391,0.047259
39,1,0.995994,1.6e-05
51,1,0.984595,0.000237


In [30]:
# compute the actual RMSE over the full test set
RMSE = Y_test['squared_error'].mean()
math.sqrt(RMSE)

0.12961256075749394

In [31]:
Y_test[Y_test['prediction'] > 1.5]['class'].value_counts()

2    12595
1      229
Name: class, dtype: int32