#  Fraud Detection use XGBoost 

This notebook will demonstate using cuDF for ETL/data cleaning and XGBoost for training a fraud predection model.  
The processing will not use an additional graph processing or GNNs

In [1]:
# requiered imports
import cudf
import cuml
import xgboost as xgb
import math

  from pandas import MultiIndex, Int64Index


### Data Loading

In [2]:
# base directoty
base_dir = "./elliptic_bitcoin_dataset/"

In [3]:
# read the data files
df_features = cudf.read_csv(base_dir + 'elliptic_txs_features.csv', header=None)
df_classes  = cudf.read_csv(base_dir + "elliptic_txs_classes.csv")

# we do not need the edge dataset for this workflow
# df_edges    = cudf.read_csv(base_dir + "elliptic_txs_edgelist.csv")

### Let's look at the Class dataset

In [4]:
df_classes.head(5)

Unnamed: 0,txId,class
0,230425980,unknown
1,5530458,unknown
2,232022460,unknown
3,232438397,2
4,230460314,unknown


The documentation list class values as:</br>
1 = illicit</br>
2 = licit</br>
unknown = unknown </br>
</br>
It would be nice if the "class" column as an integer value and not string, so let's convert "unlnown" to 0


In [5]:
# replace the value and set the type to int32
df_classes['class'] = df_classes['class'].replace("unknown", "0").astype('int32')

In [6]:
df_classes.head(5)

Unnamed: 0,txId,class
0,230425980,0
1,5530458,0
2,232022460,0
3,232438397,2
4,230460314,0


### merge the classes into the feature dataset
but we might need to adjust the dataframe some

In [7]:
df_features.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [8]:
# change the column 0 name to be txId to match the classes dataframe
df_features.rename(columns={'0' : 'txId'}, inplace=True)

In [9]:
df_features.head(2)

Unnamed: 0,txId,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792


In [10]:
# merging dataframes
df_merge = df_features.merge(df_classes, how='left', on='txId')

### Pull out the labeled data into groups for training, validating, and testing

In [11]:
classified   = df_merge.loc[df_merge['class'] != 0]
unclassified = df_merge.loc[df_merge['class'] == 0]

In [12]:
classified.head(5)

Unnamed: 0,txId,1,2,3,4,5,6,7,8,9,...,158,159,160,161,162,163,164,165,166,class
17,208590808,1,-0.171498,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162126,...,2.067047,-0.979074,-0.978556,-0.098889,-0.08749,-0.084674,-0.140597,1.5197,1.521399,2
18,232437488,1,-0.045996,-0.183052,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,-0.033719,...,3.177156,-0.979074,-0.978556,-0.098889,-0.08749,-0.084674,-0.140597,1.5197,1.521399,2
26,230702954,1,-0.172683,-0.184668,-1.201369,-0.046932,-0.024025,-0.02914,-0.061584,-0.163634,...,-0.588384,-0.979074,-0.978556,-0.098889,-0.068266,-0.065421,-0.097524,0.699543,0.700804,2
30,230518236,1,-0.171233,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.161855,...,-0.575769,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2
34,230424162,1,-0.169956,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.160548,...,-0.600999,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2


In [13]:
# reset the index 
classified.reset_index(inplace=True, drop=True)

In [14]:
classified.head(5)

Unnamed: 0,txId,1,2,3,4,5,6,7,8,9,...,158,159,160,161,162,163,164,165,166,class
0,208590808,1,-0.171498,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162126,...,2.067047,-0.979074,-0.978556,-0.098889,-0.08749,-0.084674,-0.140597,1.5197,1.521399,2
1,232437488,1,-0.045996,-0.183052,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,-0.033719,...,3.177156,-0.979074,-0.978556,-0.098889,-0.08749,-0.084674,-0.140597,1.5197,1.521399,2
2,230702954,1,-0.172683,-0.184668,-1.201369,-0.046932,-0.024025,-0.02914,-0.061584,-0.163634,...,-0.588384,-0.979074,-0.978556,-0.098889,-0.068266,-0.065421,-0.097524,0.699543,0.700804,2
3,230518236,1,-0.171233,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.161855,...,-0.575769,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2
4,230424162,1,-0.169956,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.160548,...,-0.600999,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2


### Split data into training and validation sets
cuML has a nice function for doing this

In [15]:
X_train, X_test = cuml.model_selection.train_test_split(classified, test_size=0.3, random_state=0)

In [16]:
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)

In [17]:
X_train['class'].value_counts()

2    29424
1     3171
Name: class, dtype: int32

In [18]:
X_test['class'].value_counts()

2    12595
1     1374
Name: class, dtype: int32

In [19]:
# Pull out the class column and then drop from th etraining set
Y_train = X_train[['class']]
X_train.drop(columns=['class'], inplace=True)

In [20]:
Y_test = X_test[['class']]
X_test.drop(columns=['class'], inplace=True)

### Use XGBoost

In [21]:
# Create a DMatrix
dtrain = xgb.DMatrix(X_train, Y_train)

In [22]:
# Train XGBoost
params = {
    'learning_rate'  : 0.3,
    'max_depth'      : 8,
    'objective'      : 'reg:squarederror',
    'subsample'      : 0.6,
    'gamma'          : 1,
    'silent'         : True,
    'verbose_eval'   : True,
    'tree_method'    :'gpu_hist'
}


In [23]:
trained_model = xgb.train(params, dtrain)

Parameters: { "silent", "verbose_eval" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [24]:
# test
dtest = xgb.DMatrix(X_test, Y_test)

In [25]:
Y_test['prediction'] = trained_model.predict(dtest)

In [26]:
Y_test['squared_error'] = (Y_test['prediction'] - Y_test['class'])**2

In [27]:
Y_test.head()

Unnamed: 0,class,prediction,squared_error
0,2,1.957891,0.001773
1,2,1.95633,0.001907
2,2,1.955731,0.00196
3,2,1.955731,0.00196
4,2,1.945992,0.002917


In [28]:
Y_test[Y_test['class'] == 1].head()

Unnamed: 0,class,prediction,squared_error
21,1,0.98756,0.000155
24,1,0.98756,0.000155
28,1,1.243482,0.059284
46,1,0.98756,0.000155
47,1,0.98756,0.000155


In [29]:
# compute the actual RMSE over the full test set
RMSE = Y_test['squared_error'].mean()
math.sqrt(RMSE)

0.12442908879507682

In [30]:
Y_test[Y_test['prediction'] > 1.5]['class'].value_counts()

2    12572
1      203
Name: class, dtype: int32