#  Fraud Detection use XGBoost 

This notebook will demonstate using cuDF for ETL/data cleaning and XGBoost for training a fraud predection model.  
The processing will not use an additional graph processing or GNNs

In [1]:
# requiered imports
import cudf
import cuml
import xgboost as xgb
import math

  from pandas import MultiIndex, Int64Index


### Data Loading

In [2]:
# base directoty
base_dir = "./elliptic_bitcoin_dataset/"

In [3]:
# read the data files
df_features = cudf.read_csv(base_dir + 'elliptic_txs_features.csv', header=None)
df_classes  = cudf.read_csv(base_dir + "elliptic_txs_classes.csv")

# we do not need the edge dataset for this workflow
# df_edges    = cudf.read_csv(base_dir + "elliptic_txs_edgelist.csv")

### Let's look at the Class dataset

In [4]:
df_classes.head(5)

Unnamed: 0,txId,class
0,230425980,unknown
1,5530458,unknown
2,232022460,unknown
3,232438397,2
4,230460314,unknown


The documentation list class values as:</br>
1 = illicit</br>
2 = licit</br>
unknown = unknown </br>
</br>
It would be nice if the "class" column as an integer value and not string, so let's convert "unlnown" to 0


In [5]:
# replace the value and set the type to int32
df_classes['class'] = df_classes['class'].replace("unknown", "0").astype('int32')

In [6]:
df_classes.head(5)

Unnamed: 0,txId,class
0,230425980,0
1,5530458,0
2,232022460,0
3,232438397,2
4,230460314,0


### merge the classes into the feature dataset
but we might need to adjust the dataframe some

In [7]:
df_features.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [8]:
# change the column 0 name to be txId to match the classes dataframe
df_features.rename(columns={'0' : 'txId'}, inplace=True)

In [9]:
# drop "aggregated features" 
df_features = df_features.iloc[:, 0:95]

In [10]:
df_features.head(2)

Unnamed: 0,96,97,98,99,100,101,102,103,104,105,...,157,158,159,160,161,162,163,164,165,166
0,-0.201584,-0.116817,-0.191472,-0.014659,-0.018849,-1.457953,-1.494057,-0.083459,-1.485972,-0.088798,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,-0.202332,-0.116817,-0.192405,-0.014659,-0.018849,-1.457921,-1.494024,-0.083459,-1.485939,-0.088798,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792


In [11]:
# merging dataframes
df_merge = df_features.merge(df_classes, how='left', on='txId')

KeyError: 'txId'

### Pull out the labeled data into groups for training, validating, and testing

In [None]:
classified   = df_merge.loc[df_merge['class'] != 0]
unclassified = df_merge.loc[df_merge['class'] == 0]

In [None]:
classified.head(5)

In [None]:
# reset the index 
classified.reset_index(inplace=True, drop=True)

In [None]:
classified.head(5)

### Split data into training and validation sets
cuML has a nice function for doing this

In [None]:
X_train, X_test = cuml.model_selection.train_test_split(classified, test_size=0.3, random_state=0)

In [None]:
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)

In [None]:
X_train['class'].value_counts()

In [None]:
X_test['class'].value_counts()

In [None]:
# Pull out the class column and then drop from th etraining set
Y_train = X_train[['class']]
X_train.drop(columns=['class'], inplace=True)

In [None]:
Y_test = X_test[['class']]
X_test.drop(columns=['class'], inplace=True)

### Use XGBoost

In [None]:
# Create a DMatrix
dtrain = xgb.DMatrix(X_train, Y_train)

In [None]:
# Train XGBoost
params = {
    'learning_rate'  : 0.3,
    'max_depth'      : 8,
    'objective'      : 'reg:squarederror',
    'subsample'      : 0.6,
    'gamma'          : 1,
    'silent'         : True,
    'verbose_eval'   : True,
    'tree_method'    :'gpu_hist'
}


In [None]:
trained_model = xgb.train(params, dtrain)

In [None]:
# test
dtest = xgb.DMatrix(X_test, Y_test)

In [None]:
Y_test['prediction'] = trained_model.predict(dtest)

In [None]:
Y_test['squared_error'] = (Y_test['prediction'] - Y_test['class'])**2

In [None]:
Y_test.head()

In [None]:
Y_test[Y_test['class'] == 1].head()

In [None]:
# compute the actual RMSE over the full test set
RMSE = Y_test['squared_error'].mean()
math.sqrt(RMSE)

In [None]:
Y_test[Y_test['prediction'] > 1.5]['class'].value_counts()