#  Fraud Detection use XGBoost with Graph Features

This notebook will demonstate using cuDF for ETL/data cleaning and XGBoost for training a fraud predection model.  
The processing will not use an additional graph processing or GNNs

In [1]:
# Since we are adding graph, need to import cuGraph
import cugraph

In [2]:
# requiered imports
import cudf
import cuml
import xgboost as xgb
import math

  from pandas import MultiIndex, Int64Index


## Let's only look at the Graph part for now - the rest will be similar to what we have already done

In [3]:
# base directoty
base_dir = "./elliptic_bitcoin_dataset/"

In [4]:
df_edges = cudf.read_csv(base_dir + "elliptic_txs_edgelist.csv")

In [5]:
df_edges.head(5)

Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206


In [6]:
# does the data start at 0?
(df_edges['txId1'].min(), df_edges['txId2'].min())

(1076, 1076)

The data starts at 1076 and not 0 - that means that we need to use the renumbering feature of cuGraph to create
a contiguous sequence

In [19]:
# create a directed graph - could just as easily create an undirected graph 
G = cugraph.Graph(directed=True)

In [20]:
G.from_cudf_edgelist(df_edges, source='txId1', destination='txId2', renumber=True)

In [21]:
print(f"number of nodes: {G.number_of_nodes()}")
print(f"number of edges: {G.number_of_edges()}")

number of nodes: 203769
number of edges: 234355


In [22]:
# that is a very odd graph
# let's look at degree centrality

In [23]:
degree = G.degrees()

In [24]:
degree.head(5)

Unnamed: 0,in_degree,out_degree,vertex
0,0,2,155552378
1,0,2,155554163
2,0,2,155558582
3,1,2,155560604
4,1,2,155562719


In [25]:
degree.describe()

Unnamed: 0,in_degree,out_degree,vertex
count,203769.0,203769.0,203769.0
mean,1.150101,1.150101,171131000.0
std,3.911132,1.89474,110465500.0
min,0.0,0.0,1076.0
25%,0.0,1.0,84334520.0
50%,1.0,1.0,162437500.0
75%,1.0,1.0,245479800.0
max,284.0,472.0,403244600.0


In [26]:
# this is a good indication that there are disjoint subgraphs in the data

In [27]:
comp = cugraph.weakly_connected_components(G)

In [28]:
# Use groupby on the 'labels' column of the WCC output to get the counts of each connected component with the same label
label_count = comp.groupby('labels').count()
label_count.rename(columns={"vertex": "count"}, inplace=True)

print("Total number of components found : ", "{:,}".format(len(label_count)))

Total number of components found :  49


In [29]:
# The fact that there are multiple components impact the creationg of metrics.  Many algorithm normalize results
# based on the size of data, but for good metrics the size should be that of the component.  

# let's ignore the fact thgat there are multiple components and not normalize

In [30]:
bc = cugraph.betweenness_centrality(G, k=1000, normalized=False)

In [31]:
pr   = cugraph.pagerank(G)
hits = cugraph.hits(G, normalized=False)
katz = cugraph.katz_centrality(G, normalized=False)

In [32]:
# merge all the data together

In [33]:
graph_metrics = degree.merge(pr, on="vertex")
graph_metrics = graph_metrics.merge(bc ,on="vertex")
graph_metrics = graph_metrics.merge(hits ,on="vertex")
graph_metrics = graph_metrics.merge(katz ,on="vertex")


In [34]:
# Now rename "vertex" to be "txId"
graph_metrics.rename(columns={'vertex' : 'txId'}, inplace=True)

In [35]:
graph_metrics.head(5)

Unnamed: 0,in_degree,out_degree,txId,pagerank,betweenness_centrality,hubs,authorities,katz_centrality
0,1,2,4643104,6e-06,0.0,6.2477779999999996e-89,1.715789e-97,0.002215
1,1,2,4645502,3e-06,0.0,1.949035e-24,1.715789e-97,0.002215
2,1,2,4645988,2e-06,0.0,6.2477779999999996e-89,1.569401e-60,0.002215
3,1,2,4646378,3e-06,0.0,2.366557e-24,1.715789e-97,0.002215
4,0,2,4647600,2e-06,0.0,2.211437e-26,0.0,0.00221


### The XGBoost part

### Data Loading

In [36]:
# read the data files
df_features = cudf.read_csv(base_dir + 'elliptic_txs_features.csv', header=None)
df_classes  = cudf.read_csv(base_dir + "elliptic_txs_classes.csv")

In [37]:
# replace the value and set the type to int32
df_classes['class'] = df_classes['class'].replace("unknown", "0").astype('int32')

### merge the classes into the feature dataset
but we might need to adjust the dataframe some

In [38]:
# change the column 0 name to be txId to match the classes dataframe
df_features.rename(columns={'0' : 'txId'}, inplace=True)

In [39]:
# merging dataframes
df_merge = df_features.merge(df_classes, how='left', on='txId')

In [40]:
# Now add  the graph features
df_merge = df_merge.merge(graph_metrics, how='left', on='txId')

### Pull out the labeled data into groups for training, validating, and testing

In [41]:
classified   = df_merge.loc[df_merge['class'] != 0]
unclassified = df_merge.loc[df_merge['class'] == 0]

In [42]:
classified.head(5)

Unnamed: 0,txId,1,2,3,4,5,6,7,8,9,...,165,166,class,in_degree,out_degree,pagerank,betweenness_centrality,hubs,authorities,katz_centrality
1,121298347,6,0.136187,-0.184626,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,0.152681,...,-1.760926,-1.760984,2,0,1,2e-06,0.0,4.5458559999999994e-100,0.0,0.00221
3,121655492,6,-0.172834,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.163494,...,-0.120613,-0.119792,2,1,0,4e-06,0.0,0.0,1.179081e-86,0.002215
9,9676808,6,-0.006516,-0.132897,-1.201369,-0.12197,0.015676,-0.113002,-0.061584,0.006676,...,-0.865922,-0.776269,2,1,0,2e-06,0.0,0.0,1.865015e-32,0.002215
13,8986809,6,-0.15546,-0.184668,-1.201369,0.103143,-0.063725,0.138585,-0.061584,-0.160758,...,-0.120613,-0.119792,2,1,0,6e-06,0.0,0.0,1.179081e-86,0.002215
22,8988238,6,-0.170733,-0.184668,-1.201369,0.103143,-0.063725,0.138585,-0.061584,-0.163342,...,-0.120613,-0.119792,2,3,0,7e-06,0.0,0.0,1.965266e-76,0.002224


In [43]:
# reset the index 
classified.reset_index(inplace=True, drop=True)

In [44]:
classified.head(5)

Unnamed: 0,txId,1,2,3,4,5,6,7,8,9,...,165,166,class,in_degree,out_degree,pagerank,betweenness_centrality,hubs,authorities,katz_centrality
0,121298347,6,0.136187,-0.184626,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,0.152681,...,-1.760926,-1.760984,2,0,1,2e-06,0.0,4.5458559999999994e-100,0.0,0.00221
1,121655492,6,-0.172834,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.163494,...,-0.120613,-0.119792,2,1,0,4e-06,0.0,0.0,1.179081e-86,0.002215
2,9676808,6,-0.006516,-0.132897,-1.201369,-0.12197,0.015676,-0.113002,-0.061584,0.006676,...,-0.865922,-0.776269,2,1,0,2e-06,0.0,0.0,1.865015e-32,0.002215
3,8986809,6,-0.15546,-0.184668,-1.201369,0.103143,-0.063725,0.138585,-0.061584,-0.160758,...,-0.120613,-0.119792,2,1,0,6e-06,0.0,0.0,1.179081e-86,0.002215
4,8988238,6,-0.170733,-0.184668,-1.201369,0.103143,-0.063725,0.138585,-0.061584,-0.163342,...,-0.120613,-0.119792,2,3,0,7e-06,0.0,0.0,1.965266e-76,0.002224


### Split data into training and validation sets
cuML has a nice function for doing this

In [45]:
X_train, X_test = cuml.model_selection.train_test_split(classified, test_size=0.15, random_state=0)

In [46]:
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)

In [47]:
X_train['class'].value_counts()

2    35717
1     3863
Name: class, dtype: int32

In [48]:
X_test['class'].value_counts()

2    6302
1     682
Name: class, dtype: int32

In [49]:
# Pull out the class column and then drop from th etraining set
Y_train = X_train[['class']]
X_train.drop(columns=['class'], inplace=True)

In [50]:
Y_test = X_test[['class']]
X_test.drop(columns=['class'], inplace=True)

### Use XGBoost

In [51]:
# Create a DMatrix
dtrain = xgb.DMatrix(X_train, Y_train)

In [52]:
# Train XGBoost
params = {
    'learning_rate'  : 0.3,
    'max_depth'      : 8,
    'objective'      : 'reg:squarederror',
    'subsample'      : 0.6,
    'gamma'          : 1,
    'silent'         : True,
    'verbose_eval'   : True,
    'tree_method'    :'gpu_hist'
}


In [53]:
trained_model = xgb.train(params, dtrain)

Parameters: { "silent", "verbose_eval" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [54]:
# test
dtest = xgb.DMatrix(X_test, Y_test)

In [55]:
Y_test['prediction'] = trained_model.predict(dtest)

In [56]:
Y_test['squared_error'] = (Y_test['prediction'] - Y_test['class'])**2

In [57]:
Y_test.head()

Unnamed: 0,class,prediction,squared_error
0,2,1.95498,0.002027
1,2,1.943948,0.003142
2,1,1.1226,0.015031
3,2,1.942405,0.003317
4,2,1.953738,0.00214


In [58]:
Y_test[Y_test['class'] == 1].head()

Unnamed: 0,class,prediction,squared_error
2,1,1.1226,0.015031
8,1,1.34398,0.118322
12,1,1.001384,2e-06
16,1,0.985659,0.000206
18,1,0.985659,0.000206


In [59]:
# compute the actual RMSE over the full test set
RMSE = Y_test['squared_error'].mean()
math.sqrt(RMSE)

0.11934296013264774

In [60]:
Y_test[Y_test['prediction'] > 1.5]['class'].value_counts()

2    6295
1      84
Name: class, dtype: int32