# Demo: fraud detection with transaction-graph embeddings and XGBoost

## Download data from Kaggle

### Install Kaggle python API

In [1]:
! pip install kaggle



### Authenticate with Kaggle and download the dataset

In [2]:
# Download the dataset archive with the Kaggle command-line client.
# NOTE(review): presumably requires Kaggle API credentials to be configured
# beforehand — confirm the setup steps for a fresh machine.
! kaggle datasets download "ranjeetshrivastav/fraud-detection-dataset"

fraud-detection-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Extract the downloaded archive next to the notebook.
# NOTE(review): `-o` presumably overwrites already-extracted files without
# prompting, which keeps the cell re-runnable — confirm.
! unzip -o fraud-detection-dataset.zip

Archive:  fraud-detection-dataset.zip
  inflating: transactions.gz         
  inflating: transactions/transactions.txt  


In [6]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from nodevectors import Node2Vec
import xgboost as xgb
from fucc.inductive_step import inductive_pooling
from fucc.metrics import plot_ap, get_optimal_f1_cutoff, get_confusion_matrix
from sklearn.metrics import average_precision_score
import logging
# Configure the root logger so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

Dataset source: https://www.kaggle.com/ranjeetshrivastav/fraud-detection-dataset

In [7]:
# Parameters
# Embedding size: passed to Node2Vec as `n_components` and used to build the
# list of embedding feature columns for the classifier.
dimensions = 32
# Passed to Node2Vec as `walklen`.
walk_len = 80
# Passed to Node2Vec as `epochs`.
walk_num = 10
# Passed to the word2vec parameters as `window`.
window_size = 5
# Worker count for word2vec training and for inductive pooling.
workers = 8

## Load Data

In [8]:
# Read the line-delimited JSON transactions file. The path is relative to the
# working directory (where the archive was extracted as
# transactions/transactions.txt) instead of an absolute path on one user's
# machine, so the notebook can run elsewhere.
df = pd.read_json(os.path.join('transactions', 'transactions.txt'), lines=True, convert_dates=[4])

In [9]:
# Parse the column at position 4 into datetimes. Assigning by column label
# replaces the whole column, so the datetime dtype is kept; writing through
# `.iloc[:, 4] = ...` may set values in place and leave an object-dtype
# column on newer pandas versions.
date_col = df.columns[4]
df[date_col] = pd.to_datetime(df[date_col])

In [10]:
# Order the transactions chronologically, then number the rows 0..n-1 in that
# order so every transaction gets an identifier (TX_ID).
df = df.sort_values(by='transactionDateTime')
df['TX_ID'] = range(len(df))

In [11]:
# Map the dataset's column names onto the names used by the rest of the
# notebook (merchant, cardholder and fraud label).
column_aliases = {
    "merchantName": "TERM_MIDUID",
    "customerId": "CARD_PAN_ID",
    "isFraud": "TX_FRAUD",
}
df = df.rename(columns=column_aliases)

In [12]:
# Positional hold-out on the time-sorted frame: the first 400,000 rows are
# used for training, the following 100,000 rows for testing.
train_rows = slice(None, 400000)
test_rows = slice(400000, 500000)
df_train = df.iloc[train_rows]
df_test = df.iloc[test_rows]

## Create network

In [13]:
# Build an undirected graph from the training transactions with three kinds
# of nodes; every transaction node is linked to its cardholder and merchant.
# NOTE(review): all three id columns share one node namespace — if a value
# occurs in more than one column those nodes would coincide; verify the id
# ranges do not overlap.
G = nx.Graph()

node_sources = (
    ('TERM_MIDUID', 'merchant'),
    ('CARD_PAN_ID', 'cardholder'),
    ('TX_ID', 'transaction'),
)
for node_column, node_type in node_sources:
    G.add_nodes_from(df_train[node_column].unique(), type=node_type)

edge_sources = (
    ('CARD_PAN_ID', 'TX_ID'),
    ('TX_ID', 'TERM_MIDUID'),
)
for src_column, dst_column in edge_sources:
    G.add_edges_from(zip(df_train[src_column], df_train[dst_column]))

In [14]:
# Print a short summary of the graph. `nx.info` was deprecated and later
# removed from networkx, so the same summary lines are built from Graph
# methods that exist in all versions.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
summary_lines = [
    f"Name: {G.name}",
    f"Type: {type(G).__name__}",
    f"Number of nodes: {n_nodes}",
    f"Number of edges: {n_edges}",
]
if n_nodes > 0:
    # Each edge contributes to the degree of two nodes.
    summary_lines.append(f"Average degree: {2 * n_edges / n_nodes:8.4f}")
print("\n".join(summary_lines))

Name: 
Type: Graph
Number of nodes: 407337
Number of edges: 800000
Average degree:   3.9280


## DeepWalk embeddings (Node2Vec)

In [15]:
# Train the random-walk embedding model on the transaction graph and keep a
# handle on the fitted model for the vector look-ups that follow.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: __init__() got an unexpected keyword argument 'size'".
# Presumably the installed gensim no longer accepts the `size` argument that
# nodevectors passes to it — confirm the installed package versions.
node2vec_settings = {
    'n_components': dimensions,
    'walklen': walk_len,
    'epochs': walk_num,
    'w2vparams': {'workers': workers, 'window': window_size},
}
g2v = Node2Vec(**node2vec_settings)

g2v.fit(G)
model = g2v.model

Making walks... Done, T=80.57
Mapping Walk Names... Done, T=614.17


TypeError: __init__() got an unexpected keyword argument 'size'

In [None]:
# Look up the learned vector of every training transaction (the model is
# queried with the string form of the id) and assemble them into a frame
# with one row per TX_ID and one column per embedding dimension.
# `from_dict` is a classmethod, so it is called on the class directly, and
# the intermediate dict gets its own name so `embeddings` is only ever a frame.
tx_vectors = {tx_id: model.wv[str(tx_id)] for tx_id in df_train.TX_ID}
embeddings = pd.DataFrame.from_dict(tx_vectors, orient='index')

In [None]:
# Attach the embedding columns to the training rows, matching TX_ID against
# the index of `embeddings`. Embedding columns left over from a previous run
# of this cell are dropped first, so re-running the cell does not create
# suffixed duplicate columns; on a first run the drop is a no-op.
df_train = (
    df_train
    .drop(columns=embeddings.columns, errors='ignore')
    .merge(embeddings, left_on='TX_ID', right_index=True)
)

In [None]:
# Preview the first five training rows, now including the embedding columns.
df_train.head(n=5)

## Inductive Pooling

In [None]:
# Derive embeddings for the unseen test transactions from the training
# embeddings and the training graph, using the project's pooling helper.
pooling_inputs = dict(df=df_test, embeddings=embeddings, G=G, workers=workers)
results = inductive_pooling(**pooling_inputs)

In [None]:
# Turn each pooled result into a transposed frame and stack them vertically.
# NOTE(review): presumably each item of `results` is one embedding vector,
# giving one row per test transaction — confirm against inductive_pooling.
pooled_frames = [pd.DataFrame(pooled).T for pooled in results]
df_new_embeddings = pd.concat(pooled_frames)

In [None]:
# Label the pooled embeddings with the test transaction ids.
# NOTE(review): this relies on `results` being in the same row order as
# df_test — confirm against inductive_pooling.
df_new_embeddings.index = df_test.TX_ID
# Attach the embedding columns to the test rows. Embedding columns left over
# from a previous run of this cell are dropped first, so re-running the cell
# does not create suffixed duplicate columns; on a first run the drop is a
# no-op.
df_test = (
    df_test
    .drop(columns=df_new_embeddings.columns, errors='ignore')
    .merge(df_new_embeddings, left_on='TX_ID', right_index=True)
)

## XGBoost Classifier

In [None]:
# Column labels of the embedding dimensions: the integers 0..dimensions-1.
embedding_features = list(range(dimensions))

In [None]:
# Split the training frame by position: the first 80% of rows are used for
# fitting, the remaining 20% for validation. The test frame is used whole.
split_at = int(df_train.shape[0]*0.8)
fit_part = df_train.iloc[:split_at]
val_part = df_train.iloc[split_at:]

X_train, y_train = fit_part[embedding_features], fit_part.TX_FRAUD
X_val, y_val = val_part[embedding_features], val_part.TX_FRAUD
X_test, y_test = df_test[embedding_features], df_test.TX_FRAUD

# Wrap each feature/label pair in XGBoost's native data container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Settings for the gradient-boosted classifier. `n_estimators` is read back
# by the training cell to set the number of boosting rounds and the
# early-stopping patience.
xgb_params = dict(
    eval_metric=['auc', 'aucpr', 'logloss'],
    objective='binary:logistic',
    n_estimators=300,
    n_jobs=8,
    learning_rate=0.1,
    seed=42,
    colsample_bytree=0.6,
    colsample_bylevel=0.9,
    subsample=0.9,
)

In [None]:
# Train the booster with early stopping.
# `xgb.train` takes the number of rounds as its own argument, so
# `n_estimators` is used for the round count and the patience and is kept
# out of the booster parameters.
num_rounds = xgb_params['n_estimators']
booster_params = {key: value for key, value in xgb_params.items() if key != 'n_estimators'}
model = xgb.train(
    booster_params,
    dtrain,
    num_boost_round=num_rounds,
    # Early stopping monitors the LAST entry of `evals`, so the validation set
    # has to come last; with the training set last, stopping would be driven
    # by the training metric.
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=int(num_rounds / 2),
)

In [None]:
# Score the test transactions with the trained booster.
y_pred_proba = model.predict(data=dtest)

## Evaluation

In [None]:
# Average precision of the test-set scores, reported to two decimals.
ap = average_precision_score(y_true=y_test, y_score=y_pred_proba)
ap_rounded = np.round(ap, decimals=2)
print("Average Precision: ", ap_rounded)

In [None]:
# Plot the test-set scores against the true labels with the project helper.
# NOTE(review): presumably a precision-recall / average-precision figure —
# confirm against fucc.metrics.plot_ap.
fig = plot_ap(y_test, y_pred_proba)

In [None]:
# Ask the project helper for a score cutoff and the matching F1, then report
# the F1 to four decimals.
# NOTE(review): the cutoff is chosen using the test labels themselves, so the
# reported F1 is presumably optimistic — consider tuning it on validation data.
cutoff_and_f1 = get_optimal_f1_cutoff(y_test, y_pred_proba)
optimal_threshold, optimal_f1_score = cutoff_and_f1
f1_rounded = np.round(optimal_f1_score, decimals=4)
print("F1 Score: ", f1_rounded)

In [None]:
# Confusion matrix of the test-set predictions at `optimal_threshold`,
# computed by the project helper (the layout of `cm` is not visible here).
cm = get_confusion_matrix(y_test, y_pred_proba, optimal_threshold)
print("Confusion Matrix: \n", cm)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Area under the ROC curve for the test-set scores (shown as the cell output).
roc_auc_score(y_true=y_test, y_score=y_pred_proba)