Please complete the following steps before running this notebook:
1. On the right side of the screen, under "<b>Data</b>", click on "<b>+ Add data</b>"
2. Click on "<b>Competition Data</b>"
3. Search for "<b>IEEE-CIS Fraud Detection</b>" and click on "<b>Add</b>" next to the search result
4. The dataset should now appear in the dropdown of the "<b>Data</b>" tab

In [None]:
import time # to measure time

import pandas as pd # CPU


import xgboost as xgb # model

from tqdm import tqdm # to display progress bar

# CPU
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split




# visualisation
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# metrics
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from xgboost import plot_importance, plot_tree

import os
# Walk the input directory tree and print the full path of every file found,
# so the paths used by the loading cell can be checked by eye.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# 1. Load Data

In [None]:
%%time

# CPU
train_transaction = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv", index_col="TransactionID")
train_identity = pd.read_csv("../input/ieee-fraud-detection/train_identity.csv", index_col="TransactionID")

In [None]:
# look at the first 5 rows of the transaction table (head() returns 5 rows by default)
train_transaction.head()

In [None]:
# same preview for the identity table
train_identity.head()

**Merge dataframes**

* We merge the transaction dataframe (left) with the identity dataframe (right)
* `how="left"`: use only the keys from the left frame, similar to a SQL left outer join; preserves key order
* `left_index=True` and `right_index=True`: use the index of the left and right dataframes as the join keys

In [None]:
%%time

# CPU
train_data = train_transaction.merge(train_identity, how="left", left_index=True, right_index=True)

In [None]:
# CPU
# print a concise summary of the merged frame
train_data.info()

In [None]:
# CPU
# datetime of transaction probably doesn't matter, so remove that column.
# drop() returns a new frame here instead of mutating in place, and
# errors="ignore" skips the column when it is already gone, so re-running
# this cell no longer raises a KeyError.
train_data = train_data.drop(columns=["TransactionDT"], errors="ignore")

# label
y = train_data["isFraud"].astype("int") # convert to int dtype
y.head()

In [None]:
# CPU
# features (data)
x = train_data.drop("isFraud", axis=1) # remove label from feature set
# xgboost can handle missing data as a separate case
# you can also fill in missing values
#x = x.fillna(-999.0) #  large negative number simulates null/missing values

# counters (printed by the summary cell below)
converted_to_float = 0
converted_to_categorical = 0
dropped = 0 # this loop never drops a column, so the counter stays at 0

# Turn every feature column into floats: numeric columns are cast directly,
# columns that cannot be cast are lower-cased and label-encoded instead.
for col in tqdm(x.columns): # displays progress bar
    try:
        x[col] = x[col].astype("float") # convert column to float dtype
        converted_to_float += 1
    except (ValueError, TypeError):
        # Only a failed numeric conversion means "treat as categorical".
        # A bare `except:` would also swallow KeyboardInterrupt, MemoryError
        # and genuine bugs, silently sending them down the encoding path.
        # NOTE(review): missing values in a text column are handed to
        # LabelEncoder as-is; how they are encoded depends on the
        # scikit-learn version - confirm this is the intended treatment.
        le = LabelEncoder()
        x_col = x[col].str.lower() # change string to lowercase
        new_col = le.fit_transform(x_col) # transform non-numerical values (0 to n)
        x[col] = new_col.astype("float")
        converted_to_categorical += 1

In [None]:
# preview the feature frame after the float / label-encoding conversion
x.head()

In [None]:
# Report how many columns went through each conversion path.
for label, count in (
    ("converted_to_float:", converted_to_float),
    ("converted_to_categorical:", converted_to_categorical),
    ("cols dropped:", dropped),
):
    print(label, count)

# 2. Train Model

In [None]:
%%time

# CPU
# Make train test split
X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.1, stratify=y)

In [None]:
del x, train_data, train_transaction, train_identity
!free -h
# !nvidia-smi

In [None]:
# Show the shape of the features and labels in each part of the split.
print("CPU")
for split_label, features, labels in (
    ("Train:", X_train, y_train),
    (" Test:", X_valid, y_valid),
):
    print(split_label, features.shape, labels.shape)
print("*"*10)



In [None]:
# get ratio of positive cases (1) to negative cases (0)
# y_train holds 0/1 labels, so its sum is the number of positive rows
n_positive = y_train.sum()
n_negative = len(y_train) - n_positive
ratio_of_positive = n_positive / n_negative
print(round(ratio_of_positive, 3), ": 1 ratio of positive:negative")

Construct the XGBoost Classifier. You can also try out [additional arguments](https://xgboost.readthedocs.io/en/latest/parameter.html) to perform parameter tuning.

In [None]:
# CPU
# Hyper-parameters are gathered in one dict and unpacked into the classifier.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "learning_rate": 0.1,
    "tree_method": "hist",
    # scale up the weight of positive samples:
    # 1 / (positive:negative ratio) == negative:positive ratio
    "scale_pos_weight": 1/ratio_of_positive,
    "use_label_encoder": False,
}
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# CPU
train_start_time = time.time()

model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid, y_valid)],
          verbose=False)

train_end_time = time.time()

results = model.evals_result()

print("Training takes:", int(train_end_time-train_start_time), "seconds")

# 3. Evaluation

In [None]:
%%time

# CPU
# predictions for the held-out validation rows; used by the confusion matrix cell
preds = model.predict(X_valid)

# 4. Visualization

In [None]:
# CPU
# plot learning curves
# 'validation_0' / 'validation_1' are the first and second entries of the
# eval_set passed to model.fit(): the training split and the held-out split.
# An explicit Axes is used so the figure carries a title and axis labels and
# can be read on its own.
fig, ax = plt.subplots()
ax.plot(results['validation_0']['logloss'], label='train')
ax.plot(results['validation_1']['logloss'], label='test')
ax.set_xlabel("boosting round")
ax.set_ylabel("logloss")
ax.set_title("Learning curves")
# show the legend
ax.legend()
# show the plot
plt.show()

In [None]:
# CPU
# confusion matrix
# Built directly from the validation labels and the model's predictions;
# normalize="true" normalises over the true labels.
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_valid,
    y_pred=preds,
    normalize="true",
    display_labels=["not_fraud", "fraud"],
    cmap=plt.cm.Blues,
)
disp.ax_.set_title("Confusion Matrix - Validation")
plt.show()

In [None]:
# CPU
# plot feature importance
# max_num_features=10 limits the chart to 10 features

plot_importance(model, max_num_features=10)
plt.show()