In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from pandas_profiling import ProfileReport
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    # Emit one full path per file found below the input directory.
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# IEEE-CIS Fraud Detection — LightGBM Split Points 
https://www.kaggle.com/code/jtrotman/ieee-fraud-adversarial-lgb-split-points

Imagine standing at the check-out counter at the grocery store with a long line behind you and the cashier not-so-quietly announces that your card has been declined. In this moment, you probably aren’t thinking about the data science that determined your fate.

Embarrassed, and certain you have the funds to cover everything needed for an epic nacho party for 50 of your closest friends, you try your card again. Same result. As you step aside and allow the cashier to tend to the next customer, you receive a text message from your bank. “Press 1 if you really tried to spend $500 on cheddar cheese.”

While perhaps cumbersome (and often embarrassing) in the moment, this fraud prevention system is actually saving consumers millions of dollars per year. Researchers from the IEEE Computational Intelligence Society (IEEE-CIS) want to improve this figure, while also improving the customer experience. With higher accuracy fraud detection, you can get on with your chips without the hassle.

IEEE-CIS works across a variety of AI and machine learning areas, including deep neural networks, fuzzy systems, evolutionary computation, and swarm intelligence. Today they’re partnering with the world’s leading payment service company, Vesta Corporation, seeking the best solutions for the fraud prevention industry, and now you are invited to join the challenge.

In this competition, you’ll benchmark machine learning models on a challenging large-scale dataset. The data comes from Vesta's real-world e-commerce transactions and contains a wide range of features from device type to product features. You also have the opportunity to create new features to improve your results.

If successful, you’ll improve the efficacy of fraudulent transaction alerts for millions of people around the world, helping hundreds of thousands of businesses reduce their fraud loss and increase their revenue. And of course, you will save party people just like you the hassle of false positives.


Acknowledgements:

Vesta Corporation provided the dataset for this competition. Vesta Corporation is the forerunner in guaranteed e-commerce payment solutions. Founded in 1995, Vesta pioneered the process of fully guaranteed card-not-present (CNP) payment transactions for the telecommunications industry. Since then, Vesta has firmly expanded data science and machine learning capabilities across the globe and solidified its position as the leader in guaranteed ecommerce payments. Today, Vesta guarantees more than $18B in transactions annually.

# train_transaction.csv

TransactionDT: timedelta from a given reference datetime (not an actual timestamp)

TransactionAmt: transaction payment amount in USD

ProductCD: product code, the product for each transaction

card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.

addr: address

dist: distance

P_emaildomain and R_emaildomain: purchaser and recipient email domain

C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.

D1-D15: timedelta, such as days between previous transaction, etc.

M1-M9: match, such as names on card and address, etc.

Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

## Categorical Features:
ProductCD

card1 - card6

addr1, addr2

P_emaildomain

R_emaildomain

M1 - M9



In [None]:
# Load the training transactions table and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/ieee-fraud-detection/train_transaction.csv')
df.head(n=10)

In [None]:
# Report how many rows are labelled as fraud, as a count and as a percentage.
fraud_total = df['isFraud'].sum()
row_total = df.shape[0]
print('Number of fraud transactions: ', fraud_total, ' (of ', row_total, ')')
print('Percentage of fraud: ', fraud_total / row_total * 100, '%')

In [None]:
# Descriptive statistics of the loaded transactions frame
df.describe()

In [None]:
# Switch for the pandas-profiling reports in the cells below: when falsy, the
# `if visualization:` guards skip building any ProfileReport.
visualization = False
# Holds the most recently built ProfileReport. It starts as None so that cells
# ending in a bare `prof` render nothing while reports are switched off
# (an empty string would be echoed as '' under every such cell).
prof = None

In [None]:
# TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
# No missing values.
if visualization:
    prof = ProfileReport(df.loc[:, ['TransactionDT']], minimal=True)

prof

In [None]:
# TransactionAmt: transaction payment amount in USD
# No missing values.
if visualization:
    amount_frame = df[['TransactionAmt']]
    prof = ProfileReport(amount_frame, minimal=True)

prof

In [None]:
# ProductCD: product code, the product for each transaction
# No missing values.
if visualization:
    product_frame = df[['ProductCD']]
    prof = ProfileReport(product_frame, minimal=True)

prof

In [None]:
# card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
cols_card = [c for c in df.columns if c.startswith('card')]
print(df[cols_card].head(3))

if visualization:
    prof = ProfileReport(df[cols_card], minimal=True)

# card2 is ~1.5% missing. The card columns are imputed by dtype, because a
# column-wise max() is only meaningful for numeric columns: on text columns
# that contain NaN it is either skipped or raises TypeError, depending on the
# pandas version, so those values would never be filled.
#   * numeric card columns  -> column maximum
#   * non-numeric columns   -> the literal 'missing' (same marker as the
#     emaildomain / M cells)
# NOTE(review): the original note said "max+1" but the code has always used
# max, which makes a missing value indistinguishable from the real maximum.
cols_card_num = df[cols_card].select_dtypes(include='number').columns.tolist()
cols_card_txt = [c for c in cols_card if c not in cols_card_num]

if cols_card_num:
    df[cols_card_num] = df[cols_card_num].fillna(df[cols_card_num].max())
if cols_card_txt:
    df[cols_card_txt] = df[cols_card_txt].fillna('missing')

prof

In [None]:
# addr: address
cols_addr = df.columns[df.columns.str.startswith('addr')].tolist()
print(df.loc[:, cols_addr].head(3))

if visualization:
    prof = ProfileReport(df[cols_addr], minimal=True)

# roughly 11% missing -> replace NaN with each column's maximum
addr_max = df[cols_addr].max()
df[cols_addr] = df[cols_addr].fillna(value=addr_max)

prof

In [None]:
# dist: distance
cols_dist = list(filter(lambda name: name.startswith('dist'), df.columns))
print(df[cols_dist].head(3))

if visualization:
    prof = ProfileReport(df[cols_dist], minimal=True)

# 93% missing (!!) -> replace NaN with each column's maximum
dist_max = df[cols_dist].max()
df[cols_dist] = df[cols_dist].fillna(value=dist_max)

prof

In [None]:
# P_emaildomain / R_emaildomain: purchaser and recipient email domain
cols_email = [name for name in df.columns if name.endswith('emaildomain')]
print(df.loc[:, cols_email].head(3))

if visualization:
    prof = ProfileReport(df[cols_email], minimal=True)

# 76% missing -> replace NaN with the literal 'missing'
email_filled = df[cols_email].fillna(value='missing')
df[cols_email] = email_filled

prof

In [None]:
# C1-C14: counting, such as how many addresses are found to be associated
# with the payment card, etc. The actual meaning is masked.
# No missing values, so nothing is imputed here.
cols_count = df.columns[df.columns.str.startswith('C')].tolist()
print(df.loc[:, cols_count].head(3))

if visualization:
    count_frame = df[cols_count]
    prof = ProfileReport(count_frame, minimal=True)

prof

In [None]:
# D1-D15: timedelta, such as days between previous transaction, etc.
cols_timedelta = list(filter(lambda name: name.startswith('D'), df.columns))
print(df[cols_timedelta].head(3))

if visualization:
    prof = ProfileReport(df[cols_timedelta], minimal=True)

# 58% missing -> replace NaN with the maximum of the same column
timedelta_max = df[cols_timedelta].max()
df[cols_timedelta] = df[cols_timedelta].fillna(value=timedelta_max)

prof

In [None]:
# M1-M9: match, such as names on card and address, etc.
cols_match = [name for name in df.columns if name.startswith('M')]
print(df.loc[:, cols_match].head(3))

if visualization:
    prof = ProfileReport(df[cols_match], minimal=True)

# 50% missing -> replace NaN with the literal 'missing'
match_filled = df[cols_match].fillna(value='missing')
df[cols_match] = match_filled

prof

In [None]:
# Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.
cols_vesta = df.columns[df.columns.str.startswith('V')].tolist()
print(df.loc[:, cols_vesta].head(3))

if visualization:
    prof = ProfileReport(df[cols_vesta], minimal=True)

# 43% missing -> replace NaN with the column mean.
# TODO: the meaning of these values is unknown; also try min or max as the fill value.
vesta_mean = df[cols_vesta].mean()
df[cols_vesta] = df[cols_vesta].fillna(value=vesta_mean)

prof

# Training with Isolation Forest H2O

### Prepare training / testing data

In [None]:
# Keep at most the first 100000 rows (all rows if the frame is smaller).
n = np.minimum(100000, len(df))
df = df.iloc[:n]
df

In [None]:
# Separate the fraud label (target) from the remaining columns (features).
X = df.drop(columns='isFraud')
y = df['isFraud']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train.shape[0]

### Train

In [None]:
import h2o
from h2o.estimators import H2OIsolationForestEstimator

# Connect to a local H2O instance (h2o.init starts one if none is running).
h2o.init()

# Convert the pandas train / test feature frames into H2O frames.
# (The prostate.csv download from the H2O docs example was removed: it was
# never used and made this cell depend on network access.)
X_train_h2o = h2o.H2OFrame(X_train)
X_test_h2o  = h2o.H2OFrame(X_test)

# Build an Isolation Forest model. Training is unsupervised: only the feature
# frame is passed, the isFraud label was dropped from X beforehand.
# A fixed seed makes the randomly built trees repeatable across runs.
model = H2OIsolationForestEstimator(sample_rate = 0.1,
                                    max_depth = 20,
                                    ntrees = 50,
                                    seed = 42)
model.train(training_frame=X_train_h2o)

# Score the held-out rows; the "predict" column of the result is used
# downstream as the per-row anomaly score.
score = model.predict(X_test_h2o)
result_pred = score["predict"]

# Save the model (h2o.save_model is called without an explicit path) and
# report where it was written.
model_path = h2o.save_model(model)
print('model saved here: ', model_path)

In [None]:
# Put the true labels of the test rows next to the model scores.
df_res = y_test.reset_index()
df_res['pred'] = score["predict"].as_data_frame()
# Binarise the score: strictly above 0.5 becomes 1, everything else 0.
df_res['pred_bin'] = df_res['pred'].gt(0.5).mul(1)
print(df_res.describe())
df_res

In [None]:
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score

# Accuracy and MSE are computed on the thresholded 0/1 predictions. For 0/1
# labels the MSE is simply the share of mismatches, i.e. 1 - accuracy.
# NOTE(review): fraud is expected to be a small minority class (see the fraud
# percentage printed after loading), so a model that never flags fraud would
# still score a high accuracy here; read these two numbers with care.
c_acc = accuracy_score(df_res['isFraud'], df_res['pred_bin'])
mse = mean_squared_error(df_res['isFraud'], df_res['pred_bin'])

# ROC AUC uses the continuous score instead of the 0.5 cut-off, so it does not
# depend on the threshold and is not inflated by the class imbalance.
# Assumes a higher "predict" value means "more anomalous" - TODO confirm
# against the H2O Isolation Forest documentation.
roc_auc = roc_auc_score(df_res['isFraud'], df_res['pred'])

print('Accuracy: ', c_acc)
print('Mean square error: ', mse)
print('ROC AUC: ', roc_auc)