In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders.woe import WOEEncoder
from sklearn.compose import ColumnTransformer

The goal of this notebook is to provide a minimal working example of how you can pre-process your data for this problem. It's meant to be used as a starting point and was created for the sake of reproducibility for the blog post that I'm writing.

## Load Data

In [None]:
# Directory that holds the competition CSVs. Both reads below derive their
# path from this single value, so pointing the notebook at another location
# only requires changing it here.
DATA_DIR = '/kaggle/input/ieee-fraud-detection'

# Transaction table and identity table; they are joined on `TransactionID`
# in the next cell.
train_transactions = pd.read_csv(f'{DATA_DIR}/train_transaction.csv')
train_identity = pd.read_csv(f'{DATA_DIR}/train_identity.csv')

In [None]:
# Left-join the identity table onto the transactions so that every
# transaction row is kept, including those without a matching identity row.
train = train_transactions.merge(train_identity, how='left', on='TransactionID')

# Relative frequency of each `isFraud` value.
train['isFraud'].value_counts(normalize=True)

In [None]:
# Preview the first five rows of the transaction table.
train_transactions.head(5)

## Preprocess

### Feature Engineering

In [None]:
# Hour-of-day feature built from the datetime stamp (source:https://www.kaggle.com/ajaykgp12/ieee-cis-fraud-detection-lgb-with-fe#Categorical-Columns)
def make_hour_feature(f):
    """Encode a time offset as an hour of the day, 0-23.

    The input is divided by 3600, floored, and wrapped modulo 24, i.e. it is
    treated as a count of seconds.

    Parameters
    ----------
    f : scalar, numpy array or pandas Series
        Time offset(s) to convert.

    Returns
    -------
    Same shape as ``f``; whole-number hours stored as floats.
    """
    seconds_per_hour = 3600
    return np.floor(f / seconds_per_hour) % 24

# Derive the hour-of-day column from the `TransactionDT` column.
train['hour'] = train['TransactionDT'].pipe(make_hour_feature)

**TODO:** Add more engineered features.

### Data Types

In [None]:
# Columns handled as categorical: they are cast to strings further down,
# whatever dtype pandas inferred when reading the CSVs.
cat_features = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 
               'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1',
               'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'DeviceType', 'DeviceInfo',
               'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20',
               'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
               'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']

# Columns that are neither categorical nor numeric features; every other
# column not listed in `cat_features` is treated as numeric.
exclude = ['TransactionID', 'TransactionDT', 'isFraud']
# Plain boolean `and` (not bitwise `&`) since both operands are Python bools.
num_features = [f for f in train.columns if f not in cat_features and f not in exclude]

# drop more than 90% NAs
col_na = train.isna().sum()
to_drop = col_na[(col_na / train.shape[0]) > 0.9].index

# Keep the column list and both feature lists in sync with what is dropped.
use_cols = [f for f in train.columns if f not in to_drop]
cat_features = [f for f in cat_features if f not in to_drop]
num_features = [f for f in num_features if f not in to_drop]

# Casting to str turns missing categorical values into string tokens, which
# the "Fill NAs" cell then replaces.
train[cat_features] = train[cat_features].astype(str)
# `np.float` no longer exists in current NumPy releases (it was a deprecated
# alias of the builtin float); np.float64 is the dtype it resolved to here.
train[num_features] = train[num_features].astype(np.float64)
train = train[use_cols]

### Fill NAs

In [None]:
# Numeric gaps: impute each column with its own median.
# NOTE(review): the medians are taken over all rows, before the
# train/val/test split further down -- confirm this is acceptable.
median_values = train[num_features].median()
train[num_features] = train[num_features].fillna(value=median_values)

# Categorical gaps: these columns were cast to str earlier, so a missing
# value shows up as the literal string "nan"; relabel it as "missing".
train[cat_features] = train[cat_features].replace(to_replace="nan", value="missing")

# Total count of remaining missing cells across the whole frame.
train.isna().sum().sum()

### Split Train/Val/Test

In [None]:
target = 'isFraud'

# Work on a frame without the identifier and raw timestamp columns.
data = train.drop(columns=['TransactionID', 'TransactionDT'])

# Re-derive both feature groups from the dtypes present in `data`:
# numeric columns minus the target, and all non-numeric columns.
num_features = [
    f for f in data.select_dtypes(include=np.number).columns if f != target
]
cat_features = data.select_dtypes(exclude=np.number).columns

In [None]:
# Fixed seed so that both splits -- and everything fitted on them -- come out
# the same on every run of the notebook.
RANDOM_STATE = 42

# First split: hold out 20% of all rows as the test set.
train_X, test_X, train_y, test_y = train_test_split(
    data[num_features+list(cat_features)], data['isFraud'],
    test_size=0.2, random_state=RANDOM_STATE)
# Second split: 20% of the remaining rows become the validation set,
# i.e. roughly 64% / 16% / 20% train / validation / test overall.
train_X, val_X, train_y, val_y = train_test_split(
    train_X, train_y, test_size=0.2, random_state=RANDOM_STATE)
print(len(train_X), 'train examples')
print(len(val_X), 'validation examples')
print(len(test_X), 'test examples')

### Categorical Encoding

In [None]:
# Partition the categorical columns by their number of distinct values in
# the training split: fewer than 5 -> `to_ohe`, everything else -> `to_emb`.
low_cardinality = {col for col in cat_features if train_X[col].nunique() < 5}
to_ohe = [col for col in cat_features if col in low_cardinality]
to_emb = [col for col in cat_features if col not in low_cardinality]

In [None]:
# One transformer per column group:
#   * numeric columns                          -> StandardScaler
#   * categoricals with < 5 unique values      -> one-hot encoding
#   * categoricals with >= 5 unique values     -> Weight of Evidence encoding
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore')
woe = WOEEncoder()

column_trans = ColumnTransformer(
    transformers=[
        ('scaler', scaler, num_features),
        ('ohe', ohe, to_ohe),
        ('woe', woe, to_emb),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

# Fit on the training split only (the target is passed along with it), then
# apply the already-fitted transformer to the validation and test splits.
train_X_transformed = column_trans.fit_transform(train_X, train_y)
val_X_transformed = column_trans.transform(val_X)
test_X_transformed = column_trans.transform(test_X)

print(*(m.shape for m in (train_X_transformed, val_X_transformed, test_X_transformed)))

Now you can use this data for any type of modelling.