# Model for HackMX NDS Cognitive Labs Challenge
## Fraud Detection
#### Dataset obtained from IEEE-CIS Fraud Detection in Kaggle: 

In [2]:
import os
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import pickle
import time

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold

import lightgbm as lgb

In [3]:
train_identity = "data/train_identity.csv"
train_transaction = "data/train_transaction.csv"
test_identity = "data/test_identity.csv"
test_transaction = "data/test_transaction.csv"

In [4]:
%%time # visualize time to load data
train_id = pd.read_csv(train_identity)
train_tr = pd.read_csv(train_transaction)
test_id = pd.read_csv(test_identity)
test_tr = pd.read_csv(test_transaction)

CPU times: user 29.4 s, sys: 4.89 s, total: 34.3 s
Wall time: 34.5 s


#### Downcasts types to reduce memory and optimize usage.

In [10]:
def downcast_dtypes(df):
    _start = df.memory_usage(deep=True).sum() / 1024 ** 2
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    _end = df.memory_usage(deep=True).sum() / 1024 ** 2
    saved = (_start - _end) / _start * 100
    print(f"Saved {saved:.2f}%")
    return df

In [11]:
train_id = downcast_dtypes(train_id)
train_tr = downcast_dtypes(train_tr)
test_id = downcast_dtypes(test_id)
test_tr = downcast_dtypes(test_tr)

Saved 8.32%
Saved 40.53%
Saved 8.38%
Saved 40.20%


### Joining train tables

In [12]:
train = pd.merge(
    train_tr, train_id, how="left", on="TransactionID", left_index=True, right_index=True,
)

### Joining test tables as well

In [13]:
test = pd.merge(
    test_tr, test_id, how="left", on="TransactionID", left_index=True, right_index=True
)

### Taking a glimpse of how these tables look

In [14]:
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

Train shape: (590540, 434), Test shape: (506691, 433)


### We think these are the features of the model to build

In [15]:
imp_features = [
    "TransactionAmt",
    "ProductCD",
    "card1",
    "card2",
    "card3",
    "card5",
    "card6",
    "addr1",
    "addr2",
    "dist1",
    "dist2",
    "P_emaildomain",
    "R_emaildomain",
    "C1",
    "C2",
    "C4",
    "C5",
    "C6",
    "C7",
    "C8",
    "C9",
    "C10",
    "C11",
    "C12",
    "C13",
    "C14",
    "D1",
    "D2",
    "D3",
    "D4",
    "D5",
    "D10",
    "D11",
    "D15",
    "M1",
    "M2",
    "M3",
    "M4",
    "M6",
    "M7",
    "M8",
    "M9",
    "V1",
    "V3",
    "V4",
    "V6",
    "V8",
    "V11",
    "V13",
    "V14",
    "V17",
    "V20",
    "V23",
    "V26",
    "V27",
    "V30",
    "V36",
    "V37",
    "V40",
    "V41",
    "V44",
    "V47",
    "V48",
    "V54",
    "V56",
    "V59",
    "V62",
    "V65",
    "V67",
    "V68",
    "V70",
    "V76",
    "V78",
    "V80",
    "V82",
    "V86",
    "V88",
    "V89",
    "V91",
    "V107",
    "V108",
    "V111",
    "V115",
    "V117",
    "V120",
    "V121",
    "V123",
    "V124",
    "V127",
    "V129",
    "V130",
    "V136",
    "V138",
    "V139",
    "V142",
    "V147",
    "V156",
    "V160",
    "V162",
    "V165",
    "V166",
    "V169",
    "V171",
    "V173",
    "V175",
    "V176",
    "V178",
    "V180",
    "V182",
    "V185",
    "V187",
    "V188",
    "V198",
    "V203",
    "V205",
    "V207",
    "V209",
    "V210",
    "V215",
    "V218",
    "V220",
    "V221",
    "V223",
    "V224",
    "V226",
    "V228",
    "V229",
    "V234",
    "V235",
    "V238",
    "V240",
    "V250",
    "V252",
    "V253",
    "V257",
    "V258",
    "V260",
    "V261",
    "V264",
    "V266",
    "V267",
    "V271",
    "V274",
    "V277",
    "V281",
    "V283",
    "V284",
    "V285",
    "V286",
    "V289",
    "V291",
    "V294",
    "V296",
    "V297",
    "V301",
    "V303",
    "V305",
    "V307",
    "V309",
    "V310",
    "V314",
    "V320",
    "DeviceType",
    "DeviceInfo",
    "isFraud", ]

### Lets drop everything we are not using

In [17]:
cols_to_drop_train = [col for col in train.columns if col not in imp_features]
cols_to_drop_test = [col for col in test.columns if col not in imp_features]

train = train.drop(cols_to_drop_train, axis=1)
test = test.drop(cols_to_drop_test, axis=1)

### We need to replace all infinite values with 0

In [18]:
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [27]:
for col in train.columns:
    if train[col].dtype == "object":
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))

In [28]:
print(train.shape)
print(test.shape)

(590540, 165)
(506691, 164)


In [29]:
X_train = train.drop("isFraud", axis=1).copy()
X_test = test.copy()
y_train = train["isFraud"].copy()

In [21]:
print(X_train.shape, X_test.shape, y_train.shape)

(590540, 164) (506691, 164) (590540,)


In [30]:
from sklearn.model_selection import train_test_split

X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train, y_train, test_size=0.3, random_state=7
)

### We are going to try out a RandomForest model

In [31]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    max_depth=45, max_features=30, n_estimators=500, n_jobs=-1, min_samples_leaf=200
)

In [26]:
%%time 
rf.fit(X_train_split, y_train_split)

ValueError: could not convert string to float: 'W'

In [None]:
print("Roc Auc Score:", roc_auc_score(y_test_split, rf.predict(X_test_split)))