- To reduce CPU load, we are using kernel output from this kernel : https://www.kaggle.com/mjbahmani/reducing-memory-size-for-ieee
- We are using some fastai v0.7 functions for preprocessing etc, hence I've added the structured.py file as a utility script : https://www.kaggle.com/priteshshrivastava/fastai-structured
- This kernel is focussed on model interpretation using Permutation Feature Importance, Partial Dependence Plots and SHAP values.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np 
import pandas as pd
from IPython.display import display
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import os
from pandas_summary import DataFrameSummary
from matplotlib import pyplot as plt
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
import re

import shap
import eli5
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots

import IPython
from IPython.display import display
print(os.listdir("../input/"))

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


['ieee-fraud-detection', 'reducing-memory-size-for-ieee']


In [3]:
train_df = pd.read_csv("../input/reducing-memory-size-for-ieee/train.csv")
test_df = pd.read_csv("../input/reducing-memory-size-for-ieee/test.csv")
train_df.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,99,150,discover,142,credit,...,,255,,,,,,,,
1,0,86401,29.0,W,2755,404,150,mastercard,102,credit,...,,255,,,,,,,,
2,0,86469,59.0,W,4663,490,150,visa,166,debit,...,,255,,,,,,,,
3,0,86499,50.0,W,18132,567,150,mastercard,117,debit,...,,255,,,,,,,,
4,0,86506,50.0,H,4497,514,150,mastercard,102,credit,...,samsung browser 6.2,32,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [4]:
test_df.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.95,W,10409,111,150,visa,226,debit,170,...,,7,,,,,,,,
1,18403263,49.0,W,4272,111,150,visa,226,debit,299,...,,7,,,,,,,,
2,18403310,171.0,W,4476,574,150,visa,226,debit,472,...,,7,,,,,,,,
3,18403310,284.95,W,10989,360,150,visa,166,debit,205,...,,7,,,,,,,,
4,18403317,67.95,W,18018,452,150,mastercard,117,debit,264,...,,7,,,,,,,,


We'll just use a Random Forest Classifier. For that, we need to convert all columns to numeric type. But there are some categorical variables too.

In [5]:
import fastai_structured   ## Adding structured.py from fastai v0.7 as a utility script to the kernel
fastai_structured.train_cats(train_df)
fastai_structured.apply_cats(test_df, train_df)

We'll replace categories with their numeric codes, handle missing continuous values, and split the dependent variable into a separate variable. Fastai to the rescue again !!

In [6]:
nas = {}
df_trn, y_trn, nas = fastai_structured.proc_df(train_df, 'isFraud', na_dict=nas)   ## Avoid creating NA columns as total cols may not match later
df_test, _, _ = fastai_structured.proc_df(test_df, na_dict=nas)
df_trn.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,86400,68.5,5,13926,99,150,2,142,2,315,...,0,255,0,0,0,0,0,0,0,0
1,86401,29.0,5,2755,404,150,3,102,2,325,...,0,255,0,0,0,0,0,0,0,0
2,86469,59.0,5,4663,490,150,4,166,3,330,...,0,255,0,0,0,0,0,0,0,0
3,86499,50.0,5,18132,567,150,3,117,3,476,...,0,255,0,0,0,0,0,0,0,0
4,86506,50.0,2,4497,514,150,3,102,2,420,...,124,32,165,4,2,1,2,2,2,955


In [7]:
df_test.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.95,5,10409,111,150,4,226,3,170,...,0,7,0,0,0,0,0,0,0,0
1,18403263,49.0,5,4272,111,150,4,226,3,299,...,0,7,0,0,0,0,0,0,0,0
2,18403310,171.0,5,4476,574,150,4,226,3,472,...,0,7,0,0,0,0,0,0,0,0
3,18403310,284.95,5,10989,360,150,4,166,3,205,...,0,7,0,0,0,0,0,0,0,0
4,18403317,67.95,5,18018,452,150,3,117,3,264,...,0,7,0,0,0,0,0,0,0,0


In [8]:
del train_df, test_df

## To handle imbalanced datasets, we'll use [resampling](https://www.kaggle.com/shahules/tackling-class-imbalance)

In [9]:
from imblearn.under_sampling import RandomUnderSampler

ran=RandomUnderSampler(return_indices=True) ##intialize to return indices of dropped rows
df_trn_sm,y_trn_sm,dropped = ran.fit_sample(df_trn,y_trn)

#print("The number of removed indices are ",len(dropped))
#plot_2d_space(X_rs,y_rs,X,y,'Random under sampling')

### Split the data into training and validation sets

In [10]:
train_X, val_X, train_y, val_y = train_test_split(df_trn_sm, y_trn_sm, test_size=0.33, random_state=42)

### Defining function to calculate the evaluation metric

In [11]:
from sklearn.metrics import roc_auc_score

def print_score(m):
    res = [roc_auc_score(m.predict(train_X), train_y), roc_auc_score(m.predict(val_X), val_y)]
    print(res)

We can now pass this processed data frame to Random Forest Classifier.

In [12]:
##To reduce CPU load, and for faster iteration
fastai_structured.set_rf_samples(200000)
del df_trn, y_trn, df_trn_sm, y_trn_sm

Initially, let's just fit a single decision tree to visualize it properly

In [13]:
%time
m = RandomForestClassifier(n_estimators=1, min_samples_leaf=5, max_depth = 3) ## Use all CPUs available
m.fit(train_X, train_y)

print_score(m)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 12.6 µs
[0.7352222765736182, 0.7282575645287098]


In [14]:
def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """ Draws a representation of a random forest in IPython.
    Parameters:
    -----------
    t: The tree you wish to draw
    df: The data used to train the tree. This is used to get the names of the features.
    """
    s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                      special_characters=True, rotate=True, precision=precision)
    IPython.display.display(graphviz.Source(re.sub('Tree {',
       f'Tree {{ size={size}; ratio={ratio}', s)))

In [15]:
#draw_tree(m.estimators_[0], train_X, precision=3)

A single decision tree did not perform so badly. You can read more about the gini impurity metric [here](https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity).

Now, let's bag a collection of trees to create a random forest.

In [16]:
%time
m = RandomForestClassifier(n_estimators=30, min_samples_leaf=20, max_features=0.7, 
                                n_jobs=-1, oob_score=True) ## Use all CPUs available
m.fit(train_X, train_y)

print_score(m)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.2 µs
[0.9000695984284416, 0.8350216935360076]


## Submitting Predictions

In [17]:
## pred = m.predict(df_test)          ## Gets an AUC of ~0.8
pred = m.predict_proba(df_test)[:,1]  ## Gets an AUC of ~0.9
submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5


In [18]:
submission['isFraud'] = pred   
submission.to_csv('rf_submission_vf.csv', index=False)