Previous kernel - https://www.kaggle.com/priteshshrivastava/ieee-pipeline-1-create-validation-set

Input - Train & val, test CSVs

Output - Val & Test preds

Next kernel - Meta model https://www.kaggle.com/priteshshrivastava/ieee-pipeline-3-stacking-with-meta-model

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math
from sklearn.metrics import roc_auc_score
import pickle
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import fastai_structured as fs

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv
/kaggle/input/ieee-pipeline-1-create-validation-set/__output__.json
/kaggle/input/ieee-pipeline-1-create-validation-set/test_df.pkl
/kaggle/input/ieee-pipeline-1-create-validation-set/val_X.pkl
/kaggle/input/ieee-pipeline-1-create-validation-set/__notebook__.ipynb
/kaggle/input/ieee-pipeline-1-create-validation-set/train_y.csv
/kaggle/input/ieee-pipeline-1-create-validation-set/__results__.html
/kaggle/input/ieee-pipeline-1-create-validation-set/custom.css
/kaggle/input/ieee-pipeline-1-create-validation-set/val_y.csv
/kaggle/input/ieee-pipeline-1-create-validation-set/train_X.pkl


In [2]:
train_X = pd.read_pickle("/kaggle/input/ieee-pipeline-1-create-validation-set/train_X.pkl")
train_y = pd.read_csv("/kaggle/input/ieee-pipeline-1-create-validation-set/train_y.csv")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
train_y.head()

Unnamed: 0,isFraud
0,0
1,0
2,0
3,0
4,0


In [4]:
val_X = pd.read_pickle("/kaggle/input/ieee-pipeline-1-create-validation-set/val_X.pkl")
val_y = pd.read_csv("/kaggle/input/ieee-pipeline-1-create-validation-set/val_y.csv")

In [5]:
test_df = pd.read_pickle("/kaggle/input/ieee-pipeline-1-create-validation-set/test_df.pkl")
test_df.head()

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663549,18403224,31.950001,W,10409,111,150,visa,226,debit,170,...,,7,,,,,,,,
3663550,18403263,49.0,W,4272,111,150,visa,226,debit,299,...,,7,,,,,,,,
3663551,18403310,171.0,W,4476,574,150,visa,226,debit,472,...,,7,,,,,,,,
3663552,18403310,284.950012,W,10989,360,150,visa,166,debit,205,...,,7,,,,,,,,
3663553,18403317,67.949997,W,18018,452,150,mastercard,117,debit,264,...,,7,,,,,,,,


### Specify & fit models on training set

In [6]:
fs.train_cats(train_X)
fs.apply_cats(val_X, train_X)
fs.apply_cats(test_df, train_X)

In [7]:
nas = {}
df_trn, _, nas = fs.proc_df(train_X, na_dict=nas)   ## Avoid creating NA columns as total cols may not match later
df_test, _, _ = fs.proc_df(test_df, na_dict=nas)
df_val, _, _ = fs.proc_df(val_X, na_dict = nas)
df_trn.head()

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3288253,7432176,730.0,5,16255,470,150,3,137,2,299,...,0,255,0,0,0,0,0,0,0,0
3004937,488702,50.0,2,11162,346,150,3,224,3,299,...,88,32,100,2,2,1,1,2,2,1224
3176071,4223868,280.0,5,16560,476,150,4,166,3,420,...,0,255,0,0,0,0,0,0,0,0
3422350,11024794,25.950001,5,16727,111,150,4,226,2,177,...,0,255,0,0,0,0,0,0,0,0
3514576,13883754,30.0,2,6021,321,150,4,226,2,299,...,102,24,51,3,2,1,1,2,1,502


### Defining function to calculate the evaluation metric

In [8]:
def auc(x,y): 
    return roc_auc_score(x,y)
def print_score(m):
    res = [auc(m.predict(df_trn), train_y), auc(m.predict(df_val), val_y)]
    print(res)

In [9]:
modelB = RandomForestClassifier(n_estimators=30, min_samples_leaf=20, max_features=0.7, 
                                n_jobs=-1, oob_score=True) ## Use all CPUs available

In [10]:
modelB.fit(df_trn, train_y)

  """Entry point for launching an IPython kernel.
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=0.7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1,
                       oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [11]:
print_score(modelB)

[0.9473927767384037, 0.9312185899161053]


### Make predictions on validation AND test set

In [12]:
predsB = pd.Series(modelB.predict(df_val))

In [13]:
test_predsB = pd.Series(modelB.predict(df_test))

### Storing val & test pred

In [14]:
predsB.to_csv("predsB.csv", index = False, header = True)
test_predsB.to_csv("test_predsB.csv", index = False, header = True)

### Creating submission file for single model

In [15]:
sample_submission = pd.read_csv("/kaggle/input/ieee-fraud-detection/sample_submission.csv")
sample_submission['isFraud'] = modelB.predict_proba(df_test)[:,1]   
#sample_submission['isFraud'] = modelB.predict(df_test)
sample_submission.to_csv('simple_RF.csv', index=False)