Previous kernel - https://www.kaggle.com/priteshshrivastava/ieee-pipeline-1-create-validation-set

Input - Train & val, test CSVs

Output - Val & Test preds

Next kernel - Meta model https://www.kaggle.com/priteshshrivastava/ieee-pipeline-3-stacking-with-meta-model

This one is based on Inversion's simple xgb kernel : https://www.kaggle.com/inversion/ieee-simple-xgboost/output

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math
from sklearn.metrics import roc_auc_score
import pickle
from sklearn import preprocessing
import xgboost as xgb

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv
/kaggle/input/ieee-pipeline-1-create-validation-set/__output__.json
/kaggle/input/ieee-pipeline-1-create-validation-set/test_df.pkl
/kaggle/input/ieee-pipeline-1-create-validation-set/val_X.pkl
/kaggle/input/ieee-pipeline-1-create-validation-set/__notebook__.ipynb
/kaggle/input/ieee-pipeline-1-create-validation-set/train_y.csv
/kaggle/input/ieee-pipeline-1-create-validation-set/__results__.html
/kaggle/input/ieee-pipeline-1-create-validation-set/custom.css
/kaggle/input/ieee-pipeline-1-create-validation-set/val_y.csv
/kaggle/input/ieee-pipeline-1-create-validation-set/train_X.pkl


In [2]:
train_X = pd.read_pickle("/kaggle/input/ieee-pipeline-1-create-validation-set/train_X.pkl")
train_y = pd.read_csv("/kaggle/input/ieee-pipeline-1-create-validation-set/train_y.csv")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
train_y.head()

Unnamed: 0,isFraud
0,0
1,0
2,0
3,0
4,0


In [4]:
val_X = pd.read_pickle("/kaggle/input/ieee-pipeline-1-create-validation-set/val_X.pkl")
val_y = pd.read_csv("/kaggle/input/ieee-pipeline-1-create-validation-set/val_y.csv")

In [5]:
test_df = pd.read_pickle("/kaggle/input/ieee-pipeline-1-create-validation-set/test_df.pkl")
test_df.head()

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663549,18403224,31.950001,W,10409,111,150,visa,226,debit,170,...,,7,,,,,,,,
3663550,18403263,49.0,W,4272,111,150,visa,226,debit,299,...,,7,,,,,,,,
3663551,18403310,171.0,W,4476,574,150,visa,226,debit,472,...,,7,,,,,,,,
3663552,18403310,284.950012,W,10989,360,150,visa,166,debit,205,...,,7,,,,,,,,
3663553,18403317,67.949997,W,18018,452,150,mastercard,117,debit,264,...,,7,,,,,,,,


### Handling missing values & categorical variables

In [6]:
train_X = train_X.fillna(-999)
val_X = val_X.fillna(-999)
test_df = test_df.fillna(-999)

In [7]:
# Label Encoding
for f in train_X.columns:
    if train_X[f].dtype=='object' or val_X[f].dtype=='object' or test_df[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_X[f].values) + list(val_X[f].values) + list(test_df[f].values))
        train_X[f] = lbl.transform(list(train_X[f].values))
        val_X[f] = lbl.transform(list(val_X[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))

### Defining function to calculate the evaluation metric

In [8]:
def auc(x,y): 
    return roc_auc_score(x,y)
def print_score(m):
    #res = [auc(m.predict_proba(train_X)[:,1], train_y), auc(m.predict_proba(val_X)[:,1], val_y)]  ## continuous not supported
    res = [auc(m.predict(train_X), train_y), auc(m.predict(val_X), val_y)]
    print(res)

In [9]:
modelC = xgb.XGBClassifier(n_estimators=500,
                        n_jobs=4,
                        max_depth=9,
                        learning_rate=0.05,
                        subsample=0.9,
                        colsample_bytree=0.9,
                        missing=-999)

In [10]:
modelC.fit(train_X, train_y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=-999, n_estimators=500, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1)

In [11]:
print_score(modelC)

[0.9958716069016625, 0.9586296308469887]


### Make predictions on validation AND test set

In [12]:
predsC = pd.Series(modelC.predict_proba(val_X)[:,1])

In [13]:
test_predsC = pd.Series(modelC.predict_proba(test_df)[:,1])

### Storing val & test pred

In [14]:
predsC.to_csv("predsC.csv", index = False, header = True)
test_predsC.to_csv("test_predsC.csv", index = False, header = True)

### Creating a submission file for the single model

In [15]:
sample_submission = pd.read_csv("/kaggle/input/ieee-fraud-detection/sample_submission.csv")
sample_submission['isFraud'] = modelC.predict_proba(test_df)[:,1]
sample_submission.to_csv('simple_xgboost.csv', index=False)