## Overview

Vesta Corporation provided the dataset for this competition. Founded in 1995, Vesta Corporation is the forerunner in guaranteed e-commerce payment solutions.

In this competition, the aim is to benchmark machine learning models on a challenging large-scale dataset.

The data comes from Vesta's real-world e-commerce transactions and contains a wide range of features from device type to product features.
The machine learning model will flag fraudulent transactions for millions of people around the world, helping hundreds of thousands of businesses reduce their fraud losses and increase their revenue.

The training dataset consists of more than 400 features and 5.9 Million samples. This is a supervised binary classification problem, and the goal is to predict whether a credit card transaction is fraudulent based on the input features described below.

## Evaluation

The model is evaluated on the ROC AUC score. The notebook produces an output CSV file with the TransactionID and the predicted probabilities on the test set, which is automatically evaluated by Kaggle.


 #### 1. Data preparation

#### 2. Selecting Numerical Features

####  3. Selecting Categorical Features

#### 4. Evaluate the Categorical Features

#### 5. Data Preprocessing 

#### 6. Logistic Regression Model

#### 7. Decision Tree Model

#### 8. Random Forest Model

#### 9. Gradient Boosted Trees

#### 10. Model selection 

#### 11. Load Test Dataset

#### 12. Generate Test Prediction



## Import Packages 

In [None]:
import os
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime

from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency

## Load the Train Data 

In [None]:
train_id = pd.read_csv("../input/ieee-fraud-detection/train_identity.csv")
train_trans = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv")

## Merge Data 

In [None]:
# Left-join the identity table onto the transactions on TransactionID, so every
# transaction row is kept; transactions without an identity record get NaN in
# the identity columns.
train = train_trans.merge(train_id, how='left', on='TransactionID')

print(train.shape)

# Total in-memory size in bytes; deep=True also counts the contents of
# object (string) columns.
print(f'{train.memory_usage(index=True, deep=True).sum():,}')

In [None]:
del train_id
del train_trans
gc.collect()

In [None]:
# Continue with a random 10% of the rows; the fixed seed makes the sample
# reproducible. Everything below (feature evaluation, fitting) sees only this sample.
train = train.sample(frac=0.1, random_state=1)

## Selecting Numerical Features

In [None]:
num_features = ['TransactionDT', 'TransactionAmt', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 
                'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 
                'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V2', 'V3', 'V4', 'V5', 'V6', 
                'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 
                'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 
                'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 
                'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 
                'V60', 'V61', 'V62', 'V63', 'V64', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 
                'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 
                'V87', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 
                'V101', 'V102', 'V103', 'V104', 'V105', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 
                'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 
                'V126', 'V127', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V138', 'V139', 'V140', 'V141', 
                'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 
                'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 
                'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V176', 'V177', 'V178', 
                'V179', 'V180', 'V181', 'V182', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 
                'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 
                'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V211', 'V212', 'V213', 'V214', 'V215', 'V217', 
                'V218', 'V219', 'V220', 'V221', 'V222', 'V225', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 
                'V233', 'V234', 'V235', 'V236', 'V237', 'V238', 'V239', 'V242', 'V243', 'V244', 'V245', 'V246', 
                'V247', 'V248', 'V249', 'V250', 'V251', 'V252', 'V253', 'V254', 'V255', 'V256', 'V257', 'V258', 
                'V259', 'V260', 'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V269', 'V270', 
                'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 'V278', 'V279', 'V281', 'V282', 'V283', 
                'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V292', 'V293', 'V294', 'V296', 
                'V300', 'V301', 'V302', 'V303', 'V304', 'V307', 'V308', 'V309', 'V310', 'V312', 'V313', 'V314', 
                'V315', 'V316', 'V317', 'V320', 'V322', 'V323', 'V324', 'V325', 'V326', 'V328', 'V329', 'V330', 
                'V331', 'V332', 'V333', 'V338', 'V339', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 
                'id_07', 'id_08', 'id_09', 'id_10', 'id_11']

## Selecting Categorical Features

In [None]:
cat_features = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 
                'addr2', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 
                'M7', 'M8', 'M9', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 
                'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 
                'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 
                'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

len(cat_features)

### Evaluate the Categorical Features

In [None]:
y_train = train.isFraud.values

In [None]:
train_cat = train.loc[:,cat_features]
train_cat.head()

In [None]:
def eval_cat_feature(feature, data=None, target=None):
    """Score every level of a categorical feature against the fraud label.

    Rows where the feature is missing are dropped and the remaining values are
    cast to strings. Each distinct level is then tested one-vs-rest against
    the target with a chi-square test of independence (with continuity
    correction where it applies).

    Parameters
    ----------
    feature : str
        Name of the column to evaluate.
    data : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``train_cat``.
    target : array-like, optional
        Labels aligned row-for-row with ``data``. Defaults to the
        notebook-level ``y_train``.

    Returns
    -------
    pandas.DataFrame
        One row per level (indexed by the level as a string) with one count
        column per observed target value, ``n`` (rows with that level),
        ``propFraud`` (share of rows labelled 1), ``chi2`` and ``p``; rows are
        sorted by ``chi2`` in descending order.
    """
    if data is None:
        data = train_cat
    if target is None:
        target = y_train
    target = np.asarray(target)

    # Remove Missing
    present = data[feature].notna().to_numpy()
    levels = data.loc[present, feature].astype('str')
    y_temp = target[present]

    # Level x label counts; the crosstab index is the sorted list of levels.
    ct2 = pd.crosstab(levels, y_temp)
    level_counts = ct2.sum(axis=1)

    # One-vs-rest indicator per level, built with a plain comparison. This
    # yields the same 0/1 columns a dense one-hot encoding would, without
    # depending on OneHotEncoder's `sparse` keyword (removed in scikit-learn 1.4).
    stats = []
    pval = []
    for level in ct2.index:
        is_level = (levels == level).to_numpy()
        ct1 = pd.crosstab(is_level, y_temp)
        stat, p, dof, expected = chi2_contingency(ct1, correction=True)
        stats.append(stat)
        pval.append(p)

    ct2['n'] = level_counts
    # .get() keeps this working when the subset contains no positive label,
    # in which case the crosstab has no column named 1.
    ct2['propFraud'] = ct2.get(1, 0) / ct2['n']
    ct2['chi2'] = stats
    ct2['p'] = pval
    ct2 = ct2.sort_values('chi2', ascending=False)

    return ct2

In [None]:

P_emaildomain= eval_cat_feature('P_emaildomain')
P_emaildomain.head(5)

In [None]:
q = P_emaildomain.query('p < 0.05').query('n > 1000')
q

In [None]:
print(len(q))
print(q.index.values.tolist())

In [None]:
del P_emaildomain
gc.collect()

In [None]:
# ProductCD

keep = ['C', 'W', 'S', 'H', 'R']
train['ProductCD'] = train.ProductCD.astype(str).apply(lambda x : x if x in keep else 'NA' )

#card1

keep = ['9633', '9026', '2256', '8755', '15063', '5812', '7919', '16062', '13832', '9002', 
        '2616', '4461', '11201', '3154', '2803', '6530', '12839', '15497', '10112', '16560', 
        '11207', '12577', '8320', '18132', '2884', '9803', '14858', '7676', '1974', '16132', 
        '4436', '12544', '12686', '18249', '10486', '4806', '10989', '6550', '7585', '17399', 
        '18018', '15651', '9112', '15775', '17131', '6170', '4663', '7815', '9992', '2392', 
        '13481', '10023', '17188', '16075', '11157', '17055', '15885', '13249', '10447', '7861', 
        '12501', '8528', '2772', '16255', '6019', '15484', '16659', '1444', '15986', '8406', 
        '12469', '2455', '17400', '9480', '12695', '14290', '1939', '7664', '9175', '7826', '1342', 
        '7508', '7207', '11106', '1893', '9300', '15066']
train['card1'] = train.card1.astype(str).apply(lambda x : x if x in keep else 'NA')

#card2

keep = ['130.0', '500.0', '142.0', '177.0', '296.0', '545.0', '408.0', '327.0', '103.0', '194.0', '375.0', 
        '360.0', '111.0', '204.0', '555.0', '490.0', '268.0', '476.0', '548.0', '543.0', '100.0', '206.0', 
        '215.0', '404.0', '361.0', '453.0', '512.0', '567.0', '321.0', '514.0', '455.0', '369.0', '553.0', 
        '558.0', '302.0', '390.0', '517.0', '122.0', '452.0', '199.0', '174.0', '583.0', '161.0', '513.0', 
        '418.0', '494.0', '181.0', '532.0', '470.0', '250.0', '298.0', '225.0', '310.0', '276.0', '393.0', 
        '383.0', '399.0', '264.0', '170.0', '127.0', '481.0', '158.0', '343.0']
train['card2'] = train.card2.astype(str).apply(lambda x : x if x in keep else 'NA')

#card3
# The kept codes are float-formatted strings (as for card2 and card5), so the
# column is stringified before the membership test. Comparing the raw values
# against strings never matches and would turn every row into 'NA'.

keep = ['185.0', '150.0', '144.0', '106.0', '146.0']
train['card3'] = train.card3.astype(str).apply(lambda x : x if x in keep else 'NA')

#card4

keep = ['discover', 'american express']
train['card4'] = train.card4.apply(lambda x : x if x in keep else 'NA')


#card5

keep = ['137.0', '138.0', '166.0', '102.0', '226.0', '117.0', '126.0', '219.0', '202.0', '162.0', 
        '224.0', '229.0', '197.0', '195.0', '150.0']
train['card5'] = train.card5.astype(str).apply(lambda x : x if x in keep else 'NA')


#card6

keep = ['credit', 'debit']
train['card6'] = train.card6.apply(lambda x : x if x in keep else 'NA')

#addr1

keep = ['122.0', '308.0', '184.0', '264.0', '324.0', '330.0', '143.0', '494.0', '315.0', '205.0', '191.0', 
        '226.0', '203.0', '512.0', '269.0', '299.0', '444.0', '476.0', '436.0', '272.0', '337.0', '194.0', 
        '387.0', '498.0', '420.0', '327.0', '310.0', '204.0', '170.0', '485.0', '177.0', '433.0', '181.0', '231.0']
train['addr1'] = train.addr1.astype(str).apply(lambda x : x if x in keep else 'NA')


#addr2
# The kept codes are float-formatted strings (as for addr1), so the column is
# stringified before the membership test. Comparing the raw values against
# strings never matches and would turn every row into 'NA'.

keep = ['87.0', '60.0']
train['addr2'] = train.addr2.astype(str).apply(lambda x : x if x in keep else 'NA')

# P_emaildomain

keep = ['gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com', 'anonymous.com', 
        'aol.com', 'att.net', 'sbcglobal.net', 'verizon.net', 'yahoo.com.mx', 'msn.com',
        'ymail.com', 'me.com', 'optonline.net', 'cox.net', 'live.com', 'comcast.net']
train['P_emaildomain'] = train.P_emaildomain.apply(lambda x : x if x in keep else 'NA')

In [None]:
# R_emaildomain

keep = ['gmail.com', 'anonymous.com', 'outlook.com', 'yahoo.com', 'comcast.net', 'aol.com', 'yahoo.com.mx', 'icloud.com', 'hotmail.com']
train['R_emaildomain'] = train.R_emaildomain.apply(lambda x : x if x in keep else 'NA')

# M1
keep = ['0']
train['M1'] = train.M1.apply(lambda x : x if x in keep else 'NA')

# M4
# The only flag in this group with its own set of levels.
keep = ['M2', 'M0', 'M1']
train['M4'] = train['M4'].apply(lambda level: level if level in keep else 'NA')

# M2, M3, M5, M6, M7, M8, M9
# These flags all share the same two levels; anything else (including missing
# values) is bucketed as 'NA'.
keep = ['F', 'T']
for m_col in ['M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9']:
    train[m_col] = train[m_col].apply(lambda level: level if level in keep else 'NA')

In [None]:
# id_12
keep = ['Found', 'NotFound']
train['id_12'] = train.id_12.apply(lambda x : x if x in keep else 'NA')

# id_13
keep = ['33.0', '19.0', '27.0', '14.0', '49.0', '52.0', '64.0', '63.0', '25.0']
train['id_13'] = train.id_13.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_14

keep = ['-420.0', '-480.0', '-300.0']
train['id_14'] = train.id_14.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_15

keep = ['New', 'Found', 'Unknown']
train['id_15'] = train.id_15.apply(lambda x : x if x in keep else 'NA')

# id_16

keep = ['Found', 'NotFound']
train['id_16'] = train.id_16.apply(lambda x : x if x in keep else 'NA')

# id_17

keep = ['225.0', '166.0']
train['id_17'] = train.id_17.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_18
keep = ['15.0', '13.0', '12.0']
train['id_18'] = train.id_18.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_19

keep = ['384.0', '266.0', '321.0', '410.0', '567.0', '529.0', '176.0', '312.0', '100.0', 
        '153.0', '215.0', '254.0', '633.0', '427.0', '548.0', '290.0', '492.0', '341.0', 
        '621.0', '193.0', '271.0', '352.0', '417.0', '216.0', '542.0', '390.0']
train['id_19'] = train.id_19.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_20

keep = ['325.0', '507.0', '561.0', '222.0', '595.0', '214.0', '612.0', '533.0', '368.0', '391.0', '500.0',
        '611.0', '600.0', '177.0', '489.0', '225.0', '563.0', '280.0', '484.0', '266.0', '146.0', '333.0', 
        '535.0', '277.0', '161.0', '566.0', '401.0', '549.0', '315.0', '127.0', '256.0']
train['id_20'] = train.id_20.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_21

keep = ['0']
train['id_21'] = train.id_21.apply(lambda x : x if x in keep else 'NA')

In [None]:
# id_22
# The kept code is a float-formatted string (as for id_13, id_17, id_24), so
# the column is stringified before the membership test. Comparing the raw
# values against strings never matches and would turn every row into 'NA'.

keep = ['14.0']
train['id_22'] = train.id_22.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_23

keep = ['IP_PROXY:ANONYMOUS', 'IP_PROXY:TRANSPARENT']
train['id_23'] = train.id_23.apply(lambda x : x if x in keep else 'NA')

# id_24

keep = ['15.0', '11.0']
train['id_24'] = train.id_24.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_25

keep = ['0']
train['id_25'] = train.id_25.apply(lambda x : x if x in keep else 'NA')

# id_26

keep = ['0']
train['id_26'] = train.id_26.apply(lambda x : x if x in keep else 'NA')

# id_27

keep = ['0']
train['id_27'] = train.id_27.apply(lambda x : x if x in keep else 'NA')

# id_28

keep = ['Found', 'New']
train['id_28'] = train.id_28.apply(lambda x : x if x in keep else 'NA')

# id_29

keep = ['Found', 'NotFound']
train['id_29'] = train.id_29.apply(lambda x : x if x in keep else 'NA')

# id_30

keep = ['iOS 11.2.5', 'Windows 10', 'Android', 'Mac OS X 10_11_6', 'Mac OS X 10_12_6', 'Windows 7', 
        'Windows 8.1', 'Mac OS X 10_13_2', 'iOS 11.3.0', 'Linux', 'Mac OS X 10_13_1', 'Mac OS X 10_10_5', 
        'Android 7.0', 'iOS 11.1.2', 'iOS 11.2.6', 'iOS 10.3.3']
train['id_30'] = train.id_30.apply(lambda x : x if x in keep else 'NA')

# id_31

keep = ['chrome generic', 'ie 11.0 for desktop', 'safari generic', 'chrome 64.0 for android', 'chrome 65.0 for android',
        'edge 16.0', 'chrome 62.0', 'chrome 63.0 for android', 'chrome 66.0 for android', 'mobile safari generic', 
        'mobile safari 11.0', 'chrome 63.0', 'mobile safari 10.0', 'edge 15.0', 'chrome 65.0', 'firefox 57.0', 
        'chrome generic for android', 'chrome 64.0', 'chrome 66.0']
train['id_31'] = train.id_31.apply(lambda x : x if x in keep else 'NA')

In [None]:
# id_32
# The kept codes are float-formatted strings (as for id_13, id_17, id_24), so
# the column is stringified before the membership test. Comparing the raw
# values against strings never matches and would turn every row into 'NA'.

keep = ['24.0', '32.0']
train['id_32'] = train.id_32.astype(str).apply(lambda x : x if x in keep else 'NA')

# id_33

keep = ['2208x1242', '1280x720', '2436x1125', '1920x1080', '2560x1440', '2048x1536', '1680x1050', '2880x1800', 
        '2560x1600', '1440x900', '1366x768', '1334x750', '1600x900', '1920x1200', '1280x1024', '2001x1125']
train['id_33'] = train.id_33.apply(lambda x : x if x in keep else 'NA')

# id_34

keep = ['match_status:1', 'match_status:2']
train['id_34'] = train.id_34.apply(lambda x : x if x in keep else 'NA')

# id_35, id_36, id_37, id_38
# All four flags share the same two levels; anything else (including missing
# values) is bucketed as 'NA'.
keep = ['F', 'T']
for id_col in ['id_35', 'id_36', 'id_37', 'id_38']:
    train[id_col] = train[id_col].apply(lambda level: level if level in keep else 'NA')

# DeviceType

keep = ['desktop', 'mobile']
train['DeviceType'] = train.DeviceType.apply(lambda x : x if x in keep else 'NA')

# DeviceInfo

keep = ['MacOS', 'Trident/7.0', 'Windows', 'iOS Device', 'rv:11.0', 'rv:57.0', 'SM-J700M Build/MMB29K']
train['DeviceInfo'] = train.DeviceInfo.apply(lambda x : x if x in keep else 'NA')

## Data Preprocessing 

In [None]:
features = num_features + cat_features

print(len(cat_features))
print(len(num_features))

In [None]:
# Numeric columns: fill missing values with the column mean, then rescale each
# column with MinMaxScaler (default range [0, 1]).
num_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
    ])

# Categorical columns: replace missing values with the literal level 'NA', then
# one-hot encode. Levels not seen during fit are ignored at transform time,
# i.e. encoded as all zeros instead of raising.
cat_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

# Route each column list through its own pipeline and concatenate the results.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

In [None]:
%%time 

preprocessor.fit(train[features])

X_train = preprocessor.transform(train[features])

y_train = train.isFraud.values

print(X_train.shape)
print(f'{X_train.size * X_train.itemsize:,}')

print('y_train shape:', y_train.shape)

In [None]:
del train
gc.collect()

## Logistic Regression Model

model = LogisticRegression(solver='saga')

parameters = {'C' : [5, 25 , 50, 200]}

lr_grid = GridSearchCV(model, parameters, cv=5, refit='True', n_jobs=-1, verbose=10, scoring='roc_auc')
lr_grid.fit(np.asfortranarray(X_train), y_train)

* {'C': 5} 0.86233991 
* {'C': 25} 0.86105991
* {'C': 50} 0.86072292
* {'C': 200} 0.86012576

In [None]:
lr_mod = LogisticRegression(solver='liblinear',C=10)
lr_mod.fit(X_train, y_train)
print(lr_mod.score(X_train, y_train))

## Decision Tree Model

%%time

dtree = DecisionTreeClassifier(random_state=1)
parameters = {
     'max_depth': [16,24,6,10,28], 
}

dt_grid = GridSearchCV(dtree, parameters, cv=5, refit='True', n_jobs=-1, verbose=10, scoring='roc_auc')
dt_grid.fit(np.asfortranarray(X_train), y_train)

* {'max_depth': 16}    0.74893428
* {'max_depth': 24}    0.69960432
* {'max_depth': 6}     0.77110391
* {'max_depth': 10}    0.78623385
* {'max_depth': 28}    0.67791525


In [None]:
 %%time
Dtree = DecisionTreeClassifier(min_samples_leaf=4)
Dtree.fit(X_train, y_train)
print(Dtree.score(X_train, y_train))

## Random Forest Model

 %%time
    
forest = RandomForestClassifier(random_state=1, n_estimators=100)
parameters = {
    'max_depth': [14,18,22,24,26],
}

forest_grid = GridSearchCV(forest, parameters, cv=5, refit='True', n_jobs=-1, verbose=10, scoring='roc_auc')
forest_grid.fit(np.asfortranarray(X_train), y_train)


* {'max_depth': 14}    0.88508168
* {'max_depth': 18}    0.89350588
* {'max_depth': 22}    0.90002487
* {'max_depth': 24}    0.89710636
* {'max_depth': 26}    0.89957935


In [None]:
 %%time
forest = RandomForestClassifier(random_state=1, n_estimators=100)
forest.fit(X_train, y_train)
print(forest.score(X_train, y_train))

## GRADIENT BOOSTED TREES

%%time 

model = XGBClassifier(n_estimators=200, max_depth=3)
parameters = {
    'learning_rate' : [0.4,0.25,0.3,0.1,0.5]
}

grid = GridSearchCV(model, parameters, cv=5, refit='True', n_jobs=-1, verbose=10, scoring='roc_auc')

grid.fit(X_train, y_train)

* {'learning_rate': 0.4}   0.90593755
* {'learning_rate': 0.25}  0.90493450
* {'learning_rate': 0.3}   0.90564158
* {'learning_rate': 0.1}   0.89987809
* {'learning_rate': 0.5}   0.90127143

In [None]:
 %%time
# Final gradient-boosted model: 200 trees, every other hyperparameter left at
# the library default. This `model` is the one used further down to produce
# the test-set probabilities.
model = XGBClassifier(n_estimators=200)   
model.fit(X_train, y_train)
# score() reports mean accuracy on the data passed in (here the training rows
# themselves), not ROC AUC.
print(model.score(X_train, y_train))

## Load test Data 

In [None]:
test_id = pd.read_csv("../input/ieee-fraud-detection/test_identity.csv")
test_trans = pd.read_csv("../input/ieee-fraud-detection/test_transaction.csv")

In [None]:
test = test_trans.merge(test_id, how='left', on='TransactionID' )

In [None]:
# Rename the hyphenated identity columns "id-01" ... "id-38" to the underscore
# spelling "id_01" ... "id_38" that the feature lists use.
id_renames = {f"id-{idx:02d}": f"id_{idx:02d}" for idx in range(1, 39)}
test = test.rename(columns=id_renames)
test.head()

In [None]:
del test_id
del test_trans

In [None]:
# Bring the test categoricals into the representation the encoder was fitted
# on: string-valued levels, with every level the fitted one-hot encoder does
# not know folded into 'NA'. Transforming the raw columns instead would hand
# the encoder numeric codes and unreduced levels it never saw, which
# handle_unknown='ignore' silently encodes as all zeros.
fitted_onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
for cat_col, known_levels in zip(cat_features, fitted_onehot.categories_):
    as_text = test[cat_col].astype(str)
    test[cat_col] = as_text.where(as_text.isin(known_levels), 'NA')

X_test = preprocessor.transform(test[features])

## Generate Test Predictions

In [None]:
CSI_submission = pd.read_csv("../input/ieee-fraud-detection/sample_submission.csv")
CSI_submission.head()

In [None]:
test_pred = model.predict_proba(X_test)
print(test_pred[:20])

In [None]:
submission = pd.DataFrame({
    'TransactionID' : test.TransactionID,
    'isFraud' : test_pred[:,1]
})
submission.head()

In [None]:
submission.to_csv('my_submission.csv', index=False)