In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


In [None]:
# LOAD DATA
# Each pair is (identity table, transaction table); the files are read in the
# order test identity, test transaction, train identity, train transaction.
test_id, test_tran = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/test-id-preprocessed/test_identity_preprocessed.csv',
        '/kaggle/input/ieee-fraud-detection/test_transaction.csv',
    )
)

train_id, train_tran = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/train-id-preprocessed/train_identity_preprocessed.csv',
        '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
    )
)

In [None]:
# Join the identity table onto the transaction table by TransactionID.
# The transaction frame is the right operand of a right join, so every
# transaction row is kept whether or not it has an identity record.
train_tran = train_id.merge(train_tran, on="TransactionID", how='right')
del train_id  # the identity frame is not used again after the merge

test_tran = test_id.merge(test_tran, on="TransactionID", how='right')
del test_id

In [None]:
# FILL NA VALUES
# Every missing entry, in every column of both frames, is replaced by 0.
train_tran = train_tran.fillna(value=0)
test_tran = test_tran.fillna(value=0)

In [None]:
# Show the merged, NA-filled training frame as this cell's output.
train_tran

In [None]:
# CREATE CATEGORICAL FEATURES
# Columns to one-hot encode: product/card/e-mail fields plus the flags M1..M9.
cat_features = (
    ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain']
    + ['M' + str(k) for k in range(1, 10)]
)

# NOTE(review): train and test are encoded independently, so the set and order
# of generated dummy columns follows the values present in each frame and may
# differ between the two -- confirm downstream code does not rely on position.
train_tran = pd.get_dummies(train_tran, columns=cat_features)
test_tran = pd.get_dummies(test_tran, columns=cat_features)

# SPLIT DATA
from sklearn.model_selection import train_test_split

# The isFraud column is the target; every remaining column is a feature.
X = train_tran.drop(columns=['isFraud'])
y = train_tran['isFraud']

# Hold out 30% of the rows for evaluation, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# The combined frame is not referenced again; X and y carry the data onward.
del train_tran

In [None]:
# X_test_copy = X_test.copy()
# X_train_copy = X_train.copy()
# test_tran_copy = test_tran.copy()

# X_test_copy['D1n'] = np.floor(X_test_copy.TransactionDT / (24*60*60)) - X_test_copy.D1
# X_test_copy['uid'] = X_test_copy.card1.astype(str)+'_'+X_test_copy.addr1.astype(str)+'_'+X_test_copy.D1n.astype(str)

# X_train_copy['D1n'] = np.floor(X_train_copy.TransactionDT / (24*60*60)) - X_train_copy.D1
# X_train_copy['uid'] = X_train_copy.card1.astype(str)+'_'+X_train_copy.addr1.astype(str)+'_'+X_train_copy.D1n.astype(str)

# test_tran_copy['D1n'] = np.floor(test_tran_copy.TransactionDT / (24*60*60)) - test_tran_copy.D1
# test_tran_copy['uid'] = test_tran_copy.card1.astype(str)+'_'+test_tran_copy.addr1.astype(str)+'_'+test_tran_copy.D1n.astype(str)

In [None]:
# Replace each frame with an independent copy under a *_copy name. The
# original name is deleted right after its copy is made, before the next copy
# starts, so the originals are released one at a time.
# NOTE(review): presumably the copies exist so later column assignments act on
# standalone frames rather than on slices of the split -- confirm.
X_test_copy = X_test.copy()
del X_test
X_train_copy = X_train.copy()
del X_train
test_tran_copy = test_tran.copy()
del test_tran

In [None]:
# Columns removed from the three frames that originate from the training data.
train_drop_cols = ['id_30', 'id_32', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceInfo']
# The competition test frame uses a shorter drop list.
test_drop_cols = ['id_31', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceInfo']

# Frames are always processed in this fixed order.
frames_and_drops = (
    (X_test_copy, train_drop_cols),
    (X_train_copy, train_drop_cols),
    (X, train_drop_cols),
    (test_tran_copy, test_drop_cols),
)

# Drop the unwanted columns from each frame in place.
for frame, drop_cols in frames_and_drops:
    frame.drop(columns=drop_cols, inplace=True)

# For D1..D15 add a companion column D1n..D15n holding
# floor(TransactionDT / 86400) - D<i>.
# NOTE(review): presumably TransactionDT is in seconds, making the first term
# a day index -- confirm against the dataset description.
for i in range(1, 16):
    date_name = "D" + str(i)
    date_name_new = date_name + "n"
    for frame, _unused in frames_and_drops:
        frame[date_name_new] = np.floor(frame.TransactionDT / (24*60*60)) - frame[date_name]

In [None]:
######### XGBOOST ###########
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# First boosted model, fitted on every column of the training split. Its
# feature importances are read by the following cells.
xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=25,
)
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train_copy, y_train)

# Score the held-out split: the regressor's raw outputs are ranked by ROC AUC.
preds = xg_reg.predict(X_test_copy)
print("XGBoost score: ", roc_auc_score(y_test, preds))

In [None]:
from matplotlib.pyplot import figure

# Bar chart of the fitted model's feature importances: one bar per feature,
# positioned by the feature's index in the importance array.
importance = xg_reg.feature_importances_
feature_positions = range(len(importance))

figure(figsize=(14,12))
pyplot.bar(feature_positions, importance)
pyplot.xlabel("Features")
pyplot.ylabel("Importance")
pyplot.show()

In [None]:
# get importance
importance = xg_reg.feature_importances_

# Remove columns that are not important: keep TransactionID plus every feature
# whose importance in the first XGBoost fit is strictly greater than min_val.
min_val = 0.0001
cols = X_train_copy.columns

# importance[k] belongs to cols[k] because the model was fitted on X_train_copy.
selected_cols = ['TransactionID']
for name, score in zip(cols, importance):
    if score > min_val and name != 'TransactionID':
        print("Added feature: ", name)
        selected_cols.append(name)

# Select by column NAME rather than by position. The test frame is one-hot
# encoded separately and has a different drop list, so position k there is not
# guaranteed to be the same feature as position k in the training frame.
# Selecting once also avoids re-copying each frame for every joined column.
X_train_1 = X_train_copy[selected_cols]
X_test_1 = X_test_copy[selected_cols]
X_ = X[selected_cols]

# Selected columns that the test frame lacks (for example a dummy column for a
# category value that only occurs in training data) are created and filled
# with 0, so the test matrix has exactly the layout the models were trained on.
missing_in_test = [name for name in selected_cols if name not in test_tran_copy.columns]
if missing_in_test:
    print("Columns absent from test data, filled with 0: ", missing_in_test)
X_TEST = test_tran_copy.reindex(columns=selected_cols, fill_value=0)

# Release the wide frames; only the reduced ones are used from here on.
del X
del X_train_copy
del X_test_copy
del test_tran_copy

In [None]:
# Show the reduced training frame (TransactionID plus the selected features).
X_train_1

In [None]:
######### XGBOOST ###########
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Same model settings as the first XGBoost fit, now trained on the reduced
# feature set.
xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=25,
)
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train_1, y_train)

# Score the held-out split by ROC AUC.
preds = xg_reg.predict(X_test_1)
print("XGBoost score: ", roc_auc_score(y_test, preds))

In [None]:
import lightgbm as lgb

In [None]:
# Split the training rows again (30% validation, fixed seed) so LightGBM has a
# validation set that is separate from the held-out X_test_1 / y_test split.
X_train_2, X_validation, y_train_2, y_validation = train_test_split(
    X_train_1,
    y_train,
    test_size=0.30,
    random_state=42,
)

# Wrap each split in a LightGBM Dataset.
train_data = lgb.Dataset(data=X_train_2, label=y_train_2)
validation_data = lgb.Dataset(data=X_validation, label=y_validation)
test_data = lgb.Dataset(data=X_test_1, label=y_test)

In [None]:
# num_round = 100

# # param = {'num_leaves': 31, 'objective': 'binary'}
# # param['metric'] = 'auc'
# # param['metric'] = ['auc', 'binary_logloss']

# param = {
#     #'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': ['auc', 'binary_logloss'],
#     'num_leaves': 100,
#     #'learning_rate': 0.05,
#     #'feature_fraction': 0.9,
#     #'bagging_fraction': 0.8,
#     #'bagging_freq': 5,
#     'verbose': 0
# }

# bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data])
# preds = bst.predict(X_test_1)
# print("LightGBM score: ", roc_auc_score(y_test, preds))

In [None]:
# # CV params
# n_leaves = [30, 50, 100, 200]
# learning_r = [.07, .1, .15]
# f_f = [.5, .7, .9, 1]
# mdil = [20,50,100]

# num_round = 75

# best_n = 0
# best_lr = 0
# best_f = 0
# best_m = 0
# best_AUC = 0

# # param = {'num_leaves': 31, 'objective': 'binary'}
# # param['metric'] = 'auc'
# # param['metric'] = ['auc', 'binary_logloss']
# for n in n_leaves:
#     for lr in learning_r:
#         for f in f_f:
#             for m in mdil:
#                 param = {
#                     'min_data_in_leaf' : m,
#                     'boosting_type': 'gbdt',
#                     'objective': 'binary',
#                     'metric': ['auc'],
#                     'num_leaves': n,
#                     'learning_rate': lr,
#                     'feature_fraction': f,
#                     'verbose': 0
#                 }

#                 bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data],verbose_eval = -1)
#                 preds = bst.predict(X_test_1)
#                 auc = roc_auc_score(y_test, preds)
#                 print("LightGBM score: ", auc)
#                 if auc > best_AUC:
#                     print("                                                                             NEW BEST AUC: ",auc,"!!")
#                     best_n = n
#                     best_lr = lr
#                     best_f = f
#                     best_m = m
#                     best_AUC = auc

In [None]:
# print("N: ",best_n)
# print("LR: ",best_lr)
# print("F: ",best_f)
# print("M: ",best_m)
# print("AUC: ",best_AUC)


In [None]:
# #default parameters
# num_round = 100

# param = {
#                     #'min_data_in_leaf' : 20,
#                     #'boosting_type': 'gbdt',
#                     'objective': 'binary',
#                     #'metric': ['auc'],
#                     #'num_leaves': 200,
#                     #'learning_rate': .15,
#                     #'feature_fraction': .9,
#                     'verbose': 0
#                 }

# bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data],verbose_eval = -1)
# preds = bst.predict(X_test_1)
# auc = roc_auc_score(y_test, preds)
# print("LightGBM score: ", auc)

In [None]:
# Number of boosting rounds.
num_round = 100

# Booster settings for a binary objective evaluated with AUC.
param = {
    'min_data_in_leaf': 20,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['auc'],
    'num_leaves': 200,
    'learning_rate': .15,
    'feature_fraction': .9,
    'verbose': 0,
}

# NOTE(review): verbose_eval is not accepted by lgb.train in LightGBM 4.x
# (logging moved to callbacks) -- confirm the installed version before upgrading.
bst = lgb.train(
    param,
    train_data,
    num_round,
    valid_sets=[validation_data],
    verbose_eval=-1,
)

# Score the held-out split by ROC AUC.
preds = bst.predict(X_test_1)
auc = roc_auc_score(y_test, preds)
print("LightGBM score: ", auc)

In [None]:
# Score every row of the full training feature frame and write the result
# next to its TransactionID.
preds = bst.predict(X_)

# preds is a plain array in the row order of X_, so the IDs are attached
# positionally (.values) rather than by index label; label alignment would
# silently misplace or NaN the IDs if X_'s index were not 0..n-1.
Y = pd.DataFrame({
    'TransactionID': X_['TransactionID'].values,
    'isFraud': preds,
})
Y.to_csv('LightGBM_Train.csv', index=False)

In [None]:
# Score every row of the competition test frame and write the result next to
# its TransactionID.
preds = bst.predict(X_TEST)

# preds is a plain array in the row order of X_TEST, so the IDs are attached
# positionally (.values) rather than by index label; label alignment would
# silently misplace or NaN the IDs if X_TEST's index were not 0..n-1.
Y_TEST = pd.DataFrame({
    'TransactionID': X_TEST['TransactionID'].values,
    'isFraud': preds,
})
Y_TEST.to_csv('LightGBM_Test.csv', index=False)