# Import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
for folder, _, names in os.walk('../input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## look into the data
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb
#!pip install dtreeviz 

from IPython.display import Image, display_svg, SVG

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, plot_confusion_matrix, precision_score, recall_score, classification_report, plot_roc_curve, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GroupKFold

In [None]:
import gc
import random as r
import joblib

In [None]:
## pandas: remove the row / column display limits so frames are shown in full
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# Folder that holds the pickled, pre-processed train / test tables loaded in the "Load data" section
main_path = "../input/fraud-detection-categorified-and-split/"

## Load data

In [None]:
## Choose which experiments are run further down
buildLv = False ## hold-out local validation (guards the "Run LV" cell)
buildSkf = True ## k-fold run (guards the k-fold cell that calls xgb_pred_fun)

AV=True ## adversarial validation (guards the train-vs-test classifier cell)

In [None]:
%%time
## Load the pre-processed fastai tabular objects for train and test
dset_str = "all"  # dataset size tag; other values noted by the author: 50k, 10k

to = load_pickle(f"{main_path}to_{dset_str}c.pkl")
to_tst = load_pickle(f"{main_path}to_tst_{dset_str}c.pkl")

## features and target of the train / validation parts
xs = to.train.xs
y = to.train.y
valid_xs = to.valid.xs
valid_y = to.valid.y

## the test features are taken as an independent copy
test_xs = to_tst.xs.copy()
xs.shape, test_xs.shape

In [None]:
## the frames have been extracted, so drop the large container objects and collect garbage
del to, to_tst
x = gc.collect()

## Functions

In [None]:
## Helpers to get model metrics in a compact form
def m_rep(m,xs,y):
    """Return the classification report of model `m` on (`xs`, `y`) as a dict.

    The report is built from `m.predict(xs)` with labels ordered [1, 0],
    `digits=4` and `output_dict=True`.
    """
    predicted = m.predict(xs)
    return classification_report(y, predicted, labels=[1,0], digits=4, output_dict=True)

def metrics(m, xs, y , valid_xs, valid_y):
    """Print train / OOB / validation ROC-AUC as ``train ; oob ; valid`` (4 decimals).

    Parameters
    ----------
    m : fitted classifier exposing `predict_proba`
    xs, y : training features and labels
    valid_xs, valid_y : validation features and labels

    The AUCs are computed from the probability of the second class
    (`predict_proba(...)[:, 1]`). Nothing is returned.

    The classification reports that used to be computed here were never used
    or returned, so they have been removed to avoid two extra full `predict`
    passes; call `m_rep` directly when a report is needed.
    """
    tr_auc = roc_auc_score(y, m.predict_proba(xs)[:,1])
    # placeholder: the OOB computation below is disabled, so 0.0000 is printed
    oob_auc = 0.0000
    #oob_auc = roc_auc_score(y,m.oob_decision_function_[:,1])
    vd_auc = roc_auc_score(valid_y, m.predict_proba(valid_xs)[:,1])
    print('{:.4f} ; {:.4f} ; {:.4f}'
          .format(tr_auc, oob_auc, vd_auc))

In [None]:
## Generate table of important features from the model with other useful information
def xgb_fi(m, df,df_real=None):
    """Build a feature-importance table for a fitted model.

    Parameters
    ----------
    m : fitted model exposing `feature_importances_`, one value per column of `df`
    df : DataFrame whose columns are the features the model was fitted on
    df_real : optional DataFrame used to measure missing values instead of `df`
        (for example the data before NaNs were replaced). Defaults to `df`.

    Returns
    -------
    DataFrame sorted by descending importance with columns
        cols    - feature name
        imp     - importance reported by the model
        countNA - fraction of NaN values of that feature in `df_real`
                  (or `df`); NaN when the feature is absent from that frame
    """
    fi = (pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
            .sort_values('imp', ascending=False)
            .reset_index(drop=True))
    #fi["isCont"] = fi.cols.isin(cont)

    # Previously `df_real` was ignored; use it when given, else fall back to `df`.
    na_source = df if df_real is None else df_real
    # One vectorised pass over the frame instead of one pass per column.
    na_fraction = na_source.isna().mean()
    # Features that are not present in `na_source` map to NaN.
    fi["countNA"] = fi.cols.map(na_fraction).astype(float)
    return fi

def plot_fi(fi):
    """Plot importance (`imp`) per feature (`cols`) as a horizontal bar chart and return the plot object."""
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12,7), legend=False)

In [None]:
## Format Test Predictions for Submission
def format_test_preds(preds,test_xs,comment):
    """Write test predictions to "<comment>my_subs.csv" in submission layout.

    Parameters
    ----------
    preds : predicted values, one per row of `test_xs`
    test_xs : frame providing the `TransactionID` column
    comment : prefix put in front of "my_subs.csv" to form the file name

    Prints summary statistics of `preds` and the shape of the written table.
    Returns nothing.
    """
    #preds = m.predict_proba(test_xs[cols])[:,1]
    summary = pd.DataFrame(preds).describe()
    print(summary)

    ## submission table: one id column, one prediction column
    submission = pd.DataFrame({
        "TransactionID": test_xs["TransactionID"].to_list(),
        "isFraud": preds,
    })

    ## save without the index column
    out_name = comment + "my_subs.csv"
    submission.to_csv(out_name, index=False)
    print(submission.shape)

    ## drop the table and collect garbage
    del submission
    x = gc.collect()

In [None]:
def xgb_lvd_fun(get_feat_imp):
    """Fit one XGBoost classifier on the hold-out split and score it.

    Works on the notebook globals `xs`, `y`, `test_xs`, `cols` and on the
    row-position lists `idxT` (training rows) and `idxV` (validation rows).
    The model seed is drawn with `r.randint`, so repeated calls give
    different models.

    Parameters
    ----------
    get_feat_imp : when truthy, the feature-importance table is written to
        "fi_lv.csv" and printed.

    Returns
    -------
    ([train_auc, valid_auc], test_probabilities)
    """
    ## train / validation parts of the selected features
    X_tr, y_tr = xs[cols].iloc[idxT], y.iloc[idxT]
    X_va, y_va = xs[cols].iloc[idxV], y.iloc[idxV]

    ## model
    params = dict(
        n_estimators=2000,
        max_depth=8, #12
        learning_rate=0.05,
        subsample=0.6, #0.8
        colsample_bytree=0.4,
        random_state=r.randint(0,9999),
        use_label_encoder=False,
        # USE GPU
        tree_method='gpu_hist',
    )
    clf = xgb.XGBClassifier(**params)

    ## the validation part doubles as the early-stopping evaluation set
    m = clf.fit(X_tr, y_tr,
                eval_set=[(X_va, y_va)],
                eval_metric="auc", verbose=100, early_stopping_rounds=100)

    ## score prediction
    scores = [
        roc_auc_score(y_tr, m.predict_proba(X_tr)[:,1]),
        roc_auc_score(y_va, m.predict_proba(X_va)[:,1]),
    ]

    ## test results predictions
    te_pred = m.predict_proba(test_xs[cols])[:,1]

    ## Getting feature importances
    if get_feat_imp:
        fi = xgb_fi(m, xs[cols])
        fi.to_csv("fi_lv.csv",index=False)
        print(fi)

    ## remove large objects
    del m, clf
    x = gc.collect()
    return scores, te_pred
    

In [None]:
def xgb_pred_fun():
    """Train one XGBoost classifier per fold of an unshuffled 5-fold split.

    Works on the notebook globals `xs`, `y`, `test_xs` and `cols`.
    For every fold it accumulates
      * training-part probabilities into `tr_pred`, divided by (n_splits - 1),
        i.e. by the number of folds in which a row belongs to the training part;
      * hold-out probabilities into `oof_pred`;
      * test probabilities into `te_pred`, divided by n_splits.
    On the last fold the feature-importance table is written to "fi_lv.csv"
    and printed. At the end the train AUC, the out-of-fold AUC and a 0.0000
    placeholder are printed as ``train ; oof ; 0.0000``.

    Returns
    -------
    numpy array with the fold-averaged test probabilities.
    """
    ## setup kfold (plain KFold without shuffling)
    kf = KFold(n_splits = 5, shuffle = False)
    # derived from the splitter so that changing n_splits keeps the FI export working
    last_fold = kf.n_splits - 1
    tr_pred = np.zeros(len(xs))
    oof_pred = np.zeros(len(xs))
    te_pred = np.zeros(len(test_xs))

    for i, (idxT, idxV) in enumerate(kf.split(xs,y)):
        ## model
        print('Fold',i)
        print(' n_rows of train =',len(idxT),'rows of holdout =',idxV.min(),"to",idxV.max())#len(idxV))
        clf = xgb.XGBClassifier(n_estimators=1000,
                        max_depth=12, #12
                        learning_rate= 0.02,
                        subsample= 0.8,#0.8
                        colsample_bytree= 0.4,
                        random_state = r.randint(0,9999),
                        use_label_encoder=False,
                        # USE GPU
                        tree_method='gpu_hist')

        ## the hold-out fold doubles as the early-stopping evaluation set
        m = clf.fit(xs[cols].iloc[idxT], y.iloc[idxT], eval_set=[(xs[cols].iloc[idxV],y.iloc[idxV])],
                eval_metric= "auc", verbose=100, early_stopping_rounds=100)

        ## predicting the probabilities for Train, OOF and Test
        tr_pred[idxT] += m.predict_proba(xs[cols].iloc[idxT])[:,1]/(kf.n_splits-1)
        oof_pred[idxV] += m.predict_proba(xs[cols].iloc[idxV])[:,1]
        te_pred += m.predict_proba(test_xs[cols])[:,1]/kf.n_splits

        ## Getting Feature Importances (last fold only)
        if i == last_fold:
            fi = xgb_fi(m, xs[cols])
            fi.to_csv("fi_lv.csv",index=False)
            print(fi)

        ## remove large objects; `m` and `clf` are the same model, so drop both names
        del m, clf; x=gc.collect()

    print('{:.4f} ; {:.4f} ; {:.4f}'
              .format(roc_auc_score(y,tr_pred),roc_auc_score(y,oof_pred),0.000))

    return te_pred

## Column selection from EDA.

In [None]:
## start the feature list from every column of the training frame
cols = list(xs.columns)

In [None]:
## Vcol buckets (first filter based on correlation) From fraud detection v11's EDA
## `v` collects the V columns that are kept; each line below adds one group of them.
## All other V columns are dropped from `cols` further down in this cell.
v = []
v += ['V1', 'V3', 'V11', 'V9', 'V5', 'V7']
v += ['V13', 'V17', 'V24', 'V14', 'V20', 'V27', 'V34', 'V26', 'V30']
v += ['V36', 'V37', 'V47', 'V40', 'V48', 'V52', 'V41', 'V45']
v += ['V54', 'V65', 'V60', 'V67', 'V56', 'V68', 'V62', 'V55', 'V70']
v += ['V76', 'V89', 'V91', 'V81', 'V82', 'V87', 'V78', 'V88']
v += ['V127', 'V121', 'V99', 'V110', 'V104', 'V130', 'V129', 'V131', 'V109', 'V136', 'V116', 'V120', 'V125', 'V113', 'V118', 'V98', 'V107', 'V117', 'V115']
v += ['V138', 'V140', 'V142', 'V147', 'V155', 'V162']
v += ['V165', 'V160', 'V166']
v += ['V203', 'V207', 'V216', 'V187', 'V176', 'V173', 'V183', 'V215']
v += ['V169', 'V195', 'V201', 'V171', 'V174', 'V175', 'V209', 'V185', 'V188', 'V210', 'V198', 'V180']
v += ['V274', 'V264', 'V265', 'V261', 'V235', 'V223', 'V258', 'V260', 'V246', 'V252', 'V241', 'V266', 'V240', 'V277', 'V228', 'V226']
v += ['V220', 'V239', 'V271', 'V221', 'V234', 'V251']
v += ['V307', 'V291', 'V285', 'V290', 'V312', 'V297', 'V305', 'V320', 'V303', 'V286', 'V309', 'V284', 'V310']
v += ['V281', 'V301', 'V282', 'V315', 'V289', 'V296', 'V314', 'V283']
v += ['V332', 'V338', 'V337', 'V336', 'V325', 'V326', 'V328', 'V335']
# number of kept V columns (not the last expression of the cell, so it is not displayed)
len(v)

def set_approach(a,b):
    """Return the distinct items of `a` that are not in `b`, in order of first appearance.

    The earlier `list(set(a)-set(b))` returned the same items, but in an order
    that depends on string hashing and therefore changes between Python
    sessions; that made the feature order (and so the trained models) differ
    from run to run. Keeping the order of `a` makes the result deterministic.

    Parameters
    ----------
    a : iterable of hashable items to filter
    b : iterable of hashable items to remove

    Returns
    -------
    list without duplicates and without any item of `b`
    """
    excluded = set(b)
    # dict.fromkeys de-duplicates while preserving first-seen order
    return [item for item in dict.fromkeys(a) if item not in excluded]

## Remove all V columns (names that start with "V" followed by digits) from the feature list.
## pandas' own string matcher is used so the cell does not depend on `re` being
## in scope, which no cell imports explicitly.
## `tf_V` is now a boolean array (previously a list of bools).
tf_V = np.asarray(xs.columns.str.match(r"^V[0-9]+"), dtype=bool)
cols = xs.columns[~tf_V].to_list()

In [None]:
## append the hand-picked V columns to the feature list
cols.extend(v)
print(len(cols))

In [None]:
## remove time and other columns that don't add to score
# identifier and raw timestamp columns
cols_rem = ['TransactionID', 'TransactionDT']

# calendar columns, split e-mail domain parts, browser and device make
cols_rem +=["DayNum","HrOfDay","WkDayNum",'R_emaildomain1', 'R_emaildomain2', 'P_emaildomain1', 'P_emaildomain2','id_31_browser',"DeviceInfo_make"]
# set_approach returns the items of `cols` that are not listed in `cols_rem`
cols = set_approach(cols,cols_rem)

In [None]:
len(cols)

## Clean up D columns

In [None]:
## function to add missing-value indicator columns (named "_OHE" by the author)
def ohe(df,cols):
    """Add `<col>_OHE` to `df` for every column in `cols`, in place.

    The new column is 1 where the source column is NaN and 0 elsewhere,
    so it flags missing values rather than being a full one-hot encoding.
    Returns nothing.
    """
    for source in cols:
        is_missing = df[source].isna()
        df[source + "_OHE"] = is_missing.astype(int)

In [None]:
## Add the missing-value flags of cols_ohe to train and test, then register them as features
cols_ohe = ["D2","D9"]

ohe(xs, cols_ohe)
ohe(test_xs, cols_ohe)
cols.extend(col + "_OHE" for col in cols_ohe)

In [None]:
## drop the raw D2, D9 and D12 from the feature list (D2 / D9 keep the *_OHE flags added above)
cols_rem =["D2","D9","D12"]
cols = set_approach(cols,cols_rem)

In [None]:
len(cols)

## Adding Dn columns

In [None]:
## put the three calendar columns back into the feature list
## (they were removed from `cols` earlier; DayNum is dropped again before training)
cols += ["DayNum","HrOfDay","WkDayNum"]

In [None]:
def Dn(df,cols):
    """Add `<col>n` = DayNum - <col> to `df` for every column in `cols`, in place.

    `df` must contain a "DayNum" column. Returns nothing.
    """
    for source in cols:
        df[source + "n"] = df["DayNum"].sub(df[source])

In [None]:
## D1 ... D15: build the "DayNum - D" columns on train and test
d_cols = ["D%d" % k for k in range(1, 16)]
Dn(xs, d_cols)
Dn(test_xs, d_cols)
## from here on d_cols holds the names of the new "<D>n" columns
d_cols = [name + "n" for name in d_cols]
cols.extend(d_cols)

In [None]:
len(cols)

In [None]:
print(cols)

## Combine and group aggregation functions

In [None]:
def encode_CB2(df1,df2,uid):
    """Combine the `uid` columns into one key and label-encode it across both frames.

    Both frames are modified in place and receive
      * "<uid joined by _>"      - the column values as strings joined with "_"
      * "<uid joined by _>_fact" - integer codes of that string, obtained from a
        sorted factorisation of df1 and df2 together, so equal keys share a
        code in both frames.
    The codes are stored as int32 when the largest code exceeds 32000 and as
    int16 otherwise.

    Prints the name of the code column and returns it in a one-element list.
    """
    combo_name = "_".join(uid)
    fact_name = combo_name + "_fact"

    ## make combined column (row-wise join of the string values)
    for frame in (df1, df2):
        frame[combo_name] = frame[uid].astype(str).apply(lambda row: '_'.join(row), axis=1)

    ## concat and then factorize
    stacked = pd.concat([df1[combo_name], df2[combo_name]], axis=0)
    codes, _ = stacked.factorize(sort=True)

    ## unconcat: the first len(df1) codes belong to df1, the rest to df2
    n_first = len(df1)
    code_dtype = 'int32' if codes.max() > 32000 else 'int16'
    df1[fact_name] = codes[:n_first].astype(code_dtype)
    df2[fact_name] = codes[n_first:].astype(code_dtype)

    print(fact_name)
    return [fact_name]

In [None]:
## Aggregations
def encode_ag(df,df_te,uid,cols_ag,func_list,ag=True):
    """Add group-wise aggregate features to `df` and `df_te`, in place.

    Train and test rows are stacked, grouped by the `uid` columns (missing key
    values are replaced by -9999 so those rows form their own group) and every
    function in `func_list` is applied to `cols_ag` with `transform`, i.e. each
    row receives the statistic of its group computed over train and test
    together. Missing results are replaced by -9999 and stored as float32.

    Parameters
    ----------
    df, df_te : train and test frames; both must contain `uid` and `cols_ag`
    uid : list of column names forming the group key
    cols_ag : list of column names to aggregate
    func_list : list of aggregation names understood by `transform`
        (e.g. "mean", "std", "count", "nunique")
    ag : if True the new columns are named "<col>_<uid joined by _>_<func>2";
        if False every new column is named "<uid joined by _>_FE2", which is
        only unambiguous for one aggregated column and one function.

    Returns
    -------
    list of the new column names (also printed).

    The results are written back by row position, so `df` and `df_te` may carry
    any index. Previously DataFrames were assigned, which aligns on index
    labels and silently produced NaN or misplaced values for frames without a
    default 0..n-1 index.
    """
    new_cols_ret = []
    n_train = len(df)
    ## 1. concat test and train
    temp_df = pd.concat([df[uid+cols_ag],df_te[uid+cols_ag]],axis=0).reset_index(drop=True)
    temp_df[uid] = temp_df[uid].fillna(-9999)
    ## 2. group them by UID
    grouped = temp_df.groupby(uid)
    for func in func_list:
        ## 3. Create new features based on "func"
        values = (grouped[cols_ag].transform(func)
                  .fillna(-9999)
                  .astype('float32')
                  .to_numpy())
        if ag:
            new_cols = [col+"_"+"_".join(uid)+"_"+func+"2" for col in cols_ag]
        else:
            new_cols = ["_".join(uid)+"_FE2" for col in cols_ag]
        new_cols_ret +=new_cols
        ## 4. Save the features: first n_train rows are train, the rest are test
        for pos, name in enumerate(new_cols):
            df[name] = values[:n_train, pos]
            df_te[name] = values[n_train:, pos]
    print(new_cols_ret)
    return new_cols_ret

## Group aggregation 

In [None]:
## UID definition
## `uid` is the list of columns whose combination is used as the group key
## for the combined code, count and aggregate features created below.
uid = ["D1n","card1","addr1","P_emaildomain"] ## uid1
#uid = ["D1n","card1","addr1","P_emaildomain","D3n","V1","M7"]
# number of missing values per key column in the training frame
print(xs[uid].isna().sum())


In [None]:
len(cols)

## My aggregations

In [None]:
## Combine: label-encoded key combinations (same order as before)
for combo_key in (["card1","addr1"], uid):
    cols += encode_CB2(xs,test_xs,combo_key)

## Frequency Encoding (FE): number of rows (train + test) sharing each key.
## The last two keys are the "additional FE" ones.
fe_keys = [
    ["addr1"],
    ["card1"],
    ["card2"],
    ["dist1"],
    ["card1","addr1"],
    uid,
    ["card1","D1n"],
    ["P_emaildomain"],
    ["TransactionAmt"],
    ["cents"],
    ["ProductCD"],
    ["HrOfDay"],
]
for fe_key in fe_keys:
    cols += encode_ag(xs,test_xs,fe_key,["TransactionID"],["count"],ag=False)

## Aggregation: column groups that get aggregated
cols_ag_av = ["D3n","M7","D9","D10n","D13n","D5n","D15n","D4n","D6n","D11n","HrOfDay","D14n","id_13"] ## cols from AV importance
cols_ag_other = ["TransactionAmt","cents"]
cols_C = [f"C{k}" for k in range(1,15)]##v96
cols_M = [f"M{k}" for k in range(1,10) if k != 7]##v96
cols_ag_v = ["V9","V314","V3","V171","V81","V289","V88","V89","V271","V326","V7","V52","V1"]

## std / mean per group, as (group key, aggregated columns) pairs
std_mean_jobs = [
    (uid, cols_ag_av+cols_ag_other),
    (["card1","addr1","P_emaildomain"], cols_ag_av+cols_ag_other),##v95
    (uid, cols_C),##v96
    (uid, cols_ag_v),
]
for agg_key, agg_cols in std_mean_jobs:
    cols += encode_ag(xs,test_xs,agg_key,agg_cols, ["std","mean"])

## Agg nunique
cols_ag_numb = ["addr2","ProductCD","dist1","DeviceType"]#,"P_emaildomain"]

cols += encode_ag(xs,test_xs,uid,cols_ag_numb+cols_M, ["nunique"])

## Final list of columns

In [None]:
## drop DayNum itself from the final feature list (the "<D>n" columns derived from it stay)
cols = set_approach(cols,["DayNum"])

In [None]:
# dtype of every selected feature, listed in alphabetical order of the column name
pd.DataFrame(xs[cols].dtypes).reset_index().sort_values("index").reset_index(drop=True)

In [None]:
len(cols)

In [None]:
## Fillna
# Replace every remaining NaN in both frames by the sentinel -9999, in place
# (the same sentinel encode_ag uses for missing keys and results).
xs.fillna(-9999, inplace=True)
test_xs.fillna(-9999, inplace=True)

## Local validation: Hold out

In [None]:
## flag that triggers exporting the feature importances in the hold-out run
i=4
# always defined (True only for i == 4); before, any other value of `i` left
# `get_feat_imp` undefined and the next line raised NameError
get_feat_imp = (i==4)
get_feat_imp

In [None]:
## Linear (unshuffled) split: the first 80% of the rows train, the remaining 20% validate
msk = np.arange(len(xs)) < 0.8*len(xs) #usually 0.8, 0.8851 for the oversampling case
## row positions of the two parts
idxT = list(np.flatnonzero(msk))
idxV = list(np.flatnonzero(~msk))

In [None]:
%%time
## Run LV n times (each run draws a new model seed) and average the test predictions

if buildLv:
    n=5
    preds = np.zeros(len(test_xs))
    auc_lst = []
    for i in range(n):
        print("iteation",i)
        ## feature importances are exported on the final run only;
        ## tied to `n` instead of a hard-coded 4 so changing n keeps working
        get_feat_imp = (i == n-1)
        a,temp = xgb_lvd_fun(get_feat_imp)
        preds += temp/n
        auc_lst.append(a)
        del temp; x=gc.collect()
        #print(a)
    ## mean and spread of [train AUC, validation AUC] over the runs
    auc_lst = np.array(auc_lst)
    print(auc_lst.mean(axis=0))
    print(auc_lst.std(axis=0))
    format_test_preds(preds,test_xs,"baselineLV")


## Local Validation: kfold

In [None]:
%%time
## Run the 5-fold CV (xgb_pred_fun uses KFold with n_splits=5): OOF prediction
## plus fold-averaged prediction of Test, then write the submission file

if buildSkf:

    preds = xgb_pred_fun()
    format_test_preds(preds,test_xs,"baseline")
    #del preds; x=gc.collect()
    
#     ## Post_Proc
#     df_ptr = pd.concat([xs[["TransactionID"]+uid],y],axis=1)
#     print(df_ptr.sample())
#     df_pte = pd.concat([test_xs[["TransactionID"]+uid],pd.DataFrame({"isFraud":preds})],axis=1)
#     print(df_pte.sample())
#     encode_ag(df_ptr,df_pte,uid,["isFraud"],["mean"])
#     print(df_pte.sample())
#     format_test_preds(df_pte.isFraud_D1n_card1_addr1_mean2.to_list(),test_xs,"post_proc_")
#     del df_ptr, df_pte; x=gc.collect();

## Adversarial Validation

In [None]:
## AV: train a classifier to tell train rows (label 0) from test rows (label 1);
## its feature importances show which features differ most between the two sets
if AV:
    df_dom = pd.concat([xs[cols], test_xs[cols]])
    is_test = pd.DataFrame([0]*len(xs) + [1]*len(test_xs))
    ## random 80/20 split of the row positions (overwrites the hold-out idxT / idxV)
    idxT,idxV = train_test_split(np.arange(len(df_dom)),test_size=0.2)
    ## xs / test_xs are deleted here to save memory, so cells that need them
    ## cannot be re-run after this one without reloading the data
    del xs, test_xs; x=gc.collect()

    clf = xgb.XGBClassifier(n_estimators=1000, 
                            max_depth=8, #12
                            learning_rate= 0.05,
                            subsample= 0.6,#0.8 
                            colsample_bytree= 0.4, 
                            random_state = r.randint(0,9999),
                            use_label_encoder=False,                            
    #                         #USE CPU
    #                         nthread=4)
                            # USE GPU
                            tree_method='gpu_hist')

    ## df_dom already consists of exactly the `cols` columns, so it is sliced
    ## directly; the former df_dom[cols] built a second full copy of the frame
    m = clf.fit(df_dom.iloc[idxT], is_test.iloc[idxT], eval_set=[(df_dom.iloc[idxV],is_test.iloc[idxV])],
                    eval_metric= "auc", verbose=100, early_stopping_rounds=100)

    ## importance table of the train-vs-test classifier
    fi = xgb_fi(m, df_dom,df_dom)
    fi.to_csv("fi_av.csv",index=False)

In [None]:
fi