In [1]:
import os
# Directory (relative path) that load_project_data uses as its default
# location for the fraud CSV files.
FRAUD_PATH = os.path.join("datasets","fraud")

In [2]:
import pandas as pd

def load_project_data(filename, fraud_path=FRAUD_PATH):
    """Read one CSV file from the data directory into a DataFrame.

    filename   -- name of the CSV file inside ``fraud_path``.
    fraud_path -- directory to read from; defaults to ``FRAUD_PATH``.

    Returns whatever ``pd.read_csv`` produces for the joined path.
    """
    return pd.read_csv(os.path.join(fraud_path, filename))


In [3]:
# Load the two raw training tables from FRAUD_PATH; they are joined on
# 'TransactionID' in the next cell.
train_identity = load_project_data("train_identity.csv")
train_transactions = load_project_data("train_transaction.csv")

In [4]:
import gc

# Left join: every transaction row is kept, and identity columns are filled in
# only where a matching TransactionID exists in the identity table.
train = train_transactions.merge(train_identity, how='left', on='TransactionID')

# The source frames are no longer needed once merged; drop the references and
# actually run the collector (a bare `gc.collect` only names the function and
# frees nothing).
del train_transactions, train_identity
gc.collect()

print('Train shape ',train.shape)








Train shape  (590540, 434)


In [5]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    The frame is modified in place and also returned.

    df      -- DataFrame to shrink. Only columns whose dtype is one of
               int16/int32/int64/float16/float32/float64 are touched.
    verbose -- when true, print the resulting size and the percentage saved.

    Integer columns become the first of int8/int16/int32/int64 whose range
    contains the column's min and max. Float columns become float16 or float32
    when their range fits, otherwise float64. Range checks are inclusive, so a
    column that reaches exactly a dtype's limit (e.g. 127 for int8) still gets
    that dtype.

    NOTE(review): float16/float32 keep the *range* but not the *precision* of
    the original values; confirm that rounding is acceptable for every float
    column before relying on the downcast data.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Narrowest integer type first; a column with no comparable
            # min/max (e.g. empty) matches nothing and is left untouched.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower float type fits
            # (this also covers all-NaN columns, whose min/max are NaN).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero for a frame that reports no memory at all.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

In [6]:
# numpy is imported here, after reduce_mem_usage was defined; that works because
# the function only looks up `np` when it is called.
import numpy as np
# reduce_mem_usage downcasts the columns in place and returns the same frame.
train= reduce_mem_usage(train)




Mem. usage decreased to 650.48 Mb (66.8% reduction)


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the merged rows; the fixed random_state keeps the split
# repeatable between runs.
X_train, X_test = train_test_split(train, test_size=0.2, random_state=42)

# Separate the target from the features.
y_train = X_train['isFraud'].copy()
y_test = X_test['isFraud'].copy()
# Rebind instead of drop(..., inplace=True): the split results are derived from
# `train`, and mutating them in place is what raised pandas'
# "A value is trying to be set on a copy of a slice" warning.
X_train = X_train.drop(['isFraud'], axis=1)
X_test = X_test.drop(['isFraud'], axis=1)

# The merged frame is no longer needed once both splits exist.
del train
gc.collect()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


0

In [8]:
def encode_FE(df1, df2, cols):
    """Frequency-encode ``cols`` in both frames, in place.

    For every column the relative frequency of each non-null value is computed
    over ``df1`` and ``df2`` stacked together; each frame's column is then
    replaced by those frequencies, stored as float32. Values without an entry
    in the frequency table (including nulls) become NaN. Nothing is returned.
    """
    for name in cols:
        stacked = pd.concat([df1[name], df2[name]])
        share_by_value = stacked.value_counts(dropna=True, normalize=True).to_dict()

        for frame in (df1, df2):
            frame[name] = frame[name].map(share_by_value)
            frame[name] = frame[name].astype('float32')
    

In [9]:
def addUids(df):
    """Add three progressively more specific string keys to ``df`` and return it.

    uid  = card1 _ addr1
    uid2 = uid _ card3 _ card5
    uid3 = uid2 _ card2 _ addr2

    Every component is converted with ``astype(str)`` before being joined with
    underscores. The frame is modified in place.
    """
    def _text(column):
        # String form of one column of the frame being extended.
        return df[column].astype(str)

    df['uid'] = _text('card1') + '_' + _text('addr1')
    df['uid2'] = _text('uid') + '_' + _text('card3') + '_' + _text('card5')
    df['uid3'] = _text('uid2') + '_' + _text('card2') + '_' + _text('addr2')
    return df

# addUids adds the uid/uid2/uid3 columns in place and returns the same frame,
# so the reassignment keeps the names bound to the updated frames.
X_train = addUids(X_train)
X_test = addUids(X_test)

In [10]:
# For each uid key, add the mean and standard deviation of TransactionAmt over
# all rows (train and test pooled) that share the key.
agg_cols = ['uid','uid2','uid3']
for col in agg_cols:
    # The pooled frame and its grouping depend only on `col`, so build them once
    # per key instead of once per statistic.
    pooled = pd.concat([X_train[[col, 'TransactionAmt']], X_test[[col,'TransactionAmt']]])
    amounts_by_key = pooled.groupby(col)['TransactionAmt']

    for agg_type in ['mean','std']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        # key value -> aggregated amount
        agg_lookup = amounts_by_key.agg(agg_type).to_dict()

        X_train[new_col_name] = X_train[col].map(agg_lookup)
        X_test[new_col_name]  = X_test[col].map(agg_lookup)
        # Replace +inf (only +inf is targeted here) with the column median of
        # the respective split.
        X_train[new_col_name]= X_train[new_col_name].replace(np.inf,X_train[new_col_name].median())
        X_test[new_col_name]= X_test[new_col_name].replace(np.inf,X_test[new_col_name].median())

In [11]:
def emailMatch(df):
    """Add an ``email_match`` flag column to ``df`` and return the frame.

    The flag is 1 on rows where ``P_emaildomain`` equals ``R_emaildomain`` and
    0 everywhere else. The frame is modified in place.
    """
    same_domain = df['P_emaildomain'] == df['R_emaildomain']
    df['email_match'] = np.where(same_domain, 1, 0)
    return df

In [12]:
# emailMatch adds the 'email_match' column in place and returns the same frame.
X_train = emailMatch(X_train)
X_test = emailMatch(X_test)


In [13]:
import datetime
def setTime(df):
    """Derive calendar features from ``TransactionDT`` and return the frame.

    ``TransactionDT`` is interpreted as a number of seconds after 2017-11-30
    00:00. The frame is modified in place; the added columns are:

    DT          -- the resulting timestamp
    DT_M        -- month counter: (year - 2017) * 12 + month
    DT_W        -- week counter:  (year - 2017) * 52 + ISO week number
    DT_D        -- day counter:   (year - 2017) * 365 + day of year
    DT_hour     -- hour of day
    DT_day_week -- day of week (Monday = 0)
    DT_day      -- day of month
    """
    START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

    # Vectorised offset arithmetic instead of a Python-level apply per row.
    df['DT'] = pd.Timestamp(START_DATE) + pd.to_timedelta(df['TransactionDT'], unit='s')

    when = df['DT'].dt
    if hasattr(when, 'isocalendar'):
        # `.dt.weekofyear` no longer exists in current pandas; isocalendar().week
        # carries the same ISO week number. Cast back to a plain numpy dtype
        # (float when timestamps are missing, so the gaps can stay NaN).
        iso_week = when.isocalendar().week
        week = iso_week.astype('float64') if iso_week.isna().any() else iso_week.astype('int64')
    else:
        # Older pandas without isocalendar(): keep using the original accessor.
        week = when.weekofyear

    df['DT_M'] = (when.year-2017)*12 + when.month
    df['DT_W'] = (when.year-2017)*52 + week
    df['DT_D'] = (when.year-2017)*365 + when.dayofyear

    df['DT_hour'] = when.hour
    df['DT_day_week'] = when.dayofweek
    df['DT_day'] = when.day

    return df
    
# setTime adds DT and the DT_* calendar columns in place and returns the frame.
X_train=setTime(X_train)
X_test=setTime(X_test)

In [14]:
periods = ['DT_M','DT_W','DT_D']
i_cols = ['uid3']

# Pass 1: for every period column, record how many rows (train and test pooled)
# fall into each period value.
for period in periods:
    period_counts = pd.concat([X_train[period], X_test[period]]).value_counts().to_dict()
    total_name = period + '_total'

    X_train[total_name] = X_train[period].map(period_counts)
    X_test[total_name]  = X_test[period].map(period_counts)

# Pass 2: count how often each (id, period) pair occurs in the pooled data and
# express it as a share of all rows in that period.
for period in periods:
    for col in i_cols:
        new_column = col + '_' + period

        train_key = X_train[col].astype(str) + '_' + X_train[period].astype(str)
        test_key  = X_test[col].astype(str) + '_' + X_test[period].astype(str)
        pair_counts = pd.concat([train_key, test_key]).value_counts().to_dict()

        X_train[new_column] = train_key.map(pair_counts) / X_train[period + '_total']
        X_test[new_column]  = test_key.map(pair_counts) / X_test[period + '_total']

In [15]:
# Columns handed to encode_FE below, which overwrites each of them in both
# frames with the value's relative frequency over train and test combined.
# The list includes the uid/uid2/uid3 keys, so from this point on those columns
# hold frequencies rather than the original key strings.
cat = ['ProductCD','card4','card6','addr1','addr2','P_emaildomain','R_emaildomain',
       'M1','M2','M3','M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo','id_12','id_13',
      'id_14','id_15','id_16','id_17','id_18','id_19','id_20','id_21','id_22','id_23','id_24',
       'id_25','id_26','id_27','id_28','id_29','id_30','id_31','id_32','id_33','id_34','id_35',
       'id_36','id_37','id_38','uid','uid2','uid3']

# Encodes in place; encode_FE has no return value.
encode_FE(X_train,X_test,cat)


In [16]:
def many_null_var(data):
    """Return the columns of ``data`` in which more than 90% of rows are null.

    The result is a list of column labels in the frame's column order.
    """
    null_share = data.isnull().sum() / data.shape[0]
    return null_share.index[null_share > 0.9].tolist()

def many_repeated_val(data):
    """Return the columns of ``data`` dominated by one value.

    A column qualifies when its most frequent value (null counts as a value)
    accounts for more than 90% of the rows. Labels are returned in the frame's
    column order.
    """
    dominated = []
    for col in data.columns:
        shares = data[col].value_counts(dropna=False, normalize=True)
        # value_counts sorts by frequency, so the first entry is the top share.
        if shares.values[0] > 0.9:
            dominated.append(col)
    return dominated

def get_useless_columns(data):
    """Collect the columns of ``data`` that are mostly null or mostly one value.

    Prints how many columns each of the two checks flagged, then returns the
    de-duplicated union of both lists. The union goes through a set, so the
    order of the returned list is not defined.
    """
    too_many_null = many_null_var(data)
    print(f"More than 90% null: {len(too_many_null)}")

    too_many_repeated = many_repeated_val(data)
    print(f"More than 90% repeated value: {len(too_many_repeated)}")

    return list(set(too_many_null + too_many_repeated))

# The drop list is derived from the training split only.
cols_to_drop = get_useless_columns(X_train)

More than 90% null: 12
More than 90% repeated value: 66


In [17]:
# Remove the same columns from both splits so their feature sets stay aligned.
X_train = X_train.drop(cols_to_drop, axis=1)
X_test = X_test.drop(cols_to_drop, axis=1)
# Sanity check: both splits should report the same number of columns, and
# y_train the same number of rows as X_train.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(472432, 390)
(118108, 390)
(472432,)


In [18]:
# Replace every remaining null with the sentinel value -999.
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

In [19]:
# Largest per-column null count in each split; 0 means no nulls are left.
print(X_train.isnull().sum().max())
print(X_test.isnull().sum().max())

0
0


In [20]:
# Drop the raw second offset and the timestamp built from it by setTime; the
# derived DT_* columns are kept.
X_train = X_train.drop(['TransactionDT', 'DT'], axis=1)
X_test = X_test.drop(['TransactionDT', 'DT'], axis=1)

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score,recall_score,confusion_matrix

# X_train = pd.read_csv('train_split_eng.csv')
# y_train = pd.read_csv('y_train_split_eng.csv')
# X_test = pd.read_csv('test_split_eng.csv')
# y_test = pd.read_csv('y_test_split_eng.csv')

In [28]:
import optuna

N_TRIALS = 100
# xgb.train runs 10 boosting rounds when num_boost_round is not given; the value
# is spelled out here so the search budget is visible.
# NOTE(review): with so few rounds the search will tend to favour a large
# `eta`; consider raising this (or adding early stopping) before trusting the
# tuned learning rate.
NUM_BOOST_ROUND = 10

# The validation split is fixed (random_state=42), so build it and the DMatrix
# objects once rather than once per trial.
X_dtrain, X_valid, y_dtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_dtrain, label=y_dtrain)
dvalid = xgb.DMatrix(X_valid, label=y_valid)


def objective(trial):
    """Train one XGBoost model with trial-suggested parameters; return its validation ROC AUC."""
    param = {
        # `verbosity` is the current name of the deprecated `silent` switch.
        "verbosity": 0,
        "tree_method":"hist",
        "seed" : 42,
        "objective": "binary:logistic",
        "max_depth" : trial.suggest_int("max_depth", 1, 9),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "eta" : trial.suggest_loguniform("eta", 1e-8, 1.0),
        "gamma" : trial.suggest_loguniform("gamma", 1e-8, 1.0)
    }

    bst = xgb.train(param, dtrain, num_boost_round=NUM_BOOST_ROUND)
    predictions = bst.predict(dvalid)

    return roc_auc_score(y_valid, predictions)


# Seed the sampler so the sequence of suggested parameters is repeatable.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=N_TRIALS)

# The tuning split is only needed while the study runs.
del dtrain, dvalid, X_dtrain, X_valid, y_dtrain, y_valid

print(study.best_trial)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2020-05-15 15:37:41,011] Finished trial#0 with value: 0.7423307764773912 with parameters: {'max_depth': 2, 'lambda': 0.19071036037446856, 'alpha': 3.877662872085165e-05, 'eta': 0.08051292653938749, 'gamma': 1.9879570421946918e-05}. Best is trial#0 with value: 0.7423307764773912.
[I 2020-05-15 15:37:52,027] Finished trial#1 with value: 0.7772536028649646 with parameters: {'max_depth': 6, 'lambda': 0.0001390476057318567, 'alpha': 1.0373302449279087e-06, 'eta': 1.5032463253426956e-07, 'gamma': 0.00045305983528810806}. Best is trial#1 with value: 0.7772536028649646.
[I 2020-05-15 15:38:05,141] Finished trial#2 with value: 0.845135305575374 with parameters: {'max_depth': 9, 'lambda': 1.3050305335589883e-06, 'alpha': 1.8962012367591105e-05, 'eta': 0.00042139763431331845, 'gamma': 6.569939245975586e-05}. Best is trial#2 with value: 0.845135305575374.
[I 2020-05-15 15:38:20,252] Finished trial#3 with value: 0.844527356761593 with parameters: {'max_depth': 9, 'lambda': 0.04162975521943276, '

[I 2020-05-15 15:48:37,693] Finished trial#56 with value: 0.9090355770444849 with parameters: {'max_depth': 8, 'lambda': 0.013670590155651925, 'alpha': 0.05845756233213682, 'eta': 0.45990866176920864, 'gamma': 2.7592790773227965e-05}. Best is trial#51 with value: 0.9172386201714754.
[I 2020-05-15 15:48:49,684] Finished trial#57 with value: 0.8457735974753258 with parameters: {'max_depth': 9, 'lambda': 0.036379926710835175, 'alpha': 0.04914043491662289, 'eta': 0.00010190821405679674, 'gamma': 0.00021162719014338602}. Best is trial#51 with value: 0.9172386201714754.
[I 2020-05-15 15:49:01,558] Finished trial#58 with value: 0.9128835258838901 with parameters: {'max_depth': 8, 'lambda': 0.011114992780530861, 'alpha': 0.01829654158642477, 'eta': 0.4303407216449489, 'gamma': 2.7200528014256025e-06}. Best is trial#51 with value: 0.9172386201714754.
[I 2020-05-15 15:49:13,360] Finished trial#59 with value: 0.8829091121732372 with parameters: {'max_depth': 8, 'lambda': 0.030457328254069726, 'al

FrozenTrial(number=96, value=0.9295616354707645, datetime_start=datetime.datetime(2020, 5, 15, 15, 56, 28, 116344), datetime_complete=datetime.datetime(2020, 5, 15, 15, 56, 41, 281511), params={'max_depth': 9, 'lambda': 0.13566321238073373, 'alpha': 2.7493796467413147e-06, 'eta': 0.7325531701663303, 'gamma': 2.357793178002584e-06}, distributions={'max_depth': IntUniformDistribution(high=9, low=1, step=1), 'lambda': LogUniformDistribution(high=1.0, low=1e-08), 'alpha': LogUniformDistribution(high=1.0, low=1e-08), 'eta': LogUniformDistribution(high=1.0, low=1e-08), 'gamma': LogUniformDistribution(high=1.0, low=1e-08)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=96, state=TrialState.COMPLETE)
Number of finished trials: 100
Best trial:
  Value: 0.9295616354707645
  Params: 
    max_depth: 9
    lambda: 0.13566321238073373
    alpha: 2.7493796467413147e-06
    eta: 0.7325531701663303
    gamma: 2.357793178002584e-06


In [22]:
import joblib
# joblib.dump(study, 'xgb_final.pkl')

In [23]:
# NOTE(review): despite its name, this object is used as the Optuna study: the
# only dump of 'xgb_final.pkl' in this notebook is the commented-out
# joblib.dump(study, ...), and the next cell reads .best_trial from it.
# joblib.load unpickles the file, so only load a pickle you created yourself.
xgb_clf = joblib.load('xgb_final.pkl')

In [24]:
# Start from the hyper-parameters of the best trial and add the fixed settings
# that were also part of the objective's parameter dict.
params = xgb_clf.best_trial.params
params.update({
        "tree_method":"hist",
        "seed" : 42,
        "objective": "binary:logistic",})
# Last expression of the cell: displays the assembled dict.
params

{'max_depth': 9,
 'lambda': 0.13566321238073373,
 'alpha': 2.7493796467413147e-06,
 'eta': 0.7325531701663303,
 'gamma': 2.357793178002584e-06,
 'tree_method': 'hist',
 'seed': 42,
 'objective': 'binary:logistic'}

In [25]:
# Out-of-fold predictions for the training rows, and test predictions averaged
# over the folds.
oof = np.zeros(len(X_train))
preds = np.zeros(len(X_test))

NFOLDS = 5
cols = X_train.columns
skf = KFold(n_splits=NFOLDS)
for i, (idxT, idxV) in enumerate( skf.split(X_train, y_train) ):
    # Slice each fold once and reuse the pieces for fitting and scoring.
    X_fit, y_fit = X_train[cols].iloc[idxT], y_train.iloc[idxT]
    X_val, y_val = X_train[cols].iloc[idxV], y_train.iloc[idxV]

    # Read the tuned values from `params` instead of keeping a hand-copied set
    # of literals that can drift from the study result. The native-API names
    # are mapped to the sklearn wrapper's argument names.
    clf = xgb.XGBClassifier(
             max_depth=params['max_depth'],
             reg_lambda=params['lambda'],
             reg_alpha=params['alpha'],
             learning_rate=params['eta'],
             gamma=params['gamma'],
             tree_method=params['tree_method'],
             seed=params['seed'],
             objective=params['objective'],
    )
    # NOTE(review): no n_estimators is passed, and the recorded log stops at
    # round [99] in every fold, so the 200-round early-stopping patience
    # apparently never triggers. Confirm, and raise n_estimators if early
    # stopping is meant to decide the number of rounds.
    clf.fit(X_fit, y_fit,
            eval_set=[(X_val, y_val)],
            verbose=100, early_stopping_rounds=200,eval_metric='auc')

    oof[idxV] += clf.predict_proba(X_val)[:,1]
    preds += clf.predict_proba(X_test[cols])[:,1]/skf.n_splits

    # Release the fold's model and data before the next fold starts.
    del clf, X_fit, y_fit, X_val, y_val
    gc.collect()
print('#'*20)
print ('Out of folds AUC=',roc_auc_score(y_train,oof))

[0]	validation_0-auc:0.840505
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.950042
[0]	validation_0-auc:0.810599
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.95345
[0]	validation_0-auc:0.817011
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.950207
[0]	validation_0-auc:0.807011
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.948529
[0]	validation_0-auc:0.841213
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.950008
####################
Out of folds AUC= 0.9505259444672531
