In [None]:
import pandas as pd
import numpy as np
import math
import sys

#For plots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode()

In [None]:
from os import listdir
from os.path import join

In [None]:
def read_from_files(files):
    """Load several CSV files into one DataFrame ordered by transaction time.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read; each one is parsed with ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, sorted by the ``TX_TIME_SECONDS`` column.
        The per-file row labels are kept, so index values may repeat.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``files`` is empty.
    """
    frames = [pd.read_csv(f) for f in files]
    df_final = pd.concat(frames)
    # sort_values returns a new frame; the result has to be kept, otherwise
    # the data stays in file order.
    df_final = df_final.sort_values("TX_TIME_SECONDS")
    return df_final


In [None]:
DIR_INPUT = "/home/worldline/data/CSVagg/"

TRAIN_BEGIN = "aggtrx_20150424.csv"
TRAIN_END = "aggtrx_20150430.csv"

files_train = [join(DIR_INPUT, f) for f in listdir(DIR_INPUT) if f>=TRAIN_BEGIN and f<=TRAIN_END]

TEST_BEGIN = "aggtrx_20150508.csv"
TEST_END = "aggtrx_20150531.csv"

files_test = [join(DIR_INPUT, f) for f in listdir(DIR_INPUT) if f>=TEST_BEGIN and f<=TEST_END]

print("Load training files")
%time train_all=read_from_files(files_train)
del train_all["TX_ACCEPTED"]

train_all_1=train_all.copy(deep=True)
print("{} training files loaded, {} GB in memory".format(len(files_train),round(sys.getsizeof(train_all)/1073741824,2)))
print("Load test files")
%time test_all=read_from_files(files_test)
del test_all["TX_ACCEPTED"]

print("{} test files loaded, {} GB in memory".format(len(files_test),round(sys.getsizeof(test_all)/1073741824,2)))

In [None]:
def get_stats(df):
    """Compute three per-day series from a transaction frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``TX_TIME_DAYS``, ``CARD_PAN_ID`` and ``TX_FRAUD``.

    Returns
    -------
    tuple of pandas.Series
        ``(transactions per day, sum of TX_FRAUD per day,
        distinct cards with TX_FRAUD > 0 per day)``, each indexed by
        ``TX_TIME_DAYS``.
    """
    per_day = df.groupby(['TX_TIME_DAYS'])

    # count() counts the non-null CARD_PAN_ID entries of each day
    tx_counts = per_day['CARD_PAN_ID'].count()
    fraud_counts = per_day['TX_FRAUD'].sum()

    # Only rows flagged as fraud contribute to the distinct-card count
    fraud_rows = df.loc[df['TX_FRAUD'] > 0]
    fraud_card_counts = fraud_rows.groupby(['TX_TIME_DAYS'])['CARD_PAN_ID'].nunique()

    return tx_counts, fraud_counts, fraud_card_counts

In [None]:
def factor_to_risk(df, feature_to_transform):
    """Add one ``RISK_<factor>`` column per listed categorical column.

    The risk of a category is the mean of ``TX_FRAUD`` over all rows of ``df``
    that carry this category; it is joined back onto every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TX_FRAUD`` and every column in ``feature_to_transform``.
    feature_to_transform : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Result of the successive merges (``df`` itself if the list is empty).
    """
    for factor in feature_to_transform:
        print("Processing: "+factor)
        risk_name = "RISK_" + factor
        # Mean fraud label per category, as a one-column frame indexed by category
        risk = df.groupby(factor)["TX_FRAUD"].mean().rename(risk_name).to_frame()
        # NOTE(review): merge uses its default join type here; rows whose
        # factor value is missing presumably do not survive - confirm.
        df = df.merge(risk, left_on=factor, right_index = True)

    return df

In [None]:
def factor_from_previous_risk(df, df_past,feature_to_transform):
    """Attach to ``df`` the risk values that ``df_past`` already carries.

    For every name in ``feature_to_transform`` the pair
    ``(factor, "RISK_" + factor)`` is taken from ``df_past``, reduced to one
    row per factor value (last occurrence kept) and left-joined onto ``df``.
    The share of rows without a matching risk is printed per factor.
    Finally every missing value in the result is replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; it is never modified.
    df_past : pandas.DataFrame
        Frame holding each factor column together with its ``RISK_`` column.
    feature_to_transform : iterable of str
        Names of the factor columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``RISK_`` columns added and missing values set to 0.
    """
    for factor in feature_to_transform:
        riskfactor = "RISK_" + factor
        factor_risk = df_past[[factor, riskfactor]].drop_duplicates(subset=factor, keep='last')
        df = df.merge(factor_risk, how='left', on=factor)
        nbrna = df[riskfactor].isnull().sum()
        nbrrow = df.shape[0]
        # An empty frame has no unmatched rows; avoid dividing by zero
        pct_missing = (nbrna/nbrrow)*100 if nbrrow else 0.0
        print("No risk found for factor {}  in {}% of cases".format(riskfactor, pct_missing))
    # fillna without inplace: when feature_to_transform is empty, df is still
    # the caller's object and must not be altered.
    # NOTE(review): this fills missing values in every column, not only in the
    # RISK_ columns - confirm that this is intended.
    return df.fillna(0)

In [None]:
def remove_fraud_training(train, test):
    """Drop from ``test`` every row whose card is flagged as fraud in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs the columns ``TX_FRAUD`` and ``CARD_PAN_ID``.
    test : pandas.DataFrame
        Needs the column ``CARD_PAN_ID``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``test`` whose ``CARD_PAN_ID`` does not occur among the
        training rows with ``TX_FRAUD == True``.
    """
    fraud_cards = train.loc[train.TX_FRAUD == True, 'CARD_PAN_ID']
    is_known_fraud_card = test.CARD_PAN_ID.isin(fraud_cards)
    return test[~is_known_fraud_card]

In [None]:
# Columns selected for the first (basic-features) experiment.
COL_NAME_BASE=['CARD_PAN_ID', 'TX_AMOUNT', 'TX_FRAUD', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 
              'TERM_MIDUID', 'TERM_MCC', 'TERM_COUNTRY', 'TX_3D_SECURE', 'LANGUAGE',
               'GENDER', 'BROKER', 'CARD_BRAND']


In [None]:
# Restrict both data sets to the selected columns.
train=train_all[COL_NAME_BASE]
test=test_all[COL_NAME_BASE]

In [None]:
# Columns that get a numeric RISK_<name> companion column.
feature_to_transform=['TERM_MIDUID', 'TERM_MCC', 'TERM_COUNTRY', 'TX_3D_SECURE', 'LANGUAGE',
               'GENDER', 'BROKER', 'CARD_BRAND']

In [None]:
# Risks are computed on the training set (mean TX_FRAUD per category);
# the test set then looks those training risks up instead of computing its own.
%time train=factor_to_risk(train,feature_to_transform)
%time test=factor_from_previous_risk(test,train,feature_to_transform)

In [None]:
# Drop test rows of cards that are flagged as fraud in the training set.
test=remove_fraud_training(train, test)

In [None]:
test.info()

In [None]:
# Remove the original columns from test; these are the same names as in
# feature_to_transform, so only their RISK_ counterparts remain.
del test["TERM_MIDUID"]
del test['TERM_MCC']
del test['TERM_COUNTRY']
del test['TX_3D_SECURE']
del test['LANGUAGE']
del test['GENDER']
del test['BROKER']
del test['CARD_BRAND']


In [None]:
# Same removal for the training set.
del train["TERM_MIDUID"]
del train['TERM_MCC']
del train['TERM_COUNTRY']
del train['TX_3D_SECURE']
del train['LANGUAGE']
del train['GENDER']
del train['BROKER']
del train['CARD_BRAND']

In [None]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [None]:
# Initialise the H2O session on port 54331.
h2o.init(port=54331)

In [None]:
# Convert the pandas test set into an H2OFrame named df_test (timed).
%time df_test=h2o.H2OFrame(test)

In [None]:
def compute_model_and_get_predictions(train, test, features, n_resample=10,random_state=0,trace=False,ntrees=10):
    """Average the predictions of random forests fitted on balanced resamples.

    Each round draws, with replacement, as many ``TX_FRAUD == 0`` rows as
    there are ``TX_FRAUD == 1`` rows in ``train``, plus an equally sized
    bootstrap of the ``TX_FRAUD == 1`` rows, fits an H2O random forest on
    that sample and scores ``test``. The per-round scores are averaged.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``TX_FRAUD`` and every column listed in ``features``.
    test : pandas.DataFrame
        Rows to score; converted to an H2OFrame once per call.
    features : list of str
        Predictor columns handed to the model.
    n_resample : int
        Number of resampling rounds (one model per round).
    random_state : int
        Base seed; round ``i`` samples with ``random_state + i`` and
        ``random_state + i + n_resample``.
    trace : bool
        Print the round number when True.
    ntrees : int
        Number of trees of each random forest.

    Returns
    -------
    numpy.ndarray
        One averaged prediction per row of ``test``.

    Raises
    ------
    ValueError
        If ``train`` has no row with ``TX_FRAUD == 1``.
    """
    resp_col='TX_FRAUD'
    train_0=train[train.TX_FRAUD==0]
    train_1=train[train.TX_FRAUD==1]
    ndata_fraud=train_1.shape[0]
    if ndata_fraud == 0:
        # Without fraud rows every balanced sample would be empty
        raise ValueError("train contains no row with TX_FRAUD == 1")

    predictions=np.zeros(shape=(test.shape[0],n_resample))
    model = H2ORandomForestEstimator(ntrees=ntrees, max_depth=15, nfolds=10,
                                     binomial_double_trees=True, stopping_metric= "auc")

    # Score the frame that was passed in rather than a notebook-level global,
    # so the predictions always line up with the rows of `test`.
    test_frame=h2o.H2OFrame(test)
    for i in range(n_resample):
        if trace:
            print ("Round "+str(i))
        data0=train_0.sample(n=ndata_fraud,random_state=i+random_state,replace=True)
        data1=train_1.sample(n=ndata_fraud,random_state=i+n_resample+random_state,replace=True)
        # pd.concat replaces DataFrame.append, which newer pandas no longer has
        data_train=pd.concat([data0, data1])
        df_train=h2o.H2OFrame(data_train)
        # Train on the requested predictors (the `features` argument)
        model.train(x=features, y=resp_col, training_frame=df_train)
        pred=model.predict(test_frame)
        pr=h2o.h2o.as_list(pred, use_pandas=True)
        # .values replaces the removed DataFrame.as_matrix()
        predictions[:,[i]]=pr.values

    return np.mean(predictions,1)

In [None]:
# Predictor columns of the basic-feature model: the amount plus the RISK_
# columns built from feature_to_transform.
train_col=['TX_AMOUNT', 'RISK_TERM_MIDUID', 'RISK_TERM_MCC',
       'RISK_TERM_COUNTRY', 'RISK_TX_3D_SECURE', 'RISK_LANGUAGE',
       'RISK_GENDER', 'RISK_BROKER', 'RISK_CARD_BRAND']

In [None]:
# Store the averaged model output of 10 resampling rounds as a new column of test.
%time test['predictions']=compute_model_and_get_predictions(train, test, train_col, n_resample=10, random_state=0,trace=False)


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score

In [None]:
def simulate_fraud_detection_day_by_day(df_test, score, top,trace=False):
    """Replay the scored test set one day at a time and measure alert quality.

    For each day, in ascending ``TX_TIME_DAYS`` order:

    1. rows of cards already caught on an earlier day are dropped;
    2. the rest is collapsed to one row per card (maximum of ``score`` and
       of ``TX_FRAUD``) and ranked by descending score;
    3. the ``top`` highest-ranked cards are inspected, and the fraudulent
       ones among them are remembered as caught.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Needs ``TX_TIME_DAYS``, ``CARD_PAN_ID``, ``TX_FRAUD`` and ``score``.
    score : str
        Name of the column holding the model score.
    top : int
        Number of cards inspected per day.
    trace : bool
        Print a summary line per day when True.

    Returns
    -------
    numpy.ndarray
        One row per day: ``[day, percent_in_top, auc_pr, auc_roc]``.
        A metric that is undefined for a day is NaN: ``percent_in_top`` and
        ``auc_pr`` when the day has no fraud, ``auc_roc`` when the day has
        only one class. Use ``np.nanmean`` to aggregate if that can occur.
    """
    nan = float("nan")
    ordered_days = sorted(df_test['TX_TIME_DAYS'].unique())
    print("Number of days : {}".format(len(ordered_days)))
    known_fraud_cards = []
    result = []

    for day in ordered_days:

        df_test_day = df_test[(df_test['TX_TIME_DAYS'] == day)]
        df_test_day = df_test_day[[score, 'CARD_PAN_ID', 'TX_FRAUD']]
        df_test_day = df_test_day[~df_test_day.CARD_PAN_ID.isin(known_fraud_cards)]

        # One row per card, best score first
        df_test_day = df_test_day.groupby('CARD_PAN_ID').max().sort_values(by=score, ascending=False).reset_index()
        df_top = df_test_day.head(top)
        nbr_correct_pred = df_top.TX_FRAUD.sum()

        n_frauds = df_test_day.TX_FRAUD.sum()
        # At most `top` frauds can be found, so normalise by the smaller number
        n_frauds_norm = min(top, n_frauds)

        if n_frauds_norm > 0:
            percent_in_top = nbr_correct_pred / n_frauds_norm * 100
        else:
            percent_in_top = nan

        # `average` is keyword-only in current scikit-learn releases
        if n_frauds > 0:
            auc_pr = round(average_precision_score(df_test_day.TX_FRAUD, df_test_day[score], average='micro'),2)
        else:
            auc_pr = nan
        # ROC AUC needs both classes; roc_auc_score raises otherwise
        if df_test_day.TX_FRAUD.nunique() > 1:
            auc_roc = round(roc_auc_score(df_test_day.TX_FRAUD, df_test_day[score], average='micro'),2)
        else:
            auc_roc = nan

        result.append([day,percent_in_top,auc_pr, auc_roc])
        if trace:
            print("Fraud in top",str(top),"found in day {}:  {}% ({} out of {}, with ".format(day,percent_in_top,nbr_correct_pred,n_frauds)+str(top)+" as upper limit). AUC PR: {}. AUC ROC: {}.".format(auc_pr, auc_roc))
        # Cards confirmed as fraud today are excluded from the following days
        known_fraud_cards.extend(df_top[df_top.TX_FRAUD==1].CARD_PAN_ID)

    return np.array(result)

In [None]:
# Day-by-day evaluation on the 'predictions' column, inspecting 100 cards per day.
%time results=simulate_fraud_detection_day_by_day(test,'predictions',100)

# Results (Basic Features)

In [None]:
# Column-wise mean / standard deviation over the days. The result columns are
# [day, percent_in_top, auc_pr, auc_roc], so the first value printed refers to
# the day number and is not a quality metric.
print(np.mean(results,0))
print(np.std(results,0))

In [None]:
h2o.cluster().shutdown()

# More features

In [None]:
# Extended column selection: the basic columns plus the additional
# amount / count / previous-transaction / terminal-grouping columns.
COL_NAME_BASE=['CARD_PAN_ID', 'TX_AMOUNT', 'TX_FRAUD', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 
              'TERM_MIDUID', 'TERM_MCC', 'TERM_COUNTRY', 'TX_3D_SECURE', 'LANGUAGE',
               'GENDER', 'BROKER', 'CARD_BRAND', 'MIN_AMT_LAST_24H', 'SUM_AMT_LAST_24H',
               'TX_DIFF_LAST_TX', 'TERM_REGION', 'TERM_CONTINENT','TERM_MCCG', 'TERM_MCC_GROUP', 
               'LAST_MIDUID_TX', 'LAST_COUNTRY_TX', 'LAST_MCC_HIS','NB_TRX_LAST_24H']

In [None]:
# Rebuild train and test from the loaded data with the extended columns.
train=train_all[COL_NAME_BASE]
test=test_all[COL_NAME_BASE]

In [None]:
# Columns that get a numeric RISK_<name> companion column in this section.
feature_to_transform=['TERM_MIDUID', 'TERM_MCC', 'TERM_COUNTRY', 'TX_3D_SECURE', 'LANGUAGE',
                       'GENDER', 'BROKER', 'CARD_BRAND', 'LAST_MIDUID_TX', 'LAST_COUNTRY_TX', 
                      'LAST_MCC_HIS', 'TERM_REGION', 'TERM_CONTINENT', 'TERM_MCCG', 'TERM_MCC_GROUP']


In [None]:
# Risks come from the training set; the test set looks them up.
%time train=factor_to_risk(train,feature_to_transform)
%time test=factor_from_previous_risk(test,train,feature_to_transform)

In [None]:
# Drop test rows of cards that are flagged as fraud in the training set.
test=remove_fraud_training(train, test)

In [None]:
test.info()

In [None]:
# Remove the original columns from test (the names in feature_to_transform).
del test["TERM_MIDUID"]
del test['TERM_MCC']
del test['TERM_COUNTRY']
del test['TX_3D_SECURE']
del test['LANGUAGE']
del test['GENDER']
del test['BROKER']
del test['CARD_BRAND']
#del test['TX_TIME_HOURS']
del test['TERM_REGION']
del test['TERM_CONTINENT']
del test['TERM_MCCG']
del test['TERM_MCC_GROUP']
del test['LAST_MIDUID_TX']
del test['LAST_COUNTRY_TX']
del test['LAST_MCC_HIS']

In [None]:
# Same removal for the training set.
del train["TERM_MIDUID"]
del train['TERM_MCC']
del train['TERM_COUNTRY']
del train['TX_3D_SECURE']
del train['LANGUAGE']
del train['GENDER']
del train['BROKER']
del train['CARD_BRAND']
#del test['TX_TIME_HOURS']
del train['TERM_REGION']
del train['TERM_CONTINENT']
del train['TERM_MCCG']
del train['TERM_MCC_GROUP']
del train['LAST_MIDUID_TX']
del train['LAST_COUNTRY_TX']
del train['LAST_MCC_HIS']

In [None]:
# Predictor list for the all-features runs: the numeric columns plus every RISK_ column.
features=['TX_AMOUNT', 'MIN_AMT_LAST_24H', 'SUM_AMT_LAST_24H',
       'TX_DIFF_LAST_TX', 'NB_TRX_LAST_24H','RISK_TERM_MIDUID', 'RISK_TERM_MCC',
       'RISK_TERM_COUNTRY', 'RISK_TX_3D_SECURE', 'RISK_LANGUAGE',
       'RISK_GENDER', 'RISK_BROKER', 'RISK_CARD_BRAND', 'RISK_LAST_MIDUID_TX',
       'RISK_LAST_COUNTRY_TX', 'RISK_LAST_MCC_HIS', 'RISK_TERM_REGION', 'RISK_TERM_CONTINENT',
       'RISK_TERM_MCCG', 'RISK_TERM_MCC_GROUP']

In [None]:
# The previous session was shut down above, so a new one is initialised.
h2o.init(port=54331)

In [None]:
# Convert the current test set into an H2OFrame named df_test (timed).
%time df_test=h2o.H2OFrame(test)

In [None]:
# Score test with the extended list `features` passed as predictors (10 rounds).
%time test['predictions']=compute_model_and_get_predictions(train, test, features, n_resample=10, random_state=0,trace=False)

In [None]:
%time results=simulate_fraud_detection_day_by_day(test,'predictions',100)

# Results (all features)

In [None]:
# Column-wise mean / standard deviation over the days (first column is the day).
print(np.mean(results,0))
print(np.std(results,0))

In [None]:
h2o.cluster().shutdown()

# 50 trees (all features)

In [None]:
h2o.init(port=54331)

In [None]:
# test now also carries the 'predictions' column written above; it is
# uploaded together with the other columns.
%time df_test=h2o.H2OFrame(test)

In [None]:
def compute_model_and_get_predictions(train, test, features, n_resample=50,random_state=0,trace=False,ntrees=50):
    """Average the predictions of random forests fitted on balanced resamples.

    Each round draws, with replacement, as many ``TX_FRAUD == 0`` rows as
    there are ``TX_FRAUD == 1`` rows in ``train``, plus an equally sized
    bootstrap of the ``TX_FRAUD == 1`` rows, fits an H2O random forest on
    that sample and scores ``test``. The per-round scores are averaged.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``TX_FRAUD`` and every column listed in ``features``.
    test : pandas.DataFrame
        Rows to score; converted to an H2OFrame once per call.
    features : list of str
        Predictor columns handed to the model.
    n_resample : int
        Number of resampling rounds (one model per round).
    random_state : int
        Base seed; round ``i`` samples with ``random_state + i`` and
        ``random_state + i + n_resample``.
    trace : bool
        Print the round number when True.
    ntrees : int
        Number of trees of each random forest.

    Returns
    -------
    numpy.ndarray
        One averaged prediction per row of ``test``.

    Raises
    ------
    ValueError
        If ``train`` has no row with ``TX_FRAUD == 1``.
    """
    resp_col='TX_FRAUD'
    train_0=train[train.TX_FRAUD==0]
    train_1=train[train.TX_FRAUD==1]
    ndata_fraud=train_1.shape[0]
    if ndata_fraud == 0:
        # Without fraud rows every balanced sample would be empty
        raise ValueError("train contains no row with TX_FRAUD == 1")

    predictions=np.zeros(shape=(test.shape[0],n_resample))
    model = H2ORandomForestEstimator(ntrees=ntrees, max_depth=15, nfolds=10,
                                     binomial_double_trees=True, stopping_metric= "auc")

    # Score the frame that was passed in rather than a notebook-level global,
    # so the predictions always line up with the rows of `test`.
    test_frame=h2o.H2OFrame(test)
    for i in range(n_resample):
        if trace:
            print ("Round "+str(i))
        data0=train_0.sample(n=ndata_fraud,random_state=i+random_state,replace=True)
        data1=train_1.sample(n=ndata_fraud,random_state=i+n_resample+random_state,replace=True)
        # pd.concat replaces DataFrame.append, which newer pandas no longer has
        data_train=pd.concat([data0, data1])
        df_train=h2o.H2OFrame(data_train)
        # Train on the requested predictors (the `features` argument)
        model.train(x=features, y=resp_col, training_frame=df_train)
        pred=model.predict(test_frame)
        pr=h2o.h2o.as_list(pred, use_pandas=True)
        # .values replaces the removed DataFrame.as_matrix()
        predictions[:,[i]]=pr.values

    return np.mean(predictions,1)

In [None]:
# Overwrites test['predictions'] with the scores of the 50-round run.
%time test['predictions']=compute_model_and_get_predictions(train, test, features, n_resample=50, random_state=0,trace=False)

In [None]:
%time results=simulate_fraud_detection_day_by_day(test,'predictions',100)

# Results (all features 50 trees/50 runs)

In [None]:
# Column-wise mean / standard deviation over the days (first column is the day).
print(np.mean(results,0))
print(np.std(results,0))

In [None]:
h2o.cluster().shutdown()

# Basic features (50 trees/50 runs)

In [None]:
h2o.init(port=54331)

In [None]:
# Back to the basic predictor list; train and test are not rebuilt here, so
# the frames prepared for the all-features section are reused.
train_col=['TX_AMOUNT', 'RISK_TERM_MIDUID', 'RISK_TERM_MCC',
       'RISK_TERM_COUNTRY', 'RISK_TX_3D_SECURE', 'RISK_LANGUAGE',
       'RISK_GENDER', 'RISK_BROKER', 'RISK_CARD_BRAND']

In [None]:
%time df_test=h2o.H2OFrame(test)

In [None]:
def compute_model_and_get_predictions(train, test, features, n_resample=50,random_state=0,trace=False,ntrees=50):
    """Average the predictions of random forests fitted on balanced resamples.

    Each round draws, with replacement, as many ``TX_FRAUD == 0`` rows as
    there are ``TX_FRAUD == 1`` rows in ``train``, plus an equally sized
    bootstrap of the ``TX_FRAUD == 1`` rows, fits an H2O random forest on
    that sample and scores ``test``. The per-round scores are averaged.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``TX_FRAUD`` and every column listed in ``features``.
    test : pandas.DataFrame
        Rows to score; converted to an H2OFrame once per call.
    features : list of str
        Predictor columns handed to the model.
    n_resample : int
        Number of resampling rounds (one model per round).
    random_state : int
        Base seed; round ``i`` samples with ``random_state + i`` and
        ``random_state + i + n_resample``.
    trace : bool
        Print the round number when True.
    ntrees : int
        Number of trees of each random forest.

    Returns
    -------
    numpy.ndarray
        One averaged prediction per row of ``test``.

    Raises
    ------
    ValueError
        If ``train`` has no row with ``TX_FRAUD == 1``.
    """
    resp_col='TX_FRAUD'
    train_0=train[train.TX_FRAUD==0]
    train_1=train[train.TX_FRAUD==1]
    ndata_fraud=train_1.shape[0]
    if ndata_fraud == 0:
        # Without fraud rows every balanced sample would be empty
        raise ValueError("train contains no row with TX_FRAUD == 1")

    predictions=np.zeros(shape=(test.shape[0],n_resample))
    model = H2ORandomForestEstimator(ntrees=ntrees, max_depth=15, nfolds=10,
                                     binomial_double_trees=True, stopping_metric= "auc")

    # Score the frame that was passed in rather than a notebook-level global,
    # so the predictions always line up with the rows of `test`.
    test_frame=h2o.H2OFrame(test)
    for i in range(n_resample):
        if trace:
            print ("Round "+str(i))
        data0=train_0.sample(n=ndata_fraud,random_state=i+random_state,replace=True)
        data1=train_1.sample(n=ndata_fraud,random_state=i+n_resample+random_state,replace=True)
        # pd.concat replaces DataFrame.append, which newer pandas no longer has
        data_train=pd.concat([data0, data1])
        df_train=h2o.H2OFrame(data_train)
        # Train on the requested predictors (the `features` argument)
        model.train(x=features, y=resp_col, training_frame=df_train)
        pred=model.predict(test_frame)
        pr=h2o.h2o.as_list(pred, use_pandas=True)
        # .values replaces the removed DataFrame.as_matrix()
        predictions[:,[i]]=pr.values

    return np.mean(predictions,1)

In [None]:
# Overwrites test['predictions'] with the scores obtained from the basic predictor list (50 rounds).
%time test['predictions']=compute_model_and_get_predictions(train, test, train_col, n_resample=50, random_state=0,trace=False)

In [None]:
%time results=simulate_fraud_detection_day_by_day(test,'predictions',100)

 # Results (Basic features 50/50)

In [None]:
# Column-wise mean / standard deviation over the days (first column is the day).
print(np.mean(results,0))
print(np.std(results,0))

In [None]:
h2o.cluster().shutdown()

# Variables without transformation (basic features)

In [None]:
# Earlier cells deleted the raw categorical columns (TERM_MIDUID, TERM_MCC,
# ...) from train and test, yet this section trains on exactly those columns.
# Rebuild both frames from the loaded data so the raw columns exist again.
# COL_NAME_BASE still holds the extended selection at this point, which
# contains every raw column used below.
train=train_all[COL_NAME_BASE]
test=test_all[COL_NAME_BASE]

train.info()
test.info()

In [None]:
# Drop test rows of cards that are flagged as fraud in the training set.
test=remove_fraud_training(train, test)

In [None]:
h2o.init(port=54331)

In [None]:
%time df_test=h2o.H2OFrame(test)

In [None]:
# Predictor list of this section: the original columns themselves rather than
# their RISK_ counterparts.
train_col=['TX_AMOUNT', 'TERM_MIDUID', 'TERM_MCC',
       'TERM_COUNTRY', 'TX_3D_SECURE', 'LANGUAGE',
       'GENDER', 'BROKER', 'CARD_BRAND']

In [None]:
def compute_model_and_get_predictions(train, test, features, n_resample=10,random_state=0,trace=False,ntrees=10):
    """Average the predictions of random forests fitted on balanced resamples.

    Each round draws, with replacement, as many ``TX_FRAUD == 0`` rows as
    there are ``TX_FRAUD == 1`` rows in ``train``, plus an equally sized
    bootstrap of the ``TX_FRAUD == 1`` rows, fits an H2O random forest on
    that sample and scores ``test``. The per-round scores are averaged.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``TX_FRAUD`` and every column listed in ``features``.
    test : pandas.DataFrame
        Rows to score; converted to an H2OFrame once per call.
    features : list of str
        Predictor columns handed to the model.
    n_resample : int
        Number of resampling rounds (one model per round).
    random_state : int
        Base seed; round ``i`` samples with ``random_state + i`` and
        ``random_state + i + n_resample``.
    trace : bool
        Print the round number when True.
    ntrees : int
        Number of trees of each random forest.

    Returns
    -------
    numpy.ndarray
        One averaged prediction per row of ``test``.

    Raises
    ------
    ValueError
        If ``train`` has no row with ``TX_FRAUD == 1``.
    """
    resp_col='TX_FRAUD'
    train_0=train[train.TX_FRAUD==0]
    train_1=train[train.TX_FRAUD==1]
    ndata_fraud=train_1.shape[0]
    if ndata_fraud == 0:
        # Without fraud rows every balanced sample would be empty
        raise ValueError("train contains no row with TX_FRAUD == 1")

    predictions=np.zeros(shape=(test.shape[0],n_resample))
    model = H2ORandomForestEstimator(ntrees=ntrees, max_depth=15, nfolds=10,
                                     binomial_double_trees=True, stopping_metric= "auc")

    # Score the frame that was passed in rather than a notebook-level global,
    # so the predictions always line up with the rows of `test`.
    test_frame=h2o.H2OFrame(test)
    for i in range(n_resample):
        if trace:
            print ("Round "+str(i))
        data0=train_0.sample(n=ndata_fraud,random_state=i+random_state,replace=True)
        data1=train_1.sample(n=ndata_fraud,random_state=i+n_resample+random_state,replace=True)
        # pd.concat replaces DataFrame.append, which newer pandas no longer has
        data_train=pd.concat([data0, data1])
        df_train=h2o.H2OFrame(data_train)
        # Train on the requested predictors (the `features` argument)
        model.train(x=features, y=resp_col, training_frame=df_train)
        pred=model.predict(test_frame)
        pr=h2o.h2o.as_list(pred, use_pandas=True)
        # .values replaces the removed DataFrame.as_matrix()
        predictions[:,[i]]=pr.values

    return np.mean(predictions,1)

In [None]:
# Score test with the raw-column predictor list train_col (10 rounds).
%time test['predictions']=compute_model_and_get_predictions(train, test, train_col, n_resample=10, random_state=0,trace=False)

In [None]:
%time results=simulate_fraud_detection_day_by_day(test,'predictions',100)

# Results (without transformation 10/10)

In [None]:
# Column-wise mean / standard deviation over the days (first column is the day).
print(np.mean(results,0))
print(np.std(results,0))

In [None]:
h2o.cluster().shutdown()

# All variables (without transformation)

In [None]:
# Same extended column selection as in the "More features" section.
COL_NAME_BASE=['CARD_PAN_ID', 'TX_AMOUNT', 'TX_FRAUD', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 
              'TERM_MIDUID', 'TERM_MCC', 'TERM_COUNTRY', 'TX_3D_SECURE', 'LANGUAGE',
               'GENDER', 'BROKER', 'CARD_BRAND', 'MIN_AMT_LAST_24H', 'SUM_AMT_LAST_24H',
               'TX_DIFF_LAST_TX', 'TERM_REGION', 'TERM_CONTINENT','TERM_MCCG', 'TERM_MCC_GROUP', 
               'LAST_MIDUID_TX', 'LAST_COUNTRY_TX', 'LAST_MCC_HIS','NB_TRX_LAST_24H']

In [None]:
# Rebuild train and test from the loaded data; no RISK_ columns are added in this section.
train=train_all[COL_NAME_BASE]
test=test_all[COL_NAME_BASE]

In [None]:
# Drop test rows of cards that are flagged as fraud in the training set.
test=remove_fraud_training(train, test)

In [None]:
# Predictor list: the numeric columns plus the original (untransformed) columns.
features=['TX_AMOUNT', 'MIN_AMT_LAST_24H', 'SUM_AMT_LAST_24H',
       'TX_DIFF_LAST_TX', 'NB_TRX_LAST_24H','TERM_MIDUID', 'TERM_MCC',
       'TERM_COUNTRY', 'TX_3D_SECURE', 'LANGUAGE',
       'GENDER', 'BROKER', 'CARD_BRAND', 'LAST_MIDUID_TX',
       'LAST_COUNTRY_TX', 'LAST_MCC_HIS', 'TERM_REGION', 'TERM_CONTINENT',
       'TERM_MCCG', 'TERM_MCC_GROUP']

In [None]:
h2o.init(port=54331)

In [None]:
%time df_test=h2o.H2OFrame(test)

In [None]:
# Score test with the extended raw-column list `features` passed as predictors (10 rounds).
%time test['predictions']=compute_model_and_get_predictions(train, test, features, n_resample=10, random_state=0,trace=False)

In [None]:
%time results=simulate_fraud_detection_day_by_day(test,'predictions',100)

# Results (all features without transformation 10/10)

In [None]:
# Column-wise mean / standard deviation over the days (first column is the day).
print(np.mean(results,0))
print(np.std(results,0))

# All features without transformation 50/50

In [None]:
def compute_model_and_get_predictions(train, test, features, n_resample=50,random_state=0,trace=False,ntrees=50):
    """Average the predictions of random forests fitted on balanced resamples.

    Each round draws, with replacement, as many ``TX_FRAUD == 0`` rows as
    there are ``TX_FRAUD == 1`` rows in ``train``, plus an equally sized
    bootstrap of the ``TX_FRAUD == 1`` rows, fits an H2O random forest on
    that sample and scores ``test``. The per-round scores are averaged.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``TX_FRAUD`` and every column listed in ``features``.
    test : pandas.DataFrame
        Rows to score; converted to an H2OFrame once per call.
    features : list of str
        Predictor columns handed to the model.
    n_resample : int
        Number of resampling rounds (one model per round).
    random_state : int
        Base seed; round ``i`` samples with ``random_state + i`` and
        ``random_state + i + n_resample``.
    trace : bool
        Print the round number when True.
    ntrees : int
        Number of trees of each random forest.

    Returns
    -------
    numpy.ndarray
        One averaged prediction per row of ``test``.

    Raises
    ------
    ValueError
        If ``train`` has no row with ``TX_FRAUD == 1``.
    """
    resp_col='TX_FRAUD'
    train_0=train[train.TX_FRAUD==0]
    train_1=train[train.TX_FRAUD==1]
    ndata_fraud=train_1.shape[0]
    if ndata_fraud == 0:
        # Without fraud rows every balanced sample would be empty
        raise ValueError("train contains no row with TX_FRAUD == 1")

    predictions=np.zeros(shape=(test.shape[0],n_resample))
    model = H2ORandomForestEstimator(ntrees=ntrees, max_depth=15, nfolds=10,
                                     binomial_double_trees=True, stopping_metric= "auc")

    # Score the frame that was passed in rather than a notebook-level global,
    # so the predictions always line up with the rows of `test`.
    test_frame=h2o.H2OFrame(test)
    for i in range(n_resample):
        if trace:
            print ("Round "+str(i))
        data0=train_0.sample(n=ndata_fraud,random_state=i+random_state,replace=True)
        data1=train_1.sample(n=ndata_fraud,random_state=i+n_resample+random_state,replace=True)
        # pd.concat replaces DataFrame.append, which newer pandas no longer has
        data_train=pd.concat([data0, data1])
        df_train=h2o.H2OFrame(data_train)
        # Train on the requested predictors (the `features` argument)
        model.train(x=features, y=resp_col, training_frame=df_train)
        pred=model.predict(test_frame)
        pr=h2o.h2o.as_list(pred, use_pandas=True)
        # .values replaces the removed DataFrame.as_matrix()
        predictions[:,[i]]=pr.values

    return np.mean(predictions,1)

In [None]:
# Overwrites test['predictions'] with the scores of the 50-round run on the raw-column list `features`.
%time test['predictions']=compute_model_and_get_predictions(train, test, features, n_resample=50, random_state=0,trace=False)