In [19]:
from logger import Logger

import pandas as pd
import numpy as np
import os

In [20]:
logger = Logger(show = True, html_output = True, config_file = "config.txt")

logger.log("Reading transactions file...")
df = pd.read_csv(logger.get_data_file(logger.config_dict['DATA_FILE']))
logger.log("Finish reading {} rows".format(df.shape[0]), show_time = True)
print(os.linesep + "Data snippet")
df.head()

2019.05.09-06:48:38 Initialize the logger
2019.05.09-06:48:38 Create logs folder logs
2019.05.09-06:48:38 Create log file logs\2019-05-09_06_48_38_log0.html
2019.05.09-06:48:38 Create models folder models
2019.05.09-06:48:38 Create output folder output
2019.05.09-06:48:38 Create data folder data
2019.05.09-06:48:38 Read config file config.txt
[2019.05.09-06:48:38] Reading transactions file...
[2019.05.09-06:48:39] Finish reading 290382 rows [1.06s]

Data snippet


Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id
0,1,2015-11-09 14:26:51,MX,mccredit,530056.0,64800.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-01 23:03:11,MexicoAccount,email68370,ip111778,card184798
1,2,2015-11-09 14:27:38,MX,mccredit,547046.0,44900.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-02 04:50:55,MexicoAccount,email101299,ip78749,card151595
2,3,2015-11-23 16:34:16,MX,mccredit,528843.0,149900.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-02 14:30:28,MexicoAccount,email278604,ip70594,card242142
3,4,2015-11-23 16:34:51,MX,mccredit,547146.0,109900.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-03 07:53:37,MexicoAccount,email47409,ip113648,card181744
4,5,2015-11-09 14:26:08,MX,visaclassic,477291.0,89900.0,MXN,MX,Ecommerce,Chargeback,True,0,2015-07-08 18:35:35,MexicoAccount,email205501,ip83553,card97271


In [21]:
crt_size = df.shape[0]

df['mail_id'] = df['mail_id'].str.replace('email','')
df.drop(df[df.mail_id == "NA"].index, inplace=True)
df['mail_id'] = pd.to_numeric(df['mail_id'])
logger.log("Dropping NA in email id {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0]

df['ip_id']   = df['ip_id'].str.replace('ip','')
df.drop(df[df.ip_id == "NA"].index, inplace=True)
df['ip_id'] = pd.to_numeric(df['ip_id'])
logger.log("Dropping NA in ip id {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0]

df['card_id'] = df['card_id'].str.replace('card','')
df.drop(df[df.card_id == "NA"].index, inplace=True)
df['card_id'] = pd.to_numeric(df['card_id'])
logger.log("Dropping NA in card id {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0]

[2019.05.09-06:48:39] Dropping NA in email id 366
[2019.05.09-06:48:40] Dropping NA in ip id 0
[2019.05.09-06:48:40] Dropping NA in card id 0


In [22]:
df.drop(df[df.simple_journal == "Refused"].index, inplace=True)
logger.log("Dropping REFUSED transaction {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0] 

[2019.05.09-06:48:41] Dropping REFUSED transaction 53318


In [23]:
logger.log("Number of NaNs per column:" + os.linesep + "{}".format(df.isna().sum()))

[2019.05.09-06:48:41] Number of NaNs per column:
txid                                0
bookingdate                         0
issuercountrycode                   4
txvariantcode                       0
bin                                 0
amount                              0
currencycode                        0
shoppercountrycode                342
shopperinteraction                  0
simple_journal                      0
cardverificationcodesupplied    13194
cvcresponsecode                     0
creationdate                        0
accountcode                         0
mail_id                             0
ip_id                               0
card_id                             0
dtype: int64


In [24]:
print(os.linesep + "Few examples of NaNs rows:")
df[df.isna().any(axis=1)].head()


Few examples of NaNs rows:


Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id
25297,39506,2015-09-27 20:21:39,MX,mccredit,528843.0,61530.0,MXN,,Ecommerce,Settled,True,0,2015-09-27 20:21:27,MexicoAccount,263004.0,86356.0,39750.0
25592,40058,2015-09-30 06:32:50,,mc,520116.0,119940.0,MXN,MX,Ecommerce,Settled,True,0,2015-09-30 06:32:40,MexicoAccount,47407.0,61517.0,46742.0
27227,43731,2015-10-19 20:40:32,,mc,520116.0,44800.0,MXN,MX,Ecommerce,Settled,True,0,2015-10-19 20:40:16,MexicoAccount,47407.0,61517.0,46742.0
27307,43927,2015-10-20 20:30:51,,mc,520116.0,111920.0,MXN,MX,Ecommerce,Settled,True,0,2015-10-20 20:30:43,MexicoAccount,164167.0,76472.0,269591.0
31536,50220,2015-07-02 04:35:15,AU,mccredit,535316.0,11000.0,AUD,AU,Ecommerce,Settled,,0,2015-07-01 03:45:57,APACAccount,267790.0,17828.0,194191.0


In [25]:
logger.log("NaNs from issuercountrycode that are Chargeback: {}".format(
    (df[df['issuercountrycode'].isna()].simple_journal == "Chargeback").sum()))
logger.log("NaNs from shoppercountrycode that are Chargeback: {}".format(
    (df[df['shoppercountrycode'].isna()].simple_journal == "Chargeback").sum()))
logger.log("NaNs from cardverificationcodesupplied that are Chargeback: {}".format(
    (df[df['cardverificationcodesupplied'].isna()].simple_journal == "Chargeback").sum()))

[2019.05.09-06:48:41] NaNs from issuercountrycode that are Chargeback: 0
[2019.05.09-06:48:41] NaNs from shoppercountrycode that are Chargeback: 0
[2019.05.09-06:48:41] NaNs from cardverificationcodesupplied that are Chargeback: 0


In [26]:
df.dropna(axis = 0, inplace = True)
logger.log("Drop the other NaNs: {} entries".format(crt_size - df.shape[0]))
crt_size = df.shape[0]

[2019.05.09-06:48:41] Drop the other NaNs: 13526 entries


In [27]:
logger.log("Convert datatypes for numeric and timestamps")

df['bookingdate']  = pd.to_datetime(df['bookingdate'])
df['creationdate'] = pd.to_datetime(df['creationdate'])
df = df.infer_objects()

df['bin'] = df['bin'].astype(object)
df['mail_id'] = df['mail_id'].astype(object)
df['card_id'] = df['card_id'].astype(object)
df['ip_id'] = df['ip_id'].astype(object)

print(os.linesep + "{}".format(df.dtypes))

[2019.05.09-06:48:41] Convert datatypes for numeric and timestamps

txid                                     int64
bookingdate                     datetime64[ns]
issuercountrycode                       object
txvariantcode                           object
bin                                     object
amount                                 float64
currencycode                            object
shoppercountrycode                      object
shopperinteraction                      object
simple_journal                          object
cardverificationcodesupplied              bool
cvcresponsecode                          int64
creationdate                    datetime64[ns]
accountcode                             object
mail_id                                 object
ip_id                                   object
card_id                                 object
dtype: object


In [28]:
logger.log("Sort after creationdate")
df.sort_values('creationdate', inplace = True)
df.reset_index(drop = True, inplace = True)

[2019.05.09-06:48:41] Sort after creationdate


In [29]:
logger.log("Currencies identified in data are {}".format(np.unique(df.currencycode.values)))

conversion = {'AUD': 0.699165, 'GBP': 1.31061, 'MXN': 0.222776586, 'NZD': 0.66152, 'SEK': 0.104405}
logger.log("Convert all amounts in USD using rates: {}".format(conversion))
df['amount'] = df.apply(lambda e: e['amount'] * conversion[e['currencycode']], axis=1)
logger.log("Done converting", show_time = True)

[2019.05.09-06:48:42] Currencies identified in data are ['AUD' 'GBP' 'MXN' 'NZD' 'SEK']
[2019.05.09-06:48:42] Convert all amounts in USD using rates: {'AUD': 0.699165, 'GBP': 1.31061, 'MXN': 0.222776586, 'NZD': 0.66152, 'SEK': 0.104405}
[2019.05.09-06:48:47] Done converting [5.01s]


In [30]:
logger.log("Drop txid and bookingdate")
df.drop(["txid", "bookingdate"], inplace = True, axis = 1)

logger.log("Change simple_journal to label")
df.replace({'simple_journal': {"Settled": 0, "Chargeback": 1}}, inplace = True)
df.rename(columns = {'simple_journal': 'label'}, inplace = True)

logger.log("Rearrange columns")
columns = ["creationdate", "card_id", "mail_id", "ip_id", "issuercountrycode", "txvariantcode", 
           "bin", "shoppercountrycode", "shopperinteraction", "cardverificationcodesupplied", 
           "cvcresponsecode", "accountcode", "amount", "currencycode", "label"]
df = df[columns]

[2019.05.09-06:48:47] Drop txid and bookingdate
[2019.05.09-06:48:47] Change simple_journal to label
[2019.05.09-06:48:47] Rearrange columns


In [31]:
from feats_utils import extract_features_from_data

logger.log("Start extracting additional features")
additional_feats_df = extract_features_from_data(df)
logger.log("Finished extracting additional features", show_time = True)

[2019.05.09-06:48:47] Start extracting additional features
[2019.05.09-06:49:36] Finished extracting additional features [49.58s]


In [32]:
full_df = pd.concat([df, additional_feats_df], axis = 1)

full_df_non_fraud = full_df[full_df.label == 0]
full_df_fraud = full_df[full_df.label == 1]

full_df_fraud.iloc[:, -10:].describe()
full_df_non_fraud.iloc[:, -10:].describe()

Unnamed: 0,label,Txn_Amount_Month,Average_3Months,Average_DailyMonth,Amount_SameDay,Number_Same_Day,Amount_Currency_Type_Month,Number_Currency_Type_Month,Amount_Country_Type_Month,Number_Country_Type_Month
count,222827.0,222827.0,222827.0,222827.0,222827.0,222827.0,222827.0,222827.0,222827.0,222827.0
mean,0.0,921.929208,222.302,59.32076,302.0377,0.030059,1778.468,0.189308,1688.29,0.170262
std,0.0,5318.25663,1714.451,541.0362,4967.229,0.437707,16229.47,2.174842,15220.17,1.801616
min,0.0,0.0,-3.031649e-13,-9.701277e-13,-2.910383e-11,0.0,-2.910383e-11,0.0,-3.910827e-11,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,318995.92095,79070.74,29820.96,440365.0,25.0,894628.9,99.0,831372.3,92.0


In [36]:
print(os.linesep + "Snippet of data after preprocessing")
full_df.iloc[123412:123417, :]


Snippet of data after preprocessing


Unnamed: 0,creationdate,card_id,mail_id,ip_id,issuercountrycode,txvariantcode,bin,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,...,label,Txn_Amount_Month,Average_3Months,Average_DailyMonth,Amount_SameDay,Number_Same_Day,Amount_Currency_Type_Month,Number_Currency_Type_Month,Amount_Country_Type_Month,Number_Country_Type_Month
123412,2015-09-04 13:15:16,129055,112364,234838,GB,visadebit,475116,GB,Ecommerce,True,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
123413,2015-09-04 13:15:17,128053,335613,157557,AU,visaclassic,450949,AU,Ecommerce,True,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
123414,2015-09-04 13:15:54,119212,307511,255562,GB,mccredit,540758,GB,Ecommerce,True,...,0,7716.216375,1286.036063,514.414425,0.0,0,15432.43275,2,15432.43275,2
123415,2015-09-04 13:16:07,81318,12338,232898,GB,mccredit,512687,GB,Ecommerce,True,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
123416,2015-09-04 13:16:34,269983,159864,342982,SE,mcdebit,534243,SE,Ecommerce,True,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
