In [1]:
import pandas as pd
import numpy as np
from logger import Logger

In [2]:
logger = Logger(show = True, html_output = True, config_file = "config.txt")

logger.log("Reading transactions file...")
df = pd.read_csv(logger.get_data_file(logger.config_dict['DATA_FILE']))
logger.log("Finish reading {} rows".format(df.shape[0]), show_time = True)

2019.05.08-23:01:44 Initialize the logger
2019.05.08-23:01:44 Create logs folder logs
2019.05.08-23:01:44 Create log file logs/2019-05-08_23_01_44_log0.html
2019.05.08-23:01:44 Create models folder models
2019.05.08-23:01:44 Create output folder output
2019.05.08-23:01:44 Create data folder data
2019.05.08-23:01:44 Read config file config.txt
[2019.05.08-23:01:44] Reading transactions file...
[2019.05.08-23:01:46] Finish reading 290382 rows [1.26s]


In [3]:
df.head()
crt_size = df.shape[0]

In [4]:
df['mail_id'] = df['mail_id'].str.replace('email','')
df.drop(df[df.mail_id == "NA"].index, inplace=True)
df['mail_id'] = pd.to_numeric(df['mail_id'])
logger.log("Dropping NA in email id {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0]

df['ip_id']   = df['ip_id'].str.replace('ip','')
df.drop(df[df.ip_id == "NA"].index, inplace=True)
df['ip_id'] = pd.to_numeric(df['mail_id'])
logger.log("Dropping NA in ip id {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0]

df['card_id'] = df['card_id'].str.replace('card','')
df.drop(df[df.card_id == "NA"].index, inplace=True)
df['card_id'] = pd.to_numeric(df['mail_id'])
logger.log("Dropping NA in card id {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0]

[2019.05.08-23:01:46] Dropping NA in email id 366
[2019.05.08-23:01:47] Dropping NA in ip id 0
[2019.05.08-23:01:47] Dropping NA in card id 0


In [5]:
df.drop(df[df.simple_journal == "Refused"].index, inplace=True)
logger.log("Dropping REFUSED transaction {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0] 

[2019.05.08-23:01:47] Dropping REFUSED transaction 53318


In [6]:
logger.log("Number of NaNs per column:")
df.isna().sum()

[2019.05.08-23:01:47] Number of NaNs per column:


txid                                0
bookingdate                         0
issuercountrycode                   4
txvariantcode                       0
bin                                 0
amount                              0
currencycode                        0
shoppercountrycode                342
shopperinteraction                  0
simple_journal                      0
cardverificationcodesupplied    13194
cvcresponsecode                     0
creationdate                        0
accountcode                         0
mail_id                             0
ip_id                               0
card_id                             0
dtype: int64

In [7]:
logger.log("Few examples of NaNs rows:")
df[df.isna().any(axis=1)].head()

[2019.05.08-23:01:47] Few examples of NaNs rows:


Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id
25297,39506,2015-09-27 20:21:39,MX,mccredit,528843.0,61530.0,MXN,,Ecommerce,Settled,True,0,2015-09-27 20:21:27,MexicoAccount,263004.0,263004.0,263004.0
25592,40058,2015-09-30 06:32:50,,mc,520116.0,119940.0,MXN,MX,Ecommerce,Settled,True,0,2015-09-30 06:32:40,MexicoAccount,47407.0,47407.0,47407.0
27227,43731,2015-10-19 20:40:32,,mc,520116.0,44800.0,MXN,MX,Ecommerce,Settled,True,0,2015-10-19 20:40:16,MexicoAccount,47407.0,47407.0,47407.0
27307,43927,2015-10-20 20:30:51,,mc,520116.0,111920.0,MXN,MX,Ecommerce,Settled,True,0,2015-10-20 20:30:43,MexicoAccount,164167.0,164167.0,164167.0
31536,50220,2015-07-02 04:35:15,AU,mccredit,535316.0,11000.0,AUD,AU,Ecommerce,Settled,,0,2015-07-01 03:45:57,APACAccount,267790.0,267790.0,267790.0


In [8]:
(df[df['cardverificationcodesupplied'].isna()].simple_journal == "Chargeback").sum()

0

In [9]:
df.dropna(axis = 0, inplace = True)
logger.log("Drop the other NaNs {}".format(crt_size - df.shape[0]))
crt_size = df.shape[0]

[2019.05.08-23:01:48] Drop the other NaNs 13526


In [10]:
logger.log("Currencies are {}".format(np.unique(df.currencycode.values)))

[2019.05.08-23:01:48] Currencies are ['AUD' 'GBP' 'MXN' 'NZD' 'SEK']


In [11]:
df['bookingdate']  = pd.to_datetime(df['bookingdate'])
df['creationdate'] = pd.to_datetime(df['creationdate'])
df = df.infer_objects()

In [12]:
df.dtypes

txid                                     int64
bookingdate                     datetime64[ns]
issuercountrycode                       object
txvariantcode                           object
bin                                    float64
amount                                 float64
currencycode                            object
shoppercountrycode                      object
shopperinteraction                      object
simple_journal                          object
cardverificationcodesupplied              bool
cvcresponsecode                          int64
creationdate                    datetime64[ns]
accountcode                             object
mail_id                                float64
ip_id                                  float64
card_id                                float64
dtype: object

In [13]:
df.creationdate.head()

0   2015-07-01 23:03:11
1   2015-07-02 04:50:55
2   2015-07-02 14:30:28
3   2015-07-03 07:53:37
4   2015-07-08 18:35:35
Name: creationdate, dtype: datetime64[ns]

In [14]:
df.sort_values('creationdate', inplace = True)

In [15]:
df.head()

Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id
58673,201445,2015-07-01 22:57:05,GB,visadebit,465944.0,5095.0,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:00:41,UKAccount,258032.0,258032.0,258032.0
58674,201446,2015-07-01 22:49:20,GB,visadebit,492181.0,4895.0,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:00:42,UKAccount,75334.0,75334.0,75334.0
58675,201447,2015-07-01 22:57:06,GB,visadebit,446238.0,4695.0,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:00:48,UKAccount,295389.0,295389.0,295389.0
58676,201449,2015-07-01 22:57:05,GB,visadebit,475130.0,5595.0,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:00:58,UKAccount,19248.0,19248.0,19248.0
58677,201450,2015-07-01 22:57:06,GB,visadebit,465902.0,6995.0,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:01:11,UKAccount,269209.0,269209.0,269209.0


In [16]:
df.tail()

Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id
58636,96128,2015-11-02 05:07:44,AU,visabusiness,455704.0,5050.0,AUD,AU,Ecommerce,Settled,True,0,2015-10-31 23:57:46,APACAccount,93608.0,93608.0,93608.0
239726,412632,2015-11-03 17:47:38,GB,visadebit,492181.0,8395.0,GBP,GB,Ecommerce,Settled,True,1,2015-10-31 23:57:52,UKAccount,155984.0,155984.0,155984.0
239727,412633,2015-11-03 17:17:57,GB,visadebit,465859.0,4395.0,GBP,GB,Ecommerce,Settled,True,1,2015-10-31 23:58:49,UKAccount,228628.0,228628.0,228628.0
239728,412634,2015-11-03 17:17:57,GB,visadebit,476367.0,5895.0,GBP,GB,Ecommerce,Settled,True,1,2015-10-31 23:58:50,UKAccount,147469.0,147469.0,147469.0
239730,412636,2015-11-04 01:06:37,GB,visadebit,465943.0,4220.0,GBP,GB,Ecommerce,Settled,True,1,2015-10-31 23:59:12,UKAccount,10313.0,10313.0,10313.0


In [17]:
df.reset_index(drop = True, inplace = True)

In [18]:
conversion = {'AUD': 0.699165, 'GBP': 1.31061, 'MXN': 0.222776586, 'NZD': 0.66152, 'SEK': 0.104405}

df['amount'] = df.apply(lambda e: e['amount'] * conversion[e['currencycode']], axis=1)

In [19]:
df.head()

Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id
0,201445,2015-07-01 22:57:05,GB,visadebit,465944.0,6677.55795,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:00:41,UKAccount,258032.0,258032.0,258032.0
1,201446,2015-07-01 22:49:20,GB,visadebit,492181.0,6415.43595,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:00:42,UKAccount,75334.0,75334.0,75334.0
2,201447,2015-07-01 22:57:06,GB,visadebit,446238.0,6153.31395,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:00:48,UKAccount,295389.0,295389.0,295389.0
3,201449,2015-07-01 22:57:05,GB,visadebit,475130.0,7332.86295,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:00:58,UKAccount,19248.0,19248.0,19248.0
4,201450,2015-07-01 22:57:06,GB,visadebit,465902.0,9167.71695,GBP,GB,Ecommerce,Settled,True,1,2015-07-01 00:01:11,UKAccount,269209.0,269209.0,269209.0


In [20]:
df.tail()

Unnamed: 0,txid,bookingdate,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id
223167,96128,2015-11-02 05:07:44,AU,visabusiness,455704.0,3530.78325,AUD,AU,Ecommerce,Settled,True,0,2015-10-31 23:57:46,APACAccount,93608.0,93608.0,93608.0
223168,412632,2015-11-03 17:47:38,GB,visadebit,492181.0,11002.57095,GBP,GB,Ecommerce,Settled,True,1,2015-10-31 23:57:52,UKAccount,155984.0,155984.0,155984.0
223169,412633,2015-11-03 17:17:57,GB,visadebit,465859.0,5760.13095,GBP,GB,Ecommerce,Settled,True,1,2015-10-31 23:58:49,UKAccount,228628.0,228628.0,228628.0
223170,412634,2015-11-03 17:17:57,GB,visadebit,476367.0,7726.04595,GBP,GB,Ecommerce,Settled,True,1,2015-10-31 23:58:50,UKAccount,147469.0,147469.0,147469.0
223171,412636,2015-11-04 01:06:37,GB,visadebit,465943.0,5530.7742,GBP,GB,Ecommerce,Settled,True,1,2015-10-31 23:59:12,UKAccount,10313.0,10313.0,10313.0


In [21]:
df.drop(["txid", "bookingdate"], inplace = True, axis = 1)
df.replace({'simple_journal': {"Settled": 0, "Chargeback": 1}}, inplace = True)

columns = ["creationdate", "card_id", "mail_id", "ip_id", "issuercountrycode", "txvariantcode", 
           "bin", "shoppercountrycode", "shopperinteraction", "cardverificationcodesupplied", 
           "cvcresponsecode", "accountcode", "amount", "currencycode", "simple_journal"]

df = df[columns]
df.rename(columns = {'simple_journal': 'label'}, inplace = True)

In [22]:
df.mail_id = pd.Categorical(df.mail_id).codes
df.ip_id = pd.Categorical(df.ip_id).codes
df.card_id = pd.Categorical(df.card_id).codes

In [23]:
df.head()

Unnamed: 0,creationdate,card_id,mail_id,ip_id,issuercountrycode,txvariantcode,bin,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,accountcode,amount,currencycode,label
0,2015-07-01 00:00:41,147166,147166,147166,GB,visadebit,465944.0,GB,Ecommerce,True,1,UKAccount,6677.55795,GBP,0
1,2015-07-01 00:00:42,43166,43166,43166,GB,visadebit,492181.0,GB,Ecommerce,True,1,UKAccount,6415.43595,GBP,0
2,2015-07-01 00:00:48,168293,168293,168293,GB,visadebit,446238.0,GB,Ecommerce,True,1,UKAccount,6153.31395,GBP,0
3,2015-07-01 00:00:58,11201,11201,11201,GB,visadebit,475130.0,GB,Ecommerce,True,1,UKAccount,7332.86295,GBP,0
4,2015-07-01 00:01:11,153398,153398,153398,GB,visadebit,465902.0,GB,Ecommerce,True,1,UKAccount,9167.71695,GBP,0


In [24]:
df.columns

Index(['creationdate', 'card_id', 'mail_id', 'ip_id', 'issuercountrycode',
       'txvariantcode', 'bin', 'shoppercountrycode', 'shopperinteraction',
       'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode',
       'amount', 'currencycode', 'label'],
      dtype='object')

In [25]:
numeric_df = df[['amount', 'label', 'cardverificationcodesupplied']]

In [26]:
# from plot_utils import create_heat_map

In [27]:
#corr = numeric_df.corr()
#corr
# create_heat_map('', numeric_df, '', logger)

In [28]:
#from collections import Counter

#occ = Counter(df.card_id.values)


In [29]:
from card_tracker import extract_feature

a = df[0:500]
_hash = {}

features = a.apply(lambda e: extract_feature(e, _hash), axis=1)


In [30]:
from collections import Counter

occ = Counter(df.card_id.values)

occ.most_common()

[(122956, 105),
 (101097, 100),
 (152098, 73),
 (100934, 71),
 (135979, 70),
 (42971, 52),
 (185732, 48),
 (133564, 39),
 (134952, 36),
 (110468, 34),
 (144251, 30),
 (70810, 28),
 (69735, 25),
 (176711, 24),
 (75468, 22),
 (183604, 22),
 (59172, 22),
 (75051, 21),
 (186735, 20),
 (22041, 20),
 (97314, 19),
 (84155, 18),
 (191370, 18),
 (129428, 18),
 (112059, 17),
 (88183, 17),
 (97304, 17),
 (162960, 16),
 (119516, 16),
 (150691, 16),
 (39170, 16),
 (140664, 15),
 (168043, 15),
 (13532, 15),
 (185075, 15),
 (117360, 14),
 (38113, 14),
 (110280, 14),
 (97244, 14),
 (196064, 14),
 (97303, 14),
 (104298, 13),
 (167767, 13),
 (73788, 13),
 (191088, 13),
 (150961, 13),
 (195667, 13),
 (1846, 13),
 (78447, 13),
 (171483, 13),
 (116308, 13),
 (82807, 12),
 (128378, 12),
 (129037, 12),
 (25439, 12),
 (125383, 12),
 (112391, 11),
 (35940, 11),
 (46040, 11),
 (44517, 11),
 (105717, 11),
 (161616, 11),
 (72152, 11),
 (134627, 11),
 (2061, 11),
 (72301, 11),
 (118686, 11),
 (22763, 11),
 (97306,

In [31]:
v = df.loc[df['card_id'] == 122956]

In [32]:
v

Unnamed: 0,creationdate,card_id,mail_id,ip_id,issuercountrycode,txvariantcode,bin,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,accountcode,amount,currencycode,label
143619,2015-09-16 11:11:31,122956,122956,122956,GB,mccredit,556951.0,GB,Ecommerce,True,1,UKAccount,9036.65595,GBP,0
151719,2015-09-21 11:37:46,122956,122956,122956,GB,mccredit,556951.0,GB,Ecommerce,True,1,UKAccount,9036.65595,GBP,0
151729,2015-09-21 11:44:50,122956,122956,122956,GB,mccredit,556951.0,GB,ContAuth,False,0,UKAccount,9036.65595,GBP,0
151739,2015-09-21 11:50:49,122956,122956,122956,GB,mccredit,556951.0,GB,ContAuth,False,0,UKAccount,9036.65595,GBP,0
151759,2015-09-21 12:02:37,122956,122956,122956,GB,mccredit,556951.0,GB,ContAuth,False,0,UKAccount,9036.65595,GBP,0
151776,2015-09-21 12:15:13,122956,122956,122956,GB,mccredit,556951.0,GB,ContAuth,False,0,UKAccount,9036.65595,GBP,0
151788,2015-09-21 12:21:13,122956,122956,122956,GB,mccredit,556951.0,GB,ContAuth,False,0,UKAccount,9036.65595,GBP,0
151802,2015-09-21 12:26:28,122956,122956,122956,GB,mccredit,556951.0,GB,ContAuth,False,0,UKAccount,9036.65595,GBP,0
151834,2015-09-21 12:44:25,122956,122956,122956,GB,mccredit,556951.0,GB,ContAuth,False,0,UKAccount,9036.65595,GBP,0
151848,2015-09-21 12:52:23,122956,122956,122956,GB,mccredit,556951.0,GB,ContAuth,False,0,UKAccount,9036.65595,GBP,0


In [33]:
_hash = {}

features = v.apply(lambda e: extract_feature(e, _hash), axis=1)

In [34]:
a = features.to_list()
len(a[-1])

9

In [35]:
# f = pd.DataFrame(features)

pd.DataFrame(features.to_list(), columns =[
        "Txn amount over month",
        "Average over 3 months",
        "Average daily over month",
        "Amount same day",
        "Number same day",
        "Amount currency type over month",
        "Number currency type over month",
        "Amount country type over month",
        "Number country type over month",
    ])
#vezi ca acu :) thx

Unnamed: 0,Txn amount over month,Average over 3 months,Average daily over month,Amount same day,Number same day,Amount currency type over month,Number currency type over month,Amount country type over month,Number country type over month
0,0.00000,0.000000,0.000000,0.000000e+00,0,0.00000,0,0.00000,0
1,9036.65595,753.054663,301.221865,0.000000e+00,0,9036.65595,1,9036.65595,1
2,9036.65595,1506.109325,602.443730,9.036656e+03,1,18073.31190,2,18073.31190,2
3,9036.65595,2259.163988,903.665595,1.807331e+04,2,27109.96785,3,27109.96785,3
4,9036.65595,3012.218650,1204.887460,2.710997e+04,3,36146.62380,4,36146.62380,4
5,9036.65595,3765.273312,1506.109325,3.614662e+04,4,45183.27975,5,45183.27975,5
6,9036.65595,4518.327975,1807.331190,4.518328e+04,5,54219.93570,6,54219.93570,6
7,9036.65595,5271.382637,2108.553055,5.421994e+04,6,63256.59165,7,63256.59165,7
8,9036.65595,6024.437300,2409.774920,6.325659e+04,7,72293.24760,8,72293.24760,8
9,9036.65595,6777.491963,2710.996785,7.229325e+04,8,81329.90355,9,81329.90355,9


NameError: name 'f' is not defined