In [1]:
import pandas as pd
import numpy as np
import csv
from collections import defaultdict
from sklearn import preprocessing

In [2]:
# Load the historical transactions CSV into a DataFrame; the path is relative,
# so it is resolved against the notebook's current working directory.
df = pd.read_csv('data/historical_transactions.csv')

In [3]:
# Preview the first rows of the raw transactions frame to inspect its columns.
df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [28]:
# Keep only the columns needed to build per-card merchant sequences and
# order the rows by purchase timestamp (ascending is the sort_values default).
sequence_columns = ['card_id', 'merchant_id', 'purchase_date']
clean = df[sequence_columns].sort_values('purchase_date')

In [29]:
# Preview the first rows of `clean` after sorting by purchase_date.
clean.head()

Unnamed: 0,card_id,merchant_id,purchase_date
7289521,C_ID_da2090f28e,M_ID_f001319a61,2017-01-01 00:00:08
18512762,C_ID_efced389a0,M_ID_18038b5ae7,2017-01-01 00:00:59
14942234,C_ID_83561fe74a,M_ID_52d3026407,2017-01-01 00:01:41
28659693,C_ID_479fd6392a,M_ID_e5374dabc0,2017-01-01 00:02:03
20004812,C_ID_1cf6056088,M_ID_2cf6dc1f6f,2017-01-01 00:02:12


In [30]:
# Count the non-null purchase_date entries for each card, then show the
# cards with the most transactions first.
purchases_per_card = clean.groupby('card_id')['purchase_date'].count()
purchases_per_card.sort_values(ascending=False)

card_id
C_ID_3d3dfdc692    5582
C_ID_0cd2ce025c    2912
C_ID_cc3d4cd4e3    2143
C_ID_5ccc07beb9    2066
C_ID_5ea401d358    1786
                   ... 
C_ID_22df161ffe       2
C_ID_edc21e1571       2
C_ID_1e3c6b4f44       2
C_ID_112832329d       2
C_ID_cb5d101ac5       2
Name: purchase_date, Length: 325540, dtype: int64

In [31]:
# Summary statistics (mean, std, quartiles, min/max) of the per-card
# transaction counts.
per_card_counts = clean.groupby('card_id')['purchase_date'].count()
per_card_counts.describe()

count    325540.000000
mean         89.427907
std         105.279414
min           2.000000
25%          26.000000
50%          55.000000
75%         111.000000
max        5582.000000
Name: purchase_date, dtype: float64

## Notes
The mean number of purchases per card is about 89, with a standard deviation of about 105. The minimum is 2 and the maximum is 5582. Given that CoSeRec has a maximum sequence length cutoff of 50, we may have to cut down the sequence size for each user.

The next steps are to order the transactions of each card by date and then build the dataset: the first column is the card_id (which simply serves as the row id), followed by the sequence of merchant_ids for that card.

In [32]:
# Convert the merchant and card ids to integer codes.
# Each column gets its own encoder so that both id <-> integer mappings stay
# available afterwards (through `classes_` / `inverse_transform`). Re-fitting a
# single shared encoder on card_id would overwrite the merchant mapping.
merchant_encoder = preprocessing.LabelEncoder()
card_encoder = preprocessing.LabelEncoder()
clean['merchant_id'] = merchant_encoder.fit_transform(clean['merchant_id'])
clean['card_id'] = card_encoder.fit_transform(clean['card_id'])

# Backward-compatible alias: previously `label_encoder` ended up fitted on
# card_id, so keep that name pointing at the card encoder.
label_encoder = card_encoder

# NOTE: this cell overwrites columns of `clean` in place. Running it a second
# time re-fits both encoders on the integer codes, which loses the original
# id strings from `classes_`; re-create `clean` first if a re-run is needed.

In [33]:
# Preview `clean` again to check that card_id and merchant_id now hold integer codes.
clean.head()

Unnamed: 0,card_id,merchant_id,purchase_date
7289521,277240,305960,2017-01-01 00:00:08
18512762,304855,30671,2017-01-01 00:00:59
14942234,167005,105431,2017-01-01 00:01:41
28659693,91256,292163,2017-01-01 00:02:03
20004812,36808,57235,2017-01-01 00:02:12


In [39]:
# Collect the merchant ids of every card into one Python list per card;
# the result is a Series indexed by card_id.
merchants_by_card = clean.groupby('card_id')['merchant_id']
groups = merchants_by_card.apply(list)

In [40]:
# Display the per-card merchant lists built in the previous cell.
groups # each list keeps the row order of clean, which was sorted by purchase_date before grouping

card_id
0         [133994, 207972, 188773, 207972, 188773, 18877...
1         [266979, 266979, 266979, 222529, 151415, 23137...
2         [116232, 188224, 213956, 87420, 204645, 189334...
3         [156200, 307907, 284001, 306432, 211913, 52241...
4         [153460, 153460, 147101, 310040, 310040, 31004...
                                ...                        
325535    [100952, 258509, 258509, 258509, 835, 148296, ...
325536    [306830, 192027, 153456, 302096, 106343, 86454...
325537    [80210, 3049, 3049, 3049, 3049, 3049, 164264, ...
325538    [290998, 81700, 232707, 185965, 26664, 39748, ...
325539    [152456, 71504, 109177, 86378, 152456, 87025, ...
Name: merchant_id, Length: 325540, dtype: object

In [41]:
# Write one CSV row per card: the card id followed by at most `max_seq_len`
# merchant ids taken from the front of that card's list in `groups`.
max_seq_len = 50  # maximum number of merchant ids written per card

# `newline=''` is what the csv module requires for files passed to csv.writer
# (otherwise extra carriage returns can appear on some platforms), and the
# `with` block guarantees the file is closed even if a write fails.
with open('data/card_purchases.txt', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    # Iterate over (card_id, merchant list) pairs directly so the loop does
    # not depend on the index of `groups` being exactly 0..len(groups)-1.
    for card_id, merchants in groups.items():
        # Slicing creates a new list, so the lists stored in `groups` are
        # never modified by building the output row.
        # NOTE(review): this keeps the FIRST `max_seq_len` entries; if the
        # lists are in chronological order these are the oldest purchases --
        # confirm that this, rather than the most recent ones, is intended.
        writer.writerow([card_id, *merchants[:max_seq_len]])