In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw churn transactions (one row per transaction).
data = pd.read_csv(filepath_or_buffer="../data/data_raw/churn/train.csv")

In [3]:
# Peek at the first five raw rows.
data.head(n=5)

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.0,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.0,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.0,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.0,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.0,C2C_OUT,0,0.0


In [4]:
# Per-client view of the `currency` column.
d = data.groupby(by=["cl_id"])["currency"]

In [5]:
# True for clients whose transactions use more than one distinct currency.
users = d.nunique().gt(1)

In [6]:
# Name the flag so that merging it onto the transactions yields a
# `has_unq_curr` column. Rebinding (instead of `inplace=True`) keeps the cell
# re-runnable and does not rely on the return value of an in-place call,
# which pandas documents as None.
users = users.rename("has_unq_curr")
users

cl_id
0        False
1         True
5         True
9        False
10       False
         ...  
10210    False
10212    False
10213    False
10214    False
10215    False
Name: has_unq_curr, Length: 5000, dtype: bool

In [7]:
# Attach the per-client `has_unq_curr` flag to every transaction row.
# Any earlier copy of the column is dropped first so the cell is idempotent:
# without that, re-running it would merge onto a frame that already carries
# the column and pandas would emit suffixed `has_unq_curr_x` /
# `has_unq_curr_y` columns instead, breaking later `data.has_unq_curr` lookups.
data = (
    data
    .drop(columns="has_unq_curr", errors="ignore")
    .merge(users, how="inner", on="cl_id")
)

In [8]:
# Display the merged frame to confirm the new `has_unq_curr` column is present
# (the rendered output elides the middle rows).
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum,has_unq_curr
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.0,POS,0,0.0,False
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.0,DEPOSIT,0,0.0,False
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.0,POS,0,0.0,False
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.0,POS,0,0.0,False
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.0,C2C_OUT,0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...
490508,01/12/2016,10215,4112,type1,810,17DEC16:00:00:00,2110.9,POS,0,0.0,False
490509,01/12/2016,10215,5411,type1,810,16DEC16:00:00:00,31.0,POS,0,0.0,False
490510,01/12/2016,10215,5411,type1,810,06DEC16:00:00:00,182.0,POS,0,0.0,False
490511,01/12/2016,10215,6011,type1,810,06DEC16:13:39:49,5000.0,DEPOSIT,0,0.0,False


In [9]:
# Parse timestamps such as "21OCT17:00:00:00" and keep only the calendar date;
# `.dt.date` stores plain Python `date` objects in the column.
parsed_trdatetime = pd.to_datetime(data["TRDATETIME"], format="%d%b%y:%H:%M:%S")
data["TRDATETIME"] = parsed_trdatetime.dt.date

In [10]:
# Distinct ids of the clients flagged as using more than one currency.
relevant_users = data.loc[data["has_unq_curr"], "cl_id"].unique()

In [11]:
# Load the preprocessed churn transactions.
churn = pd.read_parquet(path="../data/preprocessed_new/churn.parquet")

In [12]:
# Add the raw `currency` column to the preprocessed frame. The column-wise
# inner concat pairs rows by index label.
# NOTE(review): this presumes `churn` kept the same row index as the raw
# `data` frame -- confirm against the preprocessing step.
currency_column = data[["currency"]]
churn_final = pd.concat(objs=[churn, currency_column], axis=1, join="inner")

In [13]:
# Check that `currency` now sits next to the preprocessed columns.
churn_final.head(n=5)

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target,currency
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0,810
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0,810
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0,810
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0,810
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0,810


In [14]:
# A user is kept when their longest streak is strictly longer than this.
RUN_LENGTH_THRESHOLD = 20


def longest_run(values, target):
    """Return the length of the longest run of consecutive `target` items.

    Args:
        values: iterable of comparable items, in the order to scan.
        target: the value whose consecutive occurrences are counted.

    Returns:
        The maximum number of back-to-back occurrences of `target`
        (0 when `values` is empty or never contains `target`).
    """
    best = current = 0
    for value in values:
        current = current + 1 if value == target else 0
        # Updating on every step (not only when a streak breaks) makes sure a
        # streak that ends on the very last item is counted as well.
        best = max(best, current)
    return best


most_relevant_users = []

for uid in relevant_users:
    # Chronological transactions of this user; the stable sort keeps the
    # original relative order of rows that share a timestamp.
    transactions = churn_final[churn_final.user_id == uid].sort_values(
        by="timestamp", kind="stable"
    )
    currencies = transactions["currency"]

    currency_counts = currencies.value_counts()
    if currency_counts.empty:
        # No usable rows for this user in `churn_final`; `idxmax` would raise.
        continue
    most_popular = currency_counts.idxmax()

    other_counts = currencies[currencies != most_popular].value_counts()
    if other_counts.empty:
        # Only one currency present for this user, so there is no runner-up.
        continue
    second_most = other_counts.idxmax()

    max_cnt = longest_run(currencies.to_numpy(), second_most)

    if max_cnt > RUN_LENGTH_THRESHOLD:
        print(max_cnt, second_most, uid)
        most_relevant_users.append(uid)

24 978 335
21 978 395
21 784 440
27 810 461
34 978 1042
21 978 1263
24 978 1312
23 978 1317
22 978 1416
31 978 1510
34 978 1581
28 978 1683
21 840 1753
41 978 1831
26 352 2373
25 978 2422
26 978 2504
26 978 2710
69 810 3198
29 978 3435
24 978 3562
21 978 3583
24 978 3829
21 975 4113
38 978 4195
27 360 4358
21 348 4480
26 949 4500
35 933 4789
22 392 5043
21 978 5409
40 810 5429
24 978 5476
41 978 5554
24 975 5807
29 978 5812
23 578 5915
31 810 6364
21 978 7580
24 978 8102
31 978 8691
29 484 8787
61 840 8976
37 392 9483
43 978 9493
28 392 9866
31 978 9992
25 978 10163


In [15]:
# Keep only the transactions of the users selected by the streak filter.
is_selected_user = churn_final["user_id"].isin(most_relevant_users)
churn_ = churn_final[is_selected_user]

In [21]:
# Persist the filtered transactions together with their currency column.
churn_.to_parquet(path="../data/preprocessed_new/churn_with_currency.parquet")

In [20]:
# Number of distinct users that survived the filter.
churn_["user_id"].nunique()

48

In [19]:
# Spot-check one user: longest streak of their second most frequent currency,
# scanning transactions in chronological order.
currencies = churn_[churn_.user_id == 4358].sort_values(by="timestamp", kind="stable")["currency"]

most_popular = currencies.value_counts().idxmax()
second_most = currencies[currencies != most_popular].value_counts().idxmax()

currencies = np.array(currencies)

cnt, max_cnt = 0, 0
for value in currencies:
    # Extend the current streak on a match, otherwise start over.
    cnt = cnt + 1 if value == second_most else 0
    # Update on every step so that a streak ending on the last transaction is
    # counted too (updating only when a streak breaks would miss it).
    max_cnt = max(cnt, max_cnt)
