In [166]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.metrics import ndcg_score



## Load Data

In [4]:
# Folder holding the training CSV files, relative to this notebook.
data_path = '../data/Predicting Coupon Redemption/train'

# Read the customer transaction file and the item file from that folder.
txn_df, item_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('customer_transaction_data.csv', 'item_data.csv')
)

In [5]:
# Attach the item_df columns to every transaction row. The left join keeps
# transactions whose item_id has no match in item_df.
txn_df = pd.merge(txn_df, item_df, how='left', on='item_id')

## Cleaning Data

In [6]:
# Columns that together identify a transaction row; rows that agree on all of
# them are treated as duplicates.
dedup_keys = [
    'date', 'customer_id', 'item_id', 'quantity',
    'selling_price', 'other_discount', 'coupon_discount',
]

# Report the number of duplicated rows before the clean-up ...
count_dup = txn_df.duplicated(subset=dedup_keys).sum()
print('count duplicates: ', count_dup)

# ... drop them (the first occurrence of each row is kept) ...
print('drop duplicates')
txn_df.drop_duplicates(subset=dedup_keys, inplace=True)

# ... and confirm that none remain.
count_dup = txn_df.duplicated(subset=dedup_keys).sum()
print('count duplicates: ', count_dup)

count duplicates:  2916
drop duplicates
count duplicates:  0


In [324]:
# Keep the first seven characters of the date string,
# e.g. '2012-01-02' -> '2012-01', as the transaction month.
txn_df['txn_month'] = txn_df['date'].str.slice(stop=7)
txn_df.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,brand,brand_type,category,txn_month
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,56,Local,Natural Products,2012-01
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,56,Local,Natural Products,2012-01
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0,524,Established,Grocery,2012-01
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,1134,Established,Grocery,2012-01
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,524,Established,Grocery,2012-01


In [8]:
# Sum of selling_price per (customer, month), spread into one column per
# item category.
pv_df = txn_df.pivot_table(
    index=['customer_id', 'txn_month'],
    columns=['category'],
    values=['selling_price'],
    aggfunc='sum',
)

# Flatten the two-level column labels: join the levels with '_', strip every
# non-word character, then remove the 'selling_price_' prefix so only the
# cleaned category name remains.
pv_df.columns = [
    re.sub(r'[^\w]', '', '_'.join(col)).replace('selling_price_', '')
    for col in pv_df.columns.values
]

# Turn the (customer_id, txn_month) index back into ordinary columns.
pv_df = pv_df.reset_index()

In [327]:
# Preview the first rows of the per-customer, per-month category spend table.
pv_df.head()

Unnamed: 0,customer_id,txn_month,Alcohol,Bakery,DairyJuicesSnacks,FlowersPlants,Fuel,Garden,Grocery,Meat,...,NaturalProducts,PackagedMeat,Pharmaceutical,PreparedFood,Restauarant,Salads,Seafood,SkinHairCare,Travel,Vegetablescut
0,1,2012-02,,284.24,,,,,1750.01,,...,,462.34,106.86,,,,,,,
1,1,2012-03,,,,,,,1144.46,,...,,89.05,765.11,124.31,,,,,,
2,1,2012-04,,284.6,,,,,3692.71,134.29,...,,174.18,769.4,102.94,,,,,,
3,1,2012-05,,248.62,,,,,5806.74,106.5,...,152.81,401.79,1634.6,,,,,,,
4,1,2012-06,,373.29,,,,,5873.72,,...,195.19,462.34,658.96,,,,,,,


## Test first cohort

In [170]:
# Distinct customer ids found in the transaction data.
cust_ids = pd.unique(txn_df['customer_id'])
# Everything after the first two columns of pv_df is a category column.
cat_cols = pv_df.iloc[:, 2:].columns

In [171]:
# Distinct transaction months in ascending order ('YYYY-MM' strings sort
# chronologically).
cohorts = np.sort(txn_df['txn_month'].unique())
cohorts

array(['2012-01', '2012-02', '2012-03', '2012-04', '2012-05', '2012-06',
       '2012-07', '2012-08', '2012-09', '2012-10', '2012-11', '2012-12',
       '2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
       '2013-07'], dtype=object)

In [172]:
## Cohort1: Train data ['2012-01', '2012-02', '2012-03'], Validation data ['2012-04']
label_month = '2012-04'

# Category spend of the label month; a category with no spend becomes 0.
label_df = (
    pv_df.loc[pv_df['txn_month'] == label_month]
    .drop(columns='txn_month')
    .fillna(0)
)
label_df.head()


Unnamed: 0,customer_id,Alcohol,Bakery,DairyJuicesSnacks,FlowersPlants,Fuel,Garden,Grocery,Meat,Miscellaneous,NaturalProducts,PackagedMeat,Pharmaceutical,PreparedFood,Restauarant,Salads,Seafood,SkinHairCare,Travel,Vegetablescut
2,1,0.0,284.6,0.0,0.0,0.0,0.0,3692.71,134.29,0.0,0.0,174.18,769.4,102.94,0.0,0.0,0.0,0.0,0.0,0.0
17,2,0.0,0.0,0.0,0.0,0.0,0.0,882.3,397.16,0.0,156.37,206.24,101.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,3,0.0,0.0,0.0,0.0,0.0,0.0,625.84,0.0,0.0,35.26,522.19,0.0,0.0,0.0,0.0,213.36,0.0,0.0,0.0
46,4,0.0,0.0,0.0,0.0,0.0,0.0,3510.71,0.0,0.0,60.2,59.49,142.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,5,0.0,0.0,0.0,0.0,0.0,0.0,998.05,0.0,0.0,106.86,117.19,347.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
train_months = ['2012-01', '2012-02', '2012-03']
history_df = pv_df[pv_df.txn_month.isin(train_months)]

# Average each category over the training months of a customer. Only the
# category columns are aggregated: txn_month is a string column, and taking its
# mean raises a TypeError on pandas >= 2.0 (older versions dropped it silently).
# Note that mean() skips NaN, so a category is averaged over the months in
# which the customer actually bought it.
history_df = (
    history_df
    .groupby('customer_id')[list(cat_cols)]
    .mean()
    .reset_index()
)

# Categories never bought during the training months get a spend of 0.
history_df[cat_cols] = history_df[cat_cols].fillna(0)
history_df.head()

Unnamed: 0,customer_id,Alcohol,Bakery,DairyJuicesSnacks,FlowersPlants,Fuel,Garden,Grocery,Meat,Miscellaneous,NaturalProducts,PackagedMeat,Pharmaceutical,PreparedFood,Restauarant,Salads,Seafood,SkinHairCare,Travel,Vegetablescut
0,1,0.0,284.24,0.0,0.0,0.0,0.0,1447.235,0.0,0.0,0.0,275.695,435.985,124.31,0.0,0.0,0.0,0.0,0.0,0.0
1,6,0.0,0.0,0.0,0.0,0.0,0.0,2264.36,248.98,0.0,102.94,115.405,191.105,355.48,0.0,0.0,0.0,0.0,0.0,0.0
2,7,0.0,0.0,0.0,0.0,1601.48,0.0,4085.94,425.3,0.0,208.38,1151.24,2026.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,0.0,0.0,0.0,0.0,0.0,0.0,1772.93,0.0,0.0,0.0,721.65,166.226667,0.0,0.0,0.0,114.34,0.0,0.0,0.0
4,11,0.0,0.0,0.0,0.0,943.93,0.0,2826.693333,694.59,0.0,167.17,106.5,4113.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [174]:
# Mean of each category column over all rows of history_df (a Series indexed
# by category name).
all_df = history_df.loc[:, cat_cols].mean()

In [131]:
# rank_df = history_df[cat_cols].rank(axis=1, method='dense', ascending=False)
# predict_df = pd.concat([history_df[['customer_id']], rank_df], axis=1)
# predict_df.head()

### Evaluate Ranking

In [182]:
# fill in missing customer id
predict_ids = list(history_df['customer_id'].unique())
label_ids = list(label_df['customer_id'].unique())

print('history shape', len(predict_ids))
print('label shape', len(label_ids))

# A set makes each membership test O(1); scanning the list for every label id
# was quadratic in the number of customers.
predict_id_set = set(predict_ids)

# Customers present in the label month but absent from the history window.
missing_ids = [cust_id for cust_id in label_ids if cust_id not in predict_id_set]
print(f'total missing ids from history data are {len(missing_ids)}')

# Customers present in both the history window and the label month.
intersect_ids = predict_id_set.intersection(label_ids)
print(f'total intersection ids {len(intersect_ids)}')


history shape 1110
label shape 1463
total missing ids from history data are 459
total intersection ids 1004


In [176]:
# Ids found in both history and label data, in ascending order.
valid_ids = sorted(intersect_ids)

In [201]:
# customer with no history use average data of all customers
# Skip ids that history_df already contains, so re-running this cell does not
# add the same default rows a second time.
known_ids = set(history_df['customer_id'])
new_ids = [cust_id for cust_id in missing_ids if cust_id not in known_ids]

# One copy of the all-customer average per new id.
data = [all_df[cat_cols].tolist()] * len(new_ids)

# Build the frame through the constructor: assigning an empty list to several
# columns at once fails when there is nothing to add.
tmp_df = pd.DataFrame(data, columns=cat_cols)
tmp_df.insert(0, 'customer_id', new_ids)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
if not tmp_df.empty:
    history_df = pd.concat([history_df, tmp_df])

In [202]:
# (rows, columns) of history_df.
history_df.shape

(1569, 20)

In [208]:
# Evaluate on every customer of the label month. valid_ids is the same list
# object as label_ids, so the sort below reorders both names.
valid_ids = label_ids
valid_ids.sort()

# Keep only the evaluated customers and order both frames by customer_id so
# that row i of the prediction matches row i of the label.
predict_mask = history_df['customer_id'].isin(valid_ids)
predict_df = history_df[predict_mask].sort_values(by=['customer_id'])

label_mask = label_df['customer_id'].isin(valid_ids)
label_df = label_df[label_mask].sort_values(by=['customer_id'])

# Sanity check: both frames list exactly the same customers in the same order.
predicted_order = predict_df['customer_id'].tolist()
label_order = label_df['customer_id'].tolist()
predicted_order == label_order

True

In [209]:
# Show the (rows, columns) of the label frame.
print('label shape', label_df.shape)

label shape (1463, 20)


In [211]:
# NDCG over the top 5 categories per customer: the label-month spend is the
# true relevance and the historical average spend is the predicted score.
ndcg_score(
    y_true=label_df[cat_cols],
    y_score=predict_df[cat_cols],
    k=5,
)

0.8865109586911615

## Run for all cohorts

In [266]:
def _build_cohort(pv_df, cat_cols, train_months, label_month):
    """Build row-aligned prediction and label frames for one cohort.

    A customer's prediction is their mean spend per category over
    ``train_months`` (``mean`` skips months without a purchase in that
    category). Customers that appear in ``label_month`` but have no row in the
    training window receive the average prediction of all customers that do.

    Parameters
    ----------
    pv_df : pandas.DataFrame
        Frame with 'customer_id', 'txn_month' and one column per category.
    cat_cols : list-like of str
        Names of the category columns of ``pv_df``.
    train_months : list-like of str
        Values of 'txn_month' that form the history window.
    label_month : str
        Value of 'txn_month' used as the ground truth.

    Returns
    -------
    (predict_df, label_df) : tuple of pandas.DataFrame
        Both sorted by 'customer_id', with a 'txn_month' column set to
        ``label_month``.
    """
    cat_cols = list(cat_cols)

    # prep label: spend per category in the label month, no spend -> 0
    label_df = (
        pv_df[pv_df.txn_month == label_month]
        .drop(columns='txn_month')
        .fillna(0)
        .sort_values(by=['customer_id'])
    )

    # prep history data: aggregate the category columns only, because the
    # string txn_month column makes mean() raise on pandas >= 2.0
    history_df = (
        pv_df[pv_df.txn_month.isin(train_months)]
        .groupby('customer_id')[cat_cols]
        .mean()
        .reset_index()
    )
    history_df[cat_cols] = history_df[cat_cols].fillna(0)

    # prepare default data for customer with no transaction
    all_df = history_df[cat_cols].mean()
    has_history = label_df['customer_id'].isin(history_df['customer_id'])
    missing_ids = label_df.loc[~has_history, 'customer_id'].unique()
    if len(missing_ids):
        default_df = pd.DataFrame([all_df.tolist()] * len(missing_ids), columns=cat_cols)
        default_df.insert(0, 'customer_id', missing_ids)
        # DataFrame.append was removed in pandas 2.0
        history_df = pd.concat([history_df, default_df])

    # keep the customers of the label month only, in the same order as label_df
    predict_df = (
        history_df[history_df.customer_id.isin(label_df['customer_id'])]
        .sort_values(by=['customer_id'])
        .copy()
    )

    predict_df['txn_month'] = label_month
    label_df['txn_month'] = label_month
    if predict_df['customer_id'].tolist() != label_df['customer_id'].tolist():
        print('error different ids')
    return predict_df, label_df


# Length of the rolling history window, in months.
n_train_months = 3

# Collect the per-cohort frames and concatenate once at the end; growing a
# DataFrame inside the loop copies all previous rows on every iteration.
predict_frames = []
label_frames = []
for i in range(n_train_months, len(cohorts)):
    train_months = cohorts[i - n_train_months:i]
    label_month = cohorts[i]
    print(f'train data {train_months}, label data {label_month}')

    predict_df, label_df = _build_cohort(pv_df, cat_cols, train_months, label_month)
    predict_frames.append(predict_df)
    label_frames.append(label_df)

# ignore_index=True gives the same 0..n-1 index as reset_index(drop=True).
# With too few months for a single cohort, fall back to empty frames.
all_predict_df = pd.concat(predict_frames, ignore_index=True) if predict_frames else pd.DataFrame()
all_label_df = pd.concat(label_frames, ignore_index=True) if label_frames else pd.DataFrame()

train data ['2012-01' '2012-02' '2012-03'], label data 2012-04
train data ['2012-02' '2012-03' '2012-04'], label data 2012-05
train data ['2012-03' '2012-04' '2012-05'], label data 2012-06
train data ['2012-04' '2012-05' '2012-06'], label data 2012-07
train data ['2012-05' '2012-06' '2012-07'], label data 2012-08
train data ['2012-06' '2012-07' '2012-08'], label data 2012-09
train data ['2012-07' '2012-08' '2012-09'], label data 2012-10
train data ['2012-08' '2012-09' '2012-10'], label data 2012-11
train data ['2012-09' '2012-10' '2012-11'], label data 2012-12
train data ['2012-10' '2012-11' '2012-12'], label data 2013-01
train data ['2012-11' '2012-12' '2013-01'], label data 2013-02
train data ['2012-12' '2013-01' '2013-02'], label data 2013-03
train data ['2013-01' '2013-02' '2013-03'], label data 2013-04
train data ['2013-02' '2013-03' '2013-04'], label data 2013-05
train data ['2013-03' '2013-04' '2013-05'], label data 2013-06
train data ['2013-04' '2013-05' '2013-06'], label data 

In [270]:
# Mark zero spend as missing so that those categories receive no rank when the
# rows are ranked with na_option='keep'. Only the category columns are
# touched; customer_id and txn_month keep their values and dtypes.
all_label_df[cat_cols] = all_label_df[cat_cols].replace(0, np.nan)
all_predict_df[cat_cols] = all_predict_df[cat_cols].replace(0, np.nan)

In [273]:
# Display the label frame accumulated over all cohorts.
all_label_df

Unnamed: 0,customer_id,Alcohol,Bakery,DairyJuicesSnacks,FlowersPlants,Fuel,Garden,Grocery,Meat,Miscellaneous,...,PackagedMeat,Pharmaceutical,PreparedFood,Restauarant,Salads,Seafood,SkinHairCare,Travel,Vegetablescut,txn_month
0,1,,284.6,,,,,3692.71,134.29,,...,174.18,769.40,102.94,,,,,,,2012-04
1,2,,,,,,,882.30,397.16,,...,206.24,101.87,,,,,,,,2012-04
2,3,,,,,,,625.84,,,...,522.19,,,,,213.36,,,,2012-04
3,4,,,,,,,3510.71,,,...,59.49,142.12,,,,,,,,2012-04
4,5,,,,,,,998.05,,,...,117.19,347.30,,,,,,,,2012-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22739,1558,,,,,,,676.78,,,...,71.24,,177.74,,,,,,,2013-07
22740,1560,,,,,,,32.06,,,...,213.36,61.98,,,,,,,,2013-07
22741,1567,,,,,712.4,,,,,...,,,,,,,,,,2013-07
22742,1579,,,,,,,2610.94,,,...,426.37,126.09,,,,,,,,2013-07


In [310]:
# Number of category columns.
len(cat_cols)

19

In [312]:
# Rank the categories within each row: the smallest spend gets rank 1 and the
# largest spend the highest rank. NaN entries stay NaN.
rank_options = dict(axis=1, ascending=True, na_option='keep')
all_label_rank = all_label_df.loc[:, cat_cols].rank(**rank_options)
all_predict_rank = all_predict_df.loc[:, cat_cols].rank(**rank_options)


In [314]:
# Categories without a rank (NaN) get 0, i.e. below every ranked category.
all_label_rank = all_label_rank.fillna(0)
all_predict_rank = all_predict_rank.fillna(0)

In [315]:
# The two labels were swapped: 'total prediction:' printed the label shape and
# 'total label:' printed the prediction shape.
print('total prediction:', all_predict_rank.shape)
print('total label:', all_label_rank.shape)

# NDCG@5 over all cohorts, with the label ranks as true relevance and the
# predicted ranks as scores.
print('ndcg_score: ', ndcg_score(all_label_rank[cat_cols], all_predict_rank[cat_cols], k=5))

total prediction: (22744, 19)
total label: (22744, 19)
ndcg_score:  0.8441367929326399


In [322]:
# For every row, the names of the five categories with the largest rank value
# (ties are resolved by nlargest, which keeps the first occurrence).
label_top5_cats = all_label_rank[cat_cols].apply(
    lambda row: row.abs().nlargest(5).index.tolist(),
    axis=1,
)
pred_top5_cats = all_predict_rank[cat_cols].apply(
    lambda row: row.abs().nlargest(5).index.tolist(),
    axis=1,
)