In [22]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
# Seaborn's "deep" palette, kept in a variable for use as plot colours.
colors = sns.color_palette(palette="deep")

## Load Data

In [3]:
# Directory holding the CSV files read below.
data_path = '../data/Predicting Coupon Redemption/train'


def _load_table(file_name):
    """Read the CSV called ``file_name`` inside ``data_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, file_name))


train_df = _load_table('train.csv')
demo_df = _load_table('customer_demographics.csv')
cmpn_df = _load_table('campaign_data.csv')
cp_it_df = _load_table('coupon_item_mapping.csv')
txn_df = _load_table('customer_transaction_data.csv')
item_df = _load_table('item_data.csv')

## EDA: Customer Transactions

In [43]:
# Number of distinct customers that appear in the transaction table.
n_customers = txn_df["customer_id"].nunique()
print(f'total customers are {n_customers}')

total customers are 1582


In [4]:
# Attach item attributes from item_df to every transaction row.
#
# Only item columns that txn_df does not already carry are merged, so running
# this cell a second time is a no-op instead of producing suffixed duplicates
# (e.g. brand_x / brand_y) of the columns added by the first run.
#
# validate='many_to_one' makes pandas raise a MergeError if item_id is not
# unique in item_df, which is the only way a left join could add rows.
print(f'total transaction before join: {txn_df.shape[0]}')
item_cols_to_add = [col for col in item_df.columns if col not in txn_df.columns]
txn_df = txn_df.merge(
    item_df[['item_id'] + item_cols_to_add],
    on='item_id',
    how='left',
    validate='many_to_one',
)
print(f'total transaction after join {txn_df.shape[0]}')

total transaction before join: 1324566
total transaction after join 1324566


In [5]:
# Preview the first ten transactions after the item join.
txn_df.head(n=10)

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,brand,brand_type,category
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,56,Local,Natural Products
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,56,Local,Natural Products
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0,524,Established,Grocery
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,1134,Established,Grocery
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,524,Established,Grocery
5,2012-01-02,1501,57397,1,71.24,-28.14,0.0,524,Established,Grocery
6,2012-01-02,857,12424,1,106.5,-14.25,0.0,971,Established,Grocery
7,2012-01-02,857,14930,1,110.07,0.0,0.0,3235,Established,Meat
8,2012-01-02,857,16657,1,89.05,-35.26,0.0,2011,Established,Packaged Meat
9,2012-01-02,67,10537,3,32.06,0.0,0.0,487,Established,Grocery


In [6]:
# Columns that identify one transaction line; rows that agree on all of them
# are treated as duplicates of each other.
dedup_keys = [
    'date', 'customer_id', 'item_id', 'quantity',
    'selling_price', 'other_discount', 'coupon_discount',
]

print('drop duplicates')
txn_df.drop_duplicates(subset=dedup_keys, inplace=True)

# Sanity check: no row should still be flagged as a duplicate after the drop.
count_dup = txn_df.duplicated(subset=dedup_keys).sum()
print('count duplicates: ', count_dup)

drop duplicates
count duplicates:  0


In [81]:
# Month key: the first seven characters of the date string
# (e.g. '2012-01-02' -> '2012-01').
month_key = txn_df['date'].str.slice(stop=7)
txn_df['txn_month'] = month_key

# Number of transaction rows per month.
txn_df.groupby('txn_month').size()

txn_month
2012-01    12368
2012-02    25037
2012-03    46642
2012-04    68232
2012-05    78929
2012-06    78946
2012-07    80906
2012-08    83861
2012-09    85175
2012-10    86512
2012-11    86480
2012-12    87960
2013-01    86113
2013-02    78454
2013-03    85673
2013-04    78854
2013-05    85726
2013-06    79415
2013-07     6367
dtype: int64

In [21]:
# Inspect every transaction row belonging to customer 1501.
txn_df.loc[txn_df['customer_id'].eq(1501)]

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,brand,brand_type,category,txn_month
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,56,Local,Natural Products,2012-01
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,56,Local,Natural Products,2012-01
2,2012-01-02,1501,31962,1,106.50,-14.25,0.0,524,Established,Grocery,2012-01
3,2012-01-02,1501,33647,1,67.32,0.00,0.0,1134,Established,Grocery,2012-01
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,524,Established,Grocery,2012-01
...,...,...,...,...,...,...,...,...,...,...,...
1312626,2013-06-29,1501,61261,1,76.94,0.00,0.0,1025,Established,Pharmaceutical,2013-06
1320415,2013-07-02,1501,10044,2,134.64,-128.23,0.0,56,Local,Grocery,2013-07
1320416,2013-07-02,1501,13467,1,95.82,-5.34,0.0,1075,Established,Grocery,2013-07
1320417,2013-07-02,1501,26854,1,35.62,-21.02,0.0,89,Established,Grocery,2013-07


## Sum Txn Selling Price by Category for Each Customer and Month

In [74]:
# Sum of selling_price per (customer_id, txn_month), one column per category.
# Customer/month pairs with no rows in a category come out as NaN.
pv_df = txn_df.pivot_table(
    index=['customer_id', 'txn_month'],
    columns=['category'],
    values=['selling_price'],
    aggfunc='sum',
)

# The pivot produces ('selling_price', <category>) column tuples. Flatten each
# one to the bare category name with every non-word character removed
# (e.g. 'Natural Products' -> 'NaturalProducts').
flat_names = []
for col in pv_df.columns.values:
    joined = re.sub(r'[^\w]', '', '_'.join(col))
    flat_names.append(joined.replace('selling_price_', ''))
pv_df.columns = flat_names

In [75]:
# Move customer_id and txn_month out of the index into ordinary columns.
pv_df = pv_df.reset_index()

In [76]:
# Preview the first five rows of the customer/month spend table.
pv_df.head(n=5)

Unnamed: 0,customer_id,txn_month,Alcohol,Bakery,DairyJuicesSnacks,FlowersPlants,Fuel,Garden,Grocery,Meat,...,NaturalProducts,PackagedMeat,Pharmaceutical,PreparedFood,Restauarant,Salads,Seafood,SkinHairCare,Travel,Vegetablescut
0,1,2012-02,,284.24,,,,,1750.01,,...,,462.34,106.86,,,,,,,
1,1,2012-03,,,,,,,1144.46,,...,,89.05,765.11,124.31,,,,,,
2,1,2012-04,,284.6,,,,,3692.71,134.29,...,,174.18,769.4,102.94,,,,,,
3,1,2012-05,,248.62,,,,,5806.74,106.5,...,152.81,401.79,1634.6,,,,,,,
4,1,2012-06,,373.29,,,,,5873.72,,...,195.19,462.34,658.96,,,,,,,


## Category Ranking

In [77]:
# Every column except the two identifier columns is a category to be ranked.
# Selecting by name instead of by position (columns[2:]) means this raises a
# KeyError if customer_id / txn_month are not columns of pv_df (for example
# when they are still in the index), rather than silently dropping the first
# two category columns.
rank_cols = pv_df.columns.drop(['customer_id', 'txn_month'])

In [78]:
# Rank the categories within each row by summed selling price; rank 1 is the
# largest value because ascending=False. Missing categories stay NaN.
id_part = pv_df[['customer_id', 'txn_month']]
rank_part = pv_df[rank_cols].rank(axis='columns', ascending=False)

# Put the identifier columns back in front of the per-category ranks.
rank_df = pd.concat([id_part, rank_part], axis=1)

In [79]:
# Per-column summary statistics of the category ranks. The displayed summary
# has no txn_month column, and the customer_id column's statistics describe
# the id values themselves, not ranks.
rank_df.describe()

Unnamed: 0,customer_id,Alcohol,Bakery,DairyJuicesSnacks,FlowersPlants,Fuel,Garden,Grocery,Meat,Miscellaneous,NaturalProducts,PackagedMeat,Pharmaceutical,PreparedFood,Restauarant,Salads,Seafood,SkinHairCare,Travel,Vegetablescut
count,24799.0,817.0,8731.0,5884.0,1238.0,6151.0,291.0,24326.0,11013.0,1837.0,15087.0,16508.0,20492.0,6387.0,118.0,321.0,4307.0,2730.0,219.0,86.0
mean,788.664986,4.304162,5.306838,5.192726,5.348142,2.321167,5.286942,1.1595,4.294289,4.635275,4.602141,3.676854,2.810023,5.271098,6.063559,7.05296,5.245879,5.728755,6.623288,7.424419
std,456.721712,2.063088,2.02302,2.186698,2.203006,1.041675,2.378634,0.445008,1.737213,2.701885,1.564462,1.542956,1.344423,1.995114,2.449094,2.227737,1.977205,2.16935,2.159013,2.339579
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0
25%,394.0,3.0,4.0,3.0,4.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0,2.0,4.0,4.0,6.0,4.0,4.0,5.0,6.0
50%,787.0,4.0,5.0,5.0,5.0,2.0,5.0,1.0,4.0,4.0,4.5,3.0,2.0,5.0,6.0,7.0,5.0,6.0,7.0,8.0
75%,1185.0,6.0,7.0,7.0,7.0,3.0,7.0,1.0,5.0,7.0,6.0,4.0,3.0,7.0,8.0,9.0,7.0,7.0,8.0,9.0
max,1582.0,11.0,13.0,14.0,13.0,10.0,12.0,6.0,12.0,13.0,12.0,12.0,10.0,13.0,13.0,13.0,15.0,13.0,12.0,14.0


In [80]:
# Preview the first five rows of the per-row category ranks.
rank_df.head(n=5)

Unnamed: 0,customer_id,txn_month,Alcohol,Bakery,DairyJuicesSnacks,FlowersPlants,Fuel,Garden,Grocery,Meat,...,NaturalProducts,PackagedMeat,Pharmaceutical,PreparedFood,Restauarant,Salads,Seafood,SkinHairCare,Travel,Vegetablescut
0,1,2012-02,,3.0,,,,,1.0,,...,,2.0,4.0,,,,,,,
1,1,2012-03,,,,,,,1.0,,...,,4.0,2.0,3.0,,,,,,
2,1,2012-04,,3.0,,,,,1.0,5.0,...,,4.0,2.0,6.0,,,,,,
3,1,2012-05,,4.0,,,,,1.0,6.0,...,5.0,3.0,2.0,,,,,,,
4,1,2012-06,,4.0,,,,,1.0,,...,5.0,3.0,2.0,,,,,,,


### Define Label

In [88]:
# Months that may serve as label months: every month present in pv_df except
# the first three of 2012. The order is that of first appearance in pv_df.
excluded_months = ['2012-01', '2012-02', '2012-03']
label_months = [
    month_key
    for month_key in pv_df['txn_month'].unique()
    if month_key not in excluded_months
]

In [90]:
# 'YYYY-MM' strings sort chronologically under plain string ordering.
label_months = sorted(label_months)