Coupon Redemption for Credit Card Campaign
==============

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [4]:
# Keep seaborn's "deep" palette in `colors` for reuse in later cells.
colors = sns.color_palette("deep")

# Load Data

In [2]:
# All input CSVs live in a single folder; file names are resolved against it.
data_path = '../data/Predicting Coupon Redemption'


def _read_table(file_name):
    """Load one CSV located under ``data_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, file_name))


train_df = _read_table('train.csv')
demo_df = _read_table('customer_demographics.csv')
cmpn_df = _read_table('campaign_data.csv')
cp_it_df = _read_table('coupon_item_mapping.csv')
txn_df = _read_table('customer_transaction_data.csv')
item_df = _read_table('item_data.csv')

# Prepare Data

In [11]:
# Preview the first rows of the training table.
train_df.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [3]:
# Report the number of distinct values of each identifier column in train_df.
for id_column, message in (
    ('customer_id', "there are {} unique customers"),
    ('campaign_id', "there are {} unique campaigns"),
    ('coupon_id', "there are {} unique coupons"),
):
    print(message.format(train_df[id_column].nunique()))

there are 1428 unique customers
there are 18 unique campaigns
there are 866 unique coupons


Combine train_df with demo_df

In [7]:
# Left join: every train_df row is kept; demographic columns are attached by customer_id.
join_df = pd.merge(train_df, demo_df, how='left', on='customer_id')
join_df.shape

(78369, 11)

Combine with campaign data

In [5]:
# Parse the campaign window boundaries into Timestamps. Parsing is kept element-wise,
# exactly as before, so every value is interpreted the same way it was.
# NOTE(review): if campaign_data.csv stores day-first dates (e.g. dd/mm/yy), per-element
# inference may read ambiguous values month-first -- confirm the format and pass it explicitly.
cmpn_df['start_date'] = cmpn_df['start_date'].apply(pd.to_datetime)
cmpn_df['end_date'] = cmpn_df['end_date'].apply(pd.to_datetime)

# Campaign length in whole days. `.dt.days` replaces `.astype('timedelta64[D]')`, which
# pandas 2.x rejects (timedelta casts only support 's', 'ms', 'us' and 'ns' resolutions).
# The trailing astype(int) keeps the column's integer dtype the same as before.
cmpn_df['campaign_duration'] = (cmpn_df['end_date'] - cmpn_df['start_date']).dt.days.astype(int)

In [8]:
# Attach the campaign attributes (including campaign_duration) to each row by campaign_id.
join_df = pd.merge(join_df, cmpn_df, how='left', on='campaign_id')
join_df.shape

(78369, 15)

Combine with coupon and item data

In [9]:
# One row per (coupon, item) pair, enriched with that item's brand, brand type and category.
coupon_df = pd.merge(cp_it_df, item_df, how='left', on='item_id')
coupon_df.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,Local,Grocery
1,107,75,56,Local,Grocery
2,494,76,209,Established,Grocery
3,522,77,278,Established,Grocery
4,518,77,278,Established,Grocery


In [10]:
# aggregate by coupon_id
agg_coupon_df = coupon_df.groupby('coupon_id').agg({'item_id': 'unique', 'category': 'unique', 'brand_type': 'unique', 'brand': 'unique'} )
agg_coupon_df['n_items'] = agg_coupon_df['item_id'].str.len()
agg_coupon_df['n_categories'] = agg_coupon_df['category'].str.len()
agg_coupon_df['n_brand_types'] = agg_coupon_df['brand_type'].str.len()
agg_coupon_df['n_brands'] = agg_coupon_df['brand'].str.len()
agg_coupon_df['brand_type'] = agg_coupon_df['brand_type'].apply(np.sort)
agg_coupon_df['brand_type'] = agg_coupon_df['brand_type'].str.join("/")
agg_coupon_df.head()

Unnamed: 0_level_0,item_id,category,brand_type,brand,n_items,n_categories,n_brand_types,n_brands
coupon_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,"[60068, 50199, 17091, 44112, 44868, 51028, 592...","[Natural Products, Grocery]",Established,"[4700, 1475, 1558]",39,2,1,3
2,"[2581, 12901]",[Grocery],Established,[2084],2,1,1,1
3,"[58906, 58943, 58944, 58946, 58964, 58972, 590...",[Grocery],Established,"[1558, 278]",17,1,1,2
4,"[36772, 51649, 51212, 51221, 51243, 51598, 528...",[Grocery],Established,[544],24,1,1,1
5,"[57118, 53496, 57079, 46144, 46006, 44994, 57016]",[Pharmaceutical],Established,[5357],7,1,1,1


In [12]:
# Attach the per-coupon aggregates; agg_coupon_df is indexed by coupon_id, which serves as the key.
join_df = pd.merge(join_df, agg_coupon_df, how='left', on='coupon_id')
join_df.shape

(78369, 23)

### Transaction Data

In [72]:
# Preview the first rows of the customer transaction table.
txn_df.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0
