In [3]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
np.random.seed(203)
from tqdm import tqdm
import datetime
from itertools import combinations
from collections import Counter
import itertools
import re

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import Binarizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score , recall_score , f1_score
import lightgbm as lgb
from scipy import stats
from scipy.spatial.distance import mahalanobis

#### Train_Data

In [134]:
campaign_data = pd.read_csv('/Users/s0c02nj/Desktop/AmExpert/train_AUpWtIz/campaign_data.csv')
coupon_item_mapping = pd.read_csv('/Users/s0c02nj/Desktop/AmExpert/train_AUpWtIz/coupon_item_mapping.csv')
customer_demographics = pd.read_csv('/Users/s0c02nj/Desktop/AmExpert/train_AUpWtIz/customer_demographics.csv')
customer_transaction_data = pd.read_csv('/Users/s0c02nj/Desktop/AmExpert/train_AUpWtIz/customer_transaction_data.csv')
item_data = pd.read_csv('/Users/s0c02nj/Desktop/AmExpert/train_AUpWtIz/item_data.csv')
train = pd.read_csv('/Users/s0c02nj/Desktop/AmExpert/train_AUpWtIz/train.csv')

#### Test_Data

In [127]:
test = pd.read_csv('/Users/s0c02nj/Desktop/AmExpert/test_QyjYwdj.csv')

In [128]:
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


#### Campaign Data

In [135]:
campaign_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,21/10/13,20/12/13
1,25,Y,21/10/13,22/11/13
2,20,Y,07/09/13,16/11/13
3,23,Y,08/10/13,15/11/13
4,21,Y,16/09/13,18/10/13


In [143]:
campaign_data[campaign_data['duration']<0]

Unnamed: 0,campaign_id,campaign_type,start_date,end_date,strt_wk,strt_mnth,strt_day,end_wk,end_mnth,end_day,duration
6,18,X,2013-10-08,2013-04-10,41,10,8,15,4,10,-181
10,13,X,2013-05-19,2013-05-07,20,5,19,19,5,7,-12
17,6,Y,2013-01-28,2013-01-03,5,1,28,1,1,3,-25
19,5,Y,2013-12-01,2013-02-15,48,12,1,7,2,15,-289
27,26,X,2012-12-08,2012-09-21,49,12,8,38,9,21,-78


In [140]:
campaign_data['start_date'] = pd.to_datetime(campaign_data['start_date'],infer_datetime_format=True)
campaign_data['end_date'] = pd.to_datetime(campaign_data['end_date'],infer_datetime_format=True)

In [141]:
campaign_data['strt_wk'] = campaign_data['start_date'].dt.week
campaign_data['strt_mnth'] = campaign_data['start_date'].dt.month
campaign_data['strt_day'] = campaign_data['start_date'].dt.day

campaign_data['end_wk'] = campaign_data['end_date'].dt.week
campaign_data['end_mnth'] = campaign_data['end_date'].dt.month
campaign_data['end_day'] = campaign_data['end_date'].dt.day

campaign_data['duration'] = campaign_data['end_date'] - campaign_data['start_date']
campaign_data['duration'] = campaign_data['duration'].apply(lambda x: x.days)

In [123]:
campaign_data = campaign_data.drop(['start_date','end_date'],axis=1)

In [71]:
item_data.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery


In [49]:
customer_demographics.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [50]:
customer_transaction_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


#### Baseline Model

In [70]:
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [81]:
y = train['redemption_status']
train_x = train.drop(['redemption_status'],axis=1)
test_x = test

df_join = pd.concat([train_x,test_x])

In [83]:
df1 = pd.merge(left = df_join, right = customer_demographics, 
               how='left', left_on=['customer_id'], right_on=['customer_id'])


In [86]:
df1.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,13,27,1053,46-55,,0.0,1.0,,5.0
1,2,13,116,48,36-45,Married,0.0,2.0,,3.0
2,6,9,635,205,46-55,Married,0.0,2.0,,7.0
3,7,13,644,1050,,,,,,
4,9,8,1017,1489,46-55,Married,0.0,2.0,,3.0


In [110]:
df2 = pd.merge(left = df1, right = campaign_data, 
                     how='left', left_on=['campaign_id'], right_on=['campaign_id'])

In [111]:
df2.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,campaign_type,strt_wk,strt_mnth,strt_day,end_wk,end_mnth,end_day,duration
0,1,13,27,1053,46-55,,0.0,1.0,,5.0,X,20,5,19,19,5,7,-12
1,2,13,116,48,36-45,Married,0.0,2.0,,3.0,X,20,5,19,19,5,7,-12
2,6,9,635,205,46-55,Married,0.0,2.0,,7.0,Y,44,11,3,49,12,4,31
3,7,13,644,1050,,,,,,,X,20,5,19,19,5,7,-12
4,9,8,1017,1489,46-55,Married,0.0,2.0,,3.0,X,7,2,16,18,5,4,77


In [68]:
customer_transaction_data[customer_transaction_data['customer_id'] == 1501]

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.00
1,2012-01-02,1501,54253,1,53.43,-13.89,0.00
2,2012-01-02,1501,31962,1,106.50,-14.25,0.00
3,2012-01-02,1501,33647,1,67.32,0.00,0.00
4,2012-01-02,1501,48199,1,71.24,-28.14,0.00
5,2012-01-02,1501,57397,1,71.24,-28.14,0.00
2732,2012-01-13,1501,5267,1,99.38,0.00,0.00
2733,2012-01-13,1501,8420,1,70.88,-21.37,0.00
2734,2012-01-13,1501,8761,1,149.25,0.00,0.00
2735,2012-01-13,1501,11248,1,338.03,0.00,0.00


In [56]:
train['campaign_id'].unique().shape

(18,)

In [55]:
test['campaign_id'].unique().shape

(10,)