In [1]:
import pandas as pd
import numpy as np
import xgboost
import datetime
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler

In [2]:
coupons_train=pd.read_csv("Translated_Data/coupon_list_train.csv", parse_dates=["DISPFROM","DISPEND"])
coupons_test = pd.read_csv("Translated_Data/coupon_list_test.csv")

In [3]:
coupons_train["DISPFROM"].fillna(pd.Timestamp("19000101"), inplace=True)
coupons_train = coupons_train.sort(columns=["DISPFROM"]).reset_index(drop=True)

  from ipykernel import kernelapp as app


In [4]:
def gen_validation(coupons_train,time_delta):
    max_date = coupons_train["DISPFROM"].max()
    valid_start = max_date - time_delta
    coupons_valid = coupons_train[(coupons_train["DISPFROM"] > valid_start)]
    coupons_train = coupons_train[~ (coupons_train["DISPFROM"] > valid_start)]
    return coupons_train,coupons_valid

In [5]:
coupons_train,coupons_valid=gen_validation(coupons_train,datetime.timedelta(days=7))

In [6]:
def remove_outliers(coupons_train,coupons_valid):
    if len(coupons_valid)>0:
        very_long_time_display=coupons_valid[coupons_valid.DISPPERIOD > 20].COUPON_ID_hash
        very_low_price = coupons_valid[coupons_valid.DISCOUNT_PRICE <= 100].COUPON_ID_hash
        coupons_valid = coupons_valid[~coupons_valid.COUPON_ID_hash.isin(very_long_time_display)]
        coupons_valid = coupons_valid[~coupons_valid.COUPON_ID_hash.isin(very_low_price)].reset_index(drop=True)
        
    very_long_time_display = coupons_train[coupons_train.DISPPERIOD > 20].COUPON_ID_hash
    coupons_train = coupons_train[~coupons_train.COUPON_ID_hash.isin(very_long_time_display)].reset_index(drop=True)
    
    return coupons_train,coupons_valid

In [7]:
coupons_train,coupons_valid=remove_outliers(coupons_train,coupons_valid)

In [8]:
 def preprocess(df):
        df["REDUCE_PRICE"] = df["CATALOG_PRICE"] - df["DISCOUNT_PRICE"]
        for key in ["DISCOUNT_PRICE", "CATALOG_PRICE", "REDUCE_PRICE"]:
            df[key + "_LOG"] = np.log(df[key] + 1.0).astype(np.float32)

        df["VALIDPERIOD_NA"] = np.array(pd.isnull(df["VALIDPERIOD"]), dtype=np.int32)
        df["DISPPERIOD_C"] = np.array(df["DISPPERIOD"].clip(0, 8), dtype=np.int32)
        df["PRICE_RATE"] = np.array(df.PRICE_RATE, dtype=np.float32)
        df["large_area_name"].fillna("NA", inplace=True)
        df["ken_name"].fillna("NA", inplace=True)
        df["small_area_name"].fillna("NA", inplace=True)
        df["LARGE_AREA_NAME"] = df["large_area_name"]
        df["PREF_NAME"] = df["large_area_name"] + ":" + df["ken_name"]
        df["SMALL_AREA_NAME"] = df["large_area_name"] + ":" + df["ken_name"] + ":" + df["small_area_name"]
        df["CATEGORY_NAME"] = df["CAPSULE_TEXT"] + df["GENRE_NAME"]

        usable_dates = ['USABLE_DATE_MON',
                        'USABLE_DATE_TUE',
                        'USABLE_DATE_WED',
                        'USABLE_DATE_THU',
                        'USABLE_DATE_FRI',
                        'USABLE_DATE_SAT',
                        'USABLE_DATE_SUN',
                        'USABLE_DATE_HOLIDAY',
                        'USABLE_DATE_BEFORE_HOLIDAY']        
        for key in usable_dates:
            df[key].fillna(0, inplace=True)
        df["USABLE_DATE_SUM"] = 0
        for key in usable_dates:
            df["USABLE_DATE_SUM"] += df[key]

        cols = df.columns.tolist()
        cols.remove("DISPFROM")
        cols.remove("DISPEND")
        for key in cols:
            df[key].fillna("NA", inplace=True)
        return df

In [9]:
coupons_train=preprocess(coupons_train)
coupons_valid=preprocess(coupons_valid)
coupons_test=preprocess(coupons_test)

In [10]:
 coupon_mapper = DataFrameMapper([
                ('CATEGORY_NAME', LabelBinarizer()),
                ('PRICE_RATE', None),
                ('CATALOG_PRICE_LOG', None),
                ('DISCOUNT_PRICE_LOG', None),
                ('REDUCE_PRICE_LOG', None),
                ('DISPPERIOD_C', LabelBinarizer()),
                ('VALIDPERIOD_NA', LabelBinarizer()),
                ('USABLE_DATE_SUM', None),
                ('LARGE_AREA_NAME', LabelBinarizer()),
                ('PREF_NAME', LabelBinarizer()),
                ('SMALL_AREA_NAME', LabelBinarizer()),
                ])

In [11]:
coupon_mapper.fit(pd.concat([coupons_train, coupons_valid, coupons_test]))
train_coupon_vec = coupon_mapper.transform(coupons_train.copy())
valid_coupon_vec = coupon_mapper.transform(coupons_valid.copy())
test_coupon_vec = coupon_mapper.transform(coupons_test.copy())

In [12]:
user_frame = pd.read_csv("Translated_Data/user_list.csv")
details_frame = pd.read_csv("Translated_Data/coupon_detail_train.csv",parse_dates=["I_DATE"])
details_frame = details_frame.sort(columns=["I_DATE"]).reset_index(drop=True)

  app.launch_new_instance()


In [14]:
user_mapper = DataFrameMapper([
                ('SEX_ID', LabelBinarizer()),
                ('PREF_NAME', LabelBinarizer()),
                ('AGE', None),
                ])

In [15]:
user_frame["PREF_NAME"].fillna("NA", inplace=True)
user_vec = user_mapper.fit_transform(user_frame.copy())

In [16]:
users = []
coupons_train["ROW_ID"] = pd.Series(coupons_train.index.tolist())
coupons_valid["ROW_ID"] = pd.Series(coupons_valid.index.tolist())

for i, user in user_frame.iterrows():
    coupons = details_frame[details_frame.USER_ID_hash.isin([user["USER_ID_hash"]])]
    train_coupon_data = pd.merge(coupons[["COUPON_ID_hash","ITEM_COUNT","I_DATE"]],
                                         coupons_train,
                                         on="COUPON_ID_hash", how='inner',
                                         suffixes=["_x",""], copy=False)
    train_coupon_data = train_coupon_data.sort(columns=["I_DATE"])
    row_ids = train_coupon_data.ROW_ID.unique().tolist()

    valid_coupon_data = pd.merge(coupons[["COUPON_ID_hash","ITEM_COUNT","I_DATE"]],
                                         coupons_valid, on="COUPON_ID_hash",
                                         how='inner', suffixes=["_x",""], copy=False)
    valid_coupon_data = valid_coupon_data.sort(columns=["I_DATE"])
    valid_row_ids = valid_coupon_data.ROW_ID.unique().tolist()

    users.append({"user": user_vec[i],
                    "coupon_ids": row_ids,
                    "valid_coupon_ids": valid_row_ids})



In [17]:
def maxmin_columns(coupons_train,coupon_ids):
    return coupons_train.ix[
        coupon_ids, ("CATALOG_PRICE","DISCOUNT_PRICE")
        ].as_matrix().astype(np.float32)

In [18]:
def purchase_history_features(train_coupon_vec, user_coupon_vec, maxmin_columns, filter_idx=None):
        sum_vec = np.zeros(2, dtype=np.float32)
        maxmin_vec = np.zeros((4), dtype=np.float32)
        mean_coupon_vec = np.zeros(len(train_coupon_vec[0]), dtype=np.float32)
        
        if filter_idx is not None:
            if len(user_coupon_vec[filter_idx]) > 0:
                mean_coupon_vec[:] = user_coupon_vec[filter_idx].mean(0)
                sum_vec[0] = filter_idx.sum()
                sum_vec[1] = np.log(sum_vec[0] + 1.0)
                max_val = maxmin_columns[filter_idx].max(0)
                min_val = maxmin_columns[filter_idx].min(0)
                maxmin_vec[0] = max_val[0]
                maxmin_vec[1] = min_val[0]
                maxmin_vec[2] = max_val[1]
                maxmin_vec[3] = min_val[1]
        else:
            if len(user_coupon_vec) > 0:
                mean_coupon_vec = user_coupon_vec.mean(0)
                sum_vec[0] = len(user_coupon_vec)
                sum_vec[1] = np.log(sum_vec[0] + 1.0)
                max_val = maxmin_columns.max(0)
                min_val = maxmin_columns.min(0)
                maxmin_vec[0] = max_val[0]
                maxmin_vec[1] = min_val[0]
                maxmin_vec[2] = max_val[1]
                maxmin_vec[3] = min_val[1]

        return np.hstack((mean_coupon_vec, sum_vec, maxmin_vec))

In [19]:
COUPON_DISP_NEAR = 400
COUPON_DISP_NEAR_MIN = 10

In [20]:
def gen_train_data(num_nega=2, verbose=True):
        x = []
        y = []
        for user in users:
            coupon_ids = np.array(user["coupon_ids"], dtype=np.int32)
            user_coupons = train_coupon_vec[coupon_ids]
            maxmin_c = maxmin_columns(coupons_train,coupon_ids)
            for i in xrange(len(user_coupons)):
                target_coupon_vec = user_coupons[i]
                rid = coupon_ids[i]
                nega_list = range(max(0, rid - COUPON_DISP_NEAR), rid)
                if len(nega_list) < COUPON_DISP_NEAR_MIN:
                    continue

                filter_idx = np.ones(user_coupons.shape[0], dtype=np.bool)
                
                # exclude coupons that was purchased after the target coupon
                filter_idx[i:] = False
                # exclude the target coupon (and remove duplicate)
                filter_idx[coupon_ids == coupon_ids[i]] = False
                
                hist_feat = purchase_history_features(train_coupon_vec,user_coupons,
                                                             maxmin_c,
                                                             filter_idx)
                # feature vector (user_feature + purchase_history_feature + coupon_feature)
                purchased_feat = np.hstack((user["user"], hist_feat, target_coupon_vec))
                x.append(purchased_feat)
                y.append([1]) # posi

                # select random unpurchased coupons
                for j in xrange(num_nega):
                    found = False
                    for _ in xrange(10):
                        unpurchased_idx = np.random.choice(nega_list, 1)[0]
                        if unpurchased_idx not in user["coupon_ids"]:
                            found = True
                            break
                    if found:
                        unpurchased_feat = np.hstack((user["user"],
                                                      hist_feat,
                                                      train_coupon_vec[unpurchased_idx]))
                        x.append(unpurchased_feat)
                        y.append([0]) # nega

        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.int32)

        return x, y

In [21]:
def gen_valid_data(num_nega=5, verbose=True):
        x = []
        y = []
        for user in users:
            coupon_ids = np.array(user["coupon_ids"], dtype=np.int32)
            user_coupons = train_coupon_vec[coupon_ids]
            valid_coupon_ids=np.array(user["valid_coupon_ids"], dtype=np.int32)
            valid_user_coupons=valid_coupon_vec[valid_coupon_ids]
            maxmin_c = maxmin_columns(coupons_train,coupon_ids)
            hist_feat = purchase_history_features(train_coupon_vec, user_coupons, maxmin_c)
            
            for i in xrange(len(valid_user_coupons)):
                target_coupon_vec = valid_user_coupons[i]
                #rid = coupon_ids[i]
                
                # feature vector (user_feature + purchase_history_feature + coupon_feature)
                purchased_feat = np.hstack((user["user"], hist_feat, target_coupon_vec))
                x.append(purchased_feat)
                y.append(1) # posi

                # select random unpurchased coupons
                for j in xrange(num_nega):
                    found = False
                    for _ in xrange(10):
                        unpurchased_idx = np.random.choice(range(len(valid_coupon_vec)), 1)[0]
                        if unpurchased_idx not in user["valid_coupon_ids"]:
                            found = True
                            break
                    if found:
                        unpurchased_feat = np.hstack((user["user"],
                                                      hist_feat,
                                                      valid_coupon_vec[unpurchased_idx]))
                        x.append(unpurchased_feat)
                        y.append(0) # nega

        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.int32)

        return x, y

In [22]:
train_feature,train_label=gen_train_data()

In [23]:
valid_feature,valid_label=gen_valid_data()

In [None]:
dtrain = xgboost.DMatrix(train_feature, label = train_label)
dvalid = xgboost.DMatrix(valid_feature, label = valid_label)
param = {'max_depth':7, 'eta':0.05, 'objective':'binary:logistic', 'subsample':0.9, 'colsample_bytree':0.5, 
         'metric':'auc'}
watchlist  = [(dvalid,'eval'), (dtrain,'train')]
num_rounds=1500
bst = xgboost.train(param, dtrain, num_rounds, watchlist)

In [36]:
user_predictions={}
def predict_for_test() :
    test_user_coupons=test_coupon_vec
    user_id=0
    for user in users:
        coupon_ids = np.array(user["coupon_ids"], dtype=np.int32)
        user_coupons = train_coupon_vec[coupon_ids]
        maxmin_c = maxmin_columns(coupons_train,coupon_ids)
        hist_feat = purchase_history_features(train_coupon_vec, user_coupons, maxmin_c)
        
        user_feat=np.hstack((user["user"],hist_feat))
        user_feat_rep=np.array([list(user_feat)]*len(test_coupon_vec))
        purchased_feats=np.hstack((user_feat_rep,test_coupon_vec))
        
        dtest = xgboost.DMatrix(purchased_feats)
        conf_predicted=bst.predict(dtest)
        
        conf_predicted=[(conf_predicted[i],i) for i in range(len(conf_predicted))]
        conf_predicted.sort()
        top10=[i for (c,i) in conf_predicted[-10:]]
        
        user_predictions[user_frame['USER_ID_hash'][user_id]]=[coupons_test['COUPON_ID_hash'][i] for i in top10]
        user_id+=1
        if user_id%1000==0:
            print user_id

In [37]:
predict_for_test()

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000


In [38]:
print user_predictions[user_predictions.keys()[0]]
print user_predictions[user_predictions.keys()[1]]

['2799a41a862fed399ae7ca699c83f3a3', '3d5c0b4c9e35377c0df5e1e7efe1da42', 'c9e1dcbd8c98f919bf85ab5f2ea30a9d', '42cc500acba3c79883cfd40adcd5ae96', 'f1f00137ca89c6bb32f366ef5f66a001', '2af19a2244a2c2466b87b98e065cdfa7', 'ca8ea3d52ca939d6ab1b9c792baa6169', 'd79a889ee9d0712607a2672e96ba3d69', '27741884a086e2864936d7ef680becc2', '5e47b887e154f746883013f863c3ffe1']
['7ae4e60eab2e4d7e20f88fc19267e87c', '7c9eb3afbf373124bbe21f80b38f413a', '1bad24bc593914e7970f4ae2fb94c203', '625f1962d2c6b55c5f61def56a49dd21', 'd4e29fa02359c30bedc401ea197ce6a2', '529fa18083e92247ea20e877d5b5bb16', '00fcc93438a282f8b915777a209dd0bd', '3810431a7769cfcc3201383b5e83248c', '300d583837219793a0fbb9cb5844bd24', '27741884a086e2864936d7ef680becc2']


In [39]:
with open('output.csv', 'w') as f:
    f.write('USER_ID_hash,PURCHASED_COUPONS\n')
    for user in user_predictions.keys():
        f.write(user+','+' '.join(user_predictions[user])+'\n')

In [147]:
for user in user_predictions.keys():
    base=user_predictions[user]
    break
for user in user_predictions.keys():
    if user_predictions[user]!=base:
        print "Mismatch"

In [161]:
uf=user_frame.copy()
uf.sort(columns=['USER_ID_hash'])
uf['USER_ID_hash'].sort()
#uf=user_frame['USER_ID_hash']
#uf.sort(user_frame.copy())

  from ipykernel import kernelapp as app
  app.launch_new_instance()


ValueError: This Series is a view of some other array, to sort in-place you must create a copy