In [23]:
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm_notebook as tqdm
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from matplotlib import pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
from scipy.special import logit

#train, test, campaign_data, coupon_item_mapping, customer_demographics, customer_transaction_data,item_data,submission,


train=pd.read_csv("train.csv")
test=pd.read_csv("test_QyjYwdj.csv")
campaign_data=pd.read_csv("campaign_data.csv")
coupon_item_mapping=pd.read_csv("coupon_item_mapping.csv")
customer_demographics=pd.read_csv("customer_demographics.csv")
customer_transaction_data=pd.read_csv("customer_transaction_data.csv")
item_data=pd.read_csv("item_data.csv")
submission=pd.read_csv("sample_submission_Byiv0dS.csv")

In [24]:
data = pd.concat([train, test], sort=False).reset_index(drop = True)
print ("Train_length:", len(train))

Train_length: 78369


In [25]:
data = data.merge(campaign_data, on='campaign_id')#  campaign_data
data['start_date'] = pd.to_datetime(data['start_date'], dayfirst=True)
data['end_date'] = pd.to_datetime(data['end_date'], dayfirst=True)
data['campaign_type'] = pd.Series(data['campaign_type'].factorize()[0]).replace(-1, np.nan)

In [26]:
# customer_demographics
# customer_demographics
customer_demographics['no_of_children'] = customer_demographics['no_of_children'].replace('3+', 3).astype(float)
customer_demographics['family_size'] = customer_demographics['family_size'].replace('5+', 3).astype(float)
customer_demographics['marital_status'] = pd.Series(customer_demographics['marital_status'].factorize()[0]).replace(-1, np.nan)
customer_demographics['age_range'] = pd.Series(customer_demographics['age_range'].factorize()[0]).replace(-1, np.nan)

In [27]:
# rented
rented_mean = customer_demographics.groupby("customer_id")['rented'].mean().to_dict()
data['rented_mean'] = data['customer_id'].map(rented_mean)
# income_bracket
income_bracket_sum = customer_demographics.groupby("customer_id")['income_bracket'].sum().to_dict()
data['income_bracket_sum'] = data['customer_id'].map(income_bracket_sum)
# age_range
age_range_mean = customer_demographics.groupby("customer_id")['age_range'].mean().to_dict()
data['age_range_mean'] = data['customer_id'].map(age_range_mean)
# family_size
family_size_mean = customer_demographics.groupby("customer_id")['family_size'].mean().to_dict()
data['family_size_mean'] = data['customer_id'].map(family_size_mean)
# no_of_children
no_of_children_mean = customer_demographics.groupby("customer_id")['no_of_children'].mean().to_dict()
data['no_of_children_mean'] = data['customer_id'].map(no_of_children_mean)
no_of_children_count = customer_demographics.groupby("customer_id")['no_of_children'].count().to_dict()
data['no_of_children_count'] = data['customer_id'].map(no_of_children_count)
# marital_status
marital_status_count = customer_demographics.groupby("customer_id")['marital_status'].count().to_dict()
data['marital_status_count'] = data['customer_id'].map(marital_status_count)

In [28]:
#############################################################################
# customer_transaction_data
customer_transaction_data['date'] = pd.to_datetime(customer_transaction_data['date'])
# quantity	
quantity_mean = customer_transaction_data.groupby("customer_id")['quantity'].mean().to_dict()
data['quantity_mean'] = data['customer_id'].map(quantity_mean)
#coupon_discount
coupon_discount_mean = customer_transaction_data.groupby("customer_id")['coupon_discount'].mean().to_dict()
data['coupon_discount_mean'] = data['customer_id'].map(coupon_discount_mean)
# other_discount
other_discount_mean = customer_transaction_data.groupby("customer_id")['other_discount'].mean().to_dict()
data['other_discount_mean'] = data['customer_id'].map(other_discount_mean)
# day
customer_transaction_data['day'] = customer_transaction_data.date.dt.day
date_day_mean = customer_transaction_data.groupby("customer_id")['day'].mean().to_dict()
data['date_day_mean'] = data['customer_id'].map(date_day_mean)

In [29]:
#coupon_item_mapping, item_data
coupon_item_mapping = coupon_item_mapping.merge(item_data, how = 'left', on = 'item_id')
coupon_item_mapping['brand_type'] = pd.Series(coupon_item_mapping['brand_type'].factorize()[0]).replace(-1, np.nan)
coupon_item_mapping['category'] = pd.Series(coupon_item_mapping['category'].factorize()[0]).replace(-1, np.nan)

In [30]:
category = coupon_item_mapping.groupby("coupon_id")['category'].mean().to_dict()
data['category_mean'] = data['coupon_id'].map(category)
category = coupon_item_mapping.groupby("coupon_id")['category'].count().to_dict()
data['category_count'] = data['coupon_id'].map(category)
category = coupon_item_mapping.groupby("coupon_id")['category'].nunique().to_dict()
data['category_nunique'] = data['coupon_id'].map(category)
category = coupon_item_mapping.groupby("coupon_id")['category'].max().to_dict()
data['category_max'] = data['coupon_id'].map(category)
category = coupon_item_mapping.groupby("coupon_id")['category'].min().to_dict()
data['category_min'] = data['coupon_id'].map(category)

brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].mean().to_dict()
data['brand_mean'] = data['coupon_id'].map(brand_mean)
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].count().to_dict()
data['brand_count'] = data['coupon_id'].map(brand_mean)
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].min().to_dict()
data['brand_min'] = data['coupon_id'].map(brand_mean)
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].max().to_dict()
data['brand_max'] = data['coupon_id'].map(brand_mean)
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].nunique().to_dict()
data['brand_nunique'] = data['coupon_id'].map(brand_mean)

In [31]:
# selling_price
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].mean().to_dict()
data['selling_price_mean'] = data['customer_id'].map(selling_price_mean)
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].sum().to_dict()
data['selling_price_sum'] = data['customer_id'].map(selling_price_mean)
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].min().to_dict()
data['selling_price_min'] = data['customer_id'].map(selling_price_mean)
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].max().to_dict()
data['selling_price_max'] = data['customer_id'].map(selling_price_mean)
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].nunique().to_dict()
data['selling_price_nunique'] = data['customer_id'].map(selling_price_mean)
train_cols = [i for i in data.columns if i not in ['id','redemption_status','start_date','end_date']]
train_cols = ['campaign_id','coupon_id','campaign_type','rented_mean','income_bracket_sum','age_range_mean','family_size_mean',
 'no_of_children_mean',
 'no_of_children_count',
 'marital_status_count',
 'quantity_mean',
 'coupon_discount_mean',
 'other_discount_mean',
 'date_day_mean',
 'category_mean',
 'category_nunique',
 'category_max',
 'category_min',
 'brand_mean',
 'brand_max',
 'brand_nunique',
 'selling_price_mean',
 'selling_price_min',
 'selling_price_nunique']
train = data[data['redemption_status'].notnull()]
test = data[data['redemption_status'].isnull()]

In [32]:
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,start_date,end_date,rented_mean,income_bracket_sum,...,brand_mean,brand_count,brand_min,brand_max,brand_nunique,selling_price_mean,selling_price_sum,selling_price_min,selling_price_max,selling_price_nunique
0,1,13,27,1053,0.0,0,2013-05-19,2013-07-05,0.0,5.0,...,1364.128,125,1105,1636,2,184.260484,57120.75,17.45,5164.54,129
1,2,13,116,48,0.0,0,2013-05-19,2013-07-05,0.0,3.0,...,56.0,3,56,56,1,234.247013,90185.1,7.12,1758.92,114
2,7,13,644,1050,0.0,0,2013-05-19,2013-07-05,,,...,611.0,4,611,611,1,98.276034,23291.42,13.89,708.48,84
3,21,13,1028,89,0.0,0,2013-05-19,2013-07-05,0.0,3.0,...,1639.0,6,1639,1639,1,115.576332,77204.99,10.33,1246.7,186
4,23,13,517,1067,0.0,0,2013-05-19,2013-07-05,0.0,5.0,...,261.0,3,261,261,1,115.829742,112354.85,3.56,1905.31,186


In [33]:
test.shape

(50226, 34)

In [34]:
submission.shape

(50226, 2)

In [36]:
# Model
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score as auc
from catboost import Pool, CatBoostClassifier
from category_encoders import TargetEncoder
def run_cv_model(train, test, target, model_fn, params={}, eval_fn=None, label='model', n_folds=5):
    kf = StratifiedKFold(n_splits=n_folds, shuffle = True, random_state = 228)
    fold_splits = kf.split(train, target)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0]))
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = test.columns
    i = 1
    for dev_index, val_index in fold_splits:
        print('-------------------------------------------')
        print('Started ' + label + ' fold ' + str(i) + f'/{n_folds}')
        dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
        dev_y, val_y = target.iloc[dev_index], target.iloc[val_index]
        params2 = params.copy()
        pred_val_y, pred_test_y, fi = model_fn(dev_X, dev_y, val_X, val_y, test, params2)
        feature_importances[f'fold_{i}'] = fi
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print(label + ' cv score {}: {}'.format(i, cv_score), '\n')
        i += 1
    print('{} cv scores : {}'.format(label, cv_scores))
    print('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv std score : {}'.format(label, np.std(cv_scores)))
    pred_full_test = pred_full_test / n_folds
    results = {'label': label,
              'train': pred_train, 'test': pred_full_test,
              'cv': cv_scores, 'fi': feature_importances}
    return results


def runCAT(train_X, train_y, test_X, test_y, test_X2, params):
    # Pool the data and specify the categorical feature indices
    print('Pool Data')
    _train = Pool(train_X, label=train_y)
    _valid = Pool(test_X, label=test_y)    
    print('Train CAT')
    model = CatBoostClassifier(**params)
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=1000,
                          plot=False)
    feature_im = fit_model.feature_importances_
    print('Predict 1/2')
    pred_test_y = logit(fit_model.predict_proba(test_X)[:, 1])
    print('Predict 2/2')
    pred_test_y2 = logit(fit_model.predict_proba(test_X2)[:, 1])
    return pred_test_y, pred_test_y2, feature_im

In [37]:
# Use some baseline parameters
cat_params = {'loss_function': 'CrossEntropy', 
              'eval_metric': "AUC",
              'learning_rate': 0.01,
              'iterations': 10000,
              'random_seed': 42,
              'od_type': "Iter",
              'early_stopping_rounds': 150,
             }

n_folds = 10
results = run_cv_model(train[train_cols].fillna(0), test[train_cols].fillna(0), train['redemption_status'], runCAT, cat_params, auc, 'cat', n_folds=n_folds)
day = 4
sub = 2
name = f"day_{day}_sub_{sub}"
tmp = dict(zip(test.id.values, results['test']))

-------------------------------------------
Started cat fold 1/10
Pool Data
Train CAT
0:	test: 0.6800045	best: 0.6800045 (0)	total: 217ms	remaining: 36m 10s
1000:	test: 0.9241176	best: 0.9241529 (998)	total: 51.1s	remaining: 7m 39s
2000:	test: 0.9284086	best: 0.9285127 (1979)	total: 1m 37s	remaining: 6m 29s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.9289273288
bestIteration = 2290

Shrink model to first 2291 iterations.
Predict 1/2
Predict 2/2
cat cv score 1: 0.9289273288024108 

-------------------------------------------
Started cat fold 2/10
Pool Data
Train CAT
0:	test: 0.6983628	best: 0.6983628 (0)	total: 61.5ms	remaining: 10m 15s
1000:	test: 0.9147488	best: 0.9147629 (999)	total: 47.8s	remaining: 7m 9s
2000:	test: 0.9235424	best: 0.9235795 (1992)	total: 1m 34s	remaining: 6m 16s
3000:	test: 0.9271541	best: 0.9272653 (2943)	total: 2m 20s	remaining: 5m 28s
4000:	test: 0.9288162	best: 0.9288479 (3995)	total: 3m 7s	remaining: 4m 40s
Stopped by overfitting dete

In [38]:
Prediction_Outcome = pd.DataFrame()
Prediction_Outcome['id'] = test.id.values
Prediction_Outcome['redemption_status'] = Prediction_Outcome['id'].map(tmp)
Prediction_Outcome.to_csv('Prediction.csv', index = None)