In [1]:
%matplotlib inline

import os
import pandas as pd 
import numpy as np
import seaborn as sns
import datetime as dt
import matplotlib as mpl
from matplotlib import pyplot as plt
from xgboost import plot_importance
from sklearn.metrics import roc_auc_score
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as LGB
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB


from IPython.display import display
import warnings
warnings.filterwarnings('ignore')


mpl.rc("figure", figsize=(100, 70))
mpl.rcParams.update({'font.size':25})

pd.set_option('display.max_columns', 500)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  from numpy.core.umath_tests import inner1d


# Loading data

In [2]:
merged_data = pd.read_csv("../data/train/merged_data_after_feature_engineering.csv")

## Separating the train test features

In [3]:
train = merged_data[merged_data.is_test ==0]
test = merged_data[merged_data.is_test ==1]

train['redemption_status'] =train['redemption_status'].astype('int64')
train

Unnamed: 0,campaign_id,coupon_id,customer_id,id,is_test,redemption_status,campaign_type,campaign_start_date,campaign_end_date,coupon_available_on_item_ids,customer_age_range,customer_marital_status,is_customer_rented,customer_family_size,customer_no_of_children,customer_income_bracket,is_common_demographic_info_available_for_customer,no_of_items_coupon_is_available_for,is_coupon_available_for_multiple_item_ids,no_of_categories_of_items_available_under_coupon,coupon_available_on_category_1,coupon_available_on_category_2,coupon_available_on_brand_1,coupon_available_on_brand_2,coupon_more_focussed_on_brand_type,popularity_of_category1,popularity_of_category2,popularity_of_brand1,popularity_of_brand2,mean_discount_on_category1,mean_discount_on_category2,mean_discount_on_brand1,mean_discount_on_brand2,no_of_times_customer_bought_category1,no_of_times_customer_bought_category2,no_of_times_customer_bought_brand1,no_of_times_customer_bought_brand2,no_of_times_customer_bought_category1_in_discount,no_of_times_customer_bought_category2_in_discount,no_of_times_customer_bought_brand1_in_discount,no_of_times_customer_bought_brand2_in_discount,has_customer_ever_bought_category1,has_customer_ever_bought_category2,has_customer_ever_bought_brand1,has_customer_ever_bought_brand2,has_customer_ever_bought_category1_in_discount,has_customer_ever_bought_category2_in_discount,has_customer_ever_bought_brand1_in_discount,has_customer_ever_bought_brand2_in_discount,average_selling_price_per_unit_of_category1_brand1_combination,average_selling_price_per_unit_of_category2_brand2_combination,average_transaction_amount_for_customer_per_item_of_category1,average_transaction_amount_for_customer_per_item_of_category2,is_customers_spending_habit_on_category1_in_favour_of_category1_brand1_combo,is_customers_spending_habit_on_category1_in_favour_of_category2_brand2_combo,campaign_duration_in_days,was_user_active_during_campaign_period,does_user_significantly_prefer_focussed_brand_type,does_user_prefer_category1_from_brand1,does_user_prefer_category1_from_brand2,does_user_prefer_any_of_brand1_brand2_for_category1,total_spends_of_customer,total_spends_category_of_customer,total_coupon_discount_claimed_by_customer,coupon_claiming_category_of_customer,most_bought_category_of_user,is_coupon_category1_same_as_customer_most_bought_category,is_brand1_same_as_most_favourite_brand_of_user_in_category1,is_brand2_same_as_second_most_favourite_brand_of_user_in_category1,is_brand2_same_as_most_favourite_brand_of_user_in_category1,is_brand1_same_as_second_most_favourite_brand_of_user_in_category1,are_the_brands_customers_favourite_brands_for_category1
50226,13,27,1053,1,0,0,0,2013-05-19,2013-07-05,"38088,40251,38591,36817,56951,40715,38374,3960...",4,0,0.0,1,0,5.0,1,124,1,1,3,0,1105,1636,0,0.711382,0.0,0.003298,0.001451,22.507003,0.0,61.319363,70.109160,214,0,0,0,103,0,0,0,1,0,0,0,1,0,0,0,75.475914,-1.0,68.848024,0.0,0,1,47,1,0,0,0,0,57120.75,1,89.05,2,1,1,0,0,0,0,0
50227,13,116,48,2,0,0,0,2013-05-19,2013-07-05,36721395814157,3,1,0.0,2,0,3.0,1,2,1,1,3,0,56,-1,1,0.711382,0.0,0.264804,0.000000,22.507003,0.0,18.967506,0.000000,238,0,91,0,160,0,59,0,1,0,1,0,1,0,1,0,50.601550,-1.0,63.827453,0.0,1,1,47,1,0,1,0,1,90185.10,2,1237.79,4,1,1,1,0,0,0,1
50228,9,635,205,6,0,0,1,2013-03-11,2013-04-12,"12028,23534,28856,30989,27428,36001,12779,1341...",4,1,0.0,2,0,7.0,1,66,1,1,6,0,560,-1,0,0.117204,0.0,0.000129,0.000000,11.490816,0.0,47.960747,0.000000,86,0,0,0,23,0,0,0,1,0,0,0,1,0,0,0,233.863843,-1.0,127.284608,0.0,0,1,32,1,1,0,0,0,117461.66,3,2145.72,5,1,0,0,0,0,0,0
50229,13,644,1050,7,0,0,0,2013-05-19,2013-07-05,45243569466596212342,-1,-1,-1.0,-1,-1,-1.0,0,3,1,1,3,0,611,-1,0,0.711382,0.0,0.000390,0.000000,22.507003,0.0,16.618030,0.000000,185,0,0,0,99,0,0,0,1,0,0,0,1,0,0,0,74.420565,-1.0,68.157042,0.0,0,1,47,1,0,0,0,0,23291.42,1,178.10,2,1,1,0,0,0,0,0
50230,8,1017,1489,9,0,0,0,2013-02-16,2013-04-05,"5969,48167,43567,13440,1996,6538,51029,48640,1...",4,1,0.0,2,0,3.0,1,31,1,1,3,0,1558,-1,0,0.711382,0.0,0.002604,0.000000,22.507003,0.0,24.645294,0.000000,404,0,0,0,205,0,0,0,1,0,0,0,1,0,0,0,146.231028,-1.0,47.515457,0.0,0,1,48,1,1,0,0,0,67797.49,2,265.01,3,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128590,8,71,1523,128587,0,0,0,2013-02-16,2013-04-05,1702916573015512762,6,1,0.0,2,0,1.0,1,3,1,1,3,0,56,-1,1,0.711382,0.0,0.264804,0.000000,22.507003,0.0,18.967506,0.000000,173,0,50,0,91,0,25,0,1,0,1,0,1,0,1,0,50.601550,-1.0,89.835000,0.0,1,1,48,1,0,0,0,0,53058.23,1,0.00,0,1,1,1,0,0,0,1
128591,30,547,937,128589,0,0,0,2012-11-19,2013-01-04,259618777530614238643044331839336,6,1,0.0,2,0,2.0,1,6,1,1,3,0,2188,2179,0,0.711382,0.0,0.000989,0.000000,22.507003,0.0,10.752388,49.284644,165,0,0,0,84,0,0,0,1,0,0,0,1,0,0,0,97.851208,-1.0,61.067310,0.0,0,1,46,1,0,0,0,0,22570.47,1,89.05,2,1,1,0,0,0,0,0
128592,8,754,1004,128590,0,0,0,2013-02-16,2013-04-05,"70460,70534,9060,27164,58588,20849,5608,24948,...",-1,-1,-1.0,-1,-1,-1.0,0,82,1,1,3,0,864,-1,0,0.711382,0.0,0.001802,0.000000,22.507003,0.0,61.623552,0.000000,1324,0,2,0,725,0,2,0,1,0,1,0,1,0,1,0,118.686482,-1.0,68.054327,0.0,0,1,48,1,1,0,0,0,195328.75,4,260.03,3,1,1,0,0,0,0,0
128593,13,134,71,128592,0,0,0,2013-05-19,2013-07-05,622191357387764114832563624574047,3,1,0.0,5,3,4.0,1,6,1,1,3,0,11,56,1,0.711382,0.0,0.000000,0.264804,22.507003,0.0,14.411116,18.967506,788,0,1,101,589,0,0,57,1,0,1,1,1,0,0,1,51.438845,-1.0,57.893748,0.0,1,1,47,1,0,0,0,0,81141.98,2,1325.04,4,1,1,0,0,1,0,1


In [4]:
test.shape

(50226, 72)

In [5]:
columns_with_other_dtype = merged_data.select_dtypes(include=['object', 'datetime64[ns]']).columns
columns_with_numeric_dtype = merged_data.select_dtypes(exclude='object').columns
columns_with_numeric_dtype



Index(['campaign_id', 'coupon_id', 'customer_id', 'id', 'is_test',
       'redemption_status', 'campaign_type', 'customer_age_range',
       'customer_marital_status', 'is_customer_rented', 'customer_family_size',
       'customer_no_of_children', 'customer_income_bracket',
       'is_common_demographic_info_available_for_customer',
       'no_of_items_coupon_is_available_for',
       'is_coupon_available_for_multiple_item_ids',
       'no_of_categories_of_items_available_under_coupon',
       'coupon_available_on_category_1', 'coupon_available_on_category_2',
       'coupon_available_on_brand_1', 'coupon_available_on_brand_2',
       'coupon_more_focussed_on_brand_type', 'popularity_of_category1',
       'popularity_of_category2', 'popularity_of_brand1',
       'popularity_of_brand2', 'mean_discount_on_category1',
       'mean_discount_on_category2', 'mean_discount_on_brand1',
       'mean_discount_on_brand2', 'no_of_times_customer_bought_category1',
       'no_of_times_customer_bough

In [6]:
categorical_columns = columns_with_other_dtype.drop(labels=['campaign_start_date', 'campaign_end_date', 'coupon_available_on_item_ids'])
numerical_columns = columns_with_numeric_dtype.drop(labels=['coupon_id', 'customer_id', 'id'])
features = numerical_columns.append(categorical_columns)
features

Index(['campaign_id', 'is_test', 'redemption_status', 'campaign_type',
       'customer_age_range', 'customer_marital_status', 'is_customer_rented',
       'customer_family_size', 'customer_no_of_children',
       'customer_income_bracket',
       'is_common_demographic_info_available_for_customer',
       'no_of_items_coupon_is_available_for',
       'is_coupon_available_for_multiple_item_ids',
       'no_of_categories_of_items_available_under_coupon',
       'coupon_available_on_category_1', 'coupon_available_on_category_2',
       'coupon_available_on_brand_1', 'coupon_available_on_brand_2',
       'coupon_more_focussed_on_brand_type', 'popularity_of_category1',
       'popularity_of_category2', 'popularity_of_brand1',
       'popularity_of_brand2', 'mean_discount_on_category1',
       'mean_discount_on_category2', 'mean_discount_on_brand1',
       'mean_discount_on_brand2', 'no_of_times_customer_bought_category1',
       'no_of_times_customer_bought_category2',
       'no_of_times_

In [7]:
features = features.drop(labels=['is_test', 'redemption_status', 'campaign_id'])

In [8]:
X = train[features]
Y = train['redemption_status']

skf = StratifiedKFold(n_splits=3)

In [9]:
X

Unnamed: 0,campaign_type,customer_age_range,customer_marital_status,is_customer_rented,customer_family_size,customer_no_of_children,customer_income_bracket,is_common_demographic_info_available_for_customer,no_of_items_coupon_is_available_for,is_coupon_available_for_multiple_item_ids,no_of_categories_of_items_available_under_coupon,coupon_available_on_category_1,coupon_available_on_category_2,coupon_available_on_brand_1,coupon_available_on_brand_2,coupon_more_focussed_on_brand_type,popularity_of_category1,popularity_of_category2,popularity_of_brand1,popularity_of_brand2,mean_discount_on_category1,mean_discount_on_category2,mean_discount_on_brand1,mean_discount_on_brand2,no_of_times_customer_bought_category1,no_of_times_customer_bought_category2,no_of_times_customer_bought_brand1,no_of_times_customer_bought_brand2,no_of_times_customer_bought_category1_in_discount,no_of_times_customer_bought_category2_in_discount,no_of_times_customer_bought_brand1_in_discount,no_of_times_customer_bought_brand2_in_discount,has_customer_ever_bought_category1,has_customer_ever_bought_category2,has_customer_ever_bought_brand1,has_customer_ever_bought_brand2,has_customer_ever_bought_category1_in_discount,has_customer_ever_bought_category2_in_discount,has_customer_ever_bought_brand1_in_discount,has_customer_ever_bought_brand2_in_discount,average_selling_price_per_unit_of_category1_brand1_combination,average_selling_price_per_unit_of_category2_brand2_combination,average_transaction_amount_for_customer_per_item_of_category1,average_transaction_amount_for_customer_per_item_of_category2,is_customers_spending_habit_on_category1_in_favour_of_category1_brand1_combo,is_customers_spending_habit_on_category1_in_favour_of_category2_brand2_combo,campaign_duration_in_days,was_user_active_during_campaign_period,does_user_significantly_prefer_focussed_brand_type,does_user_prefer_category1_from_brand1,does_user_prefer_category1_from_brand2,does_user_prefer_any_of_brand1_brand2_for_category1,total_spends_of_customer,total_spends_category_of_customer,total_coupon_discount_claimed_by_customer,coupon_claiming_category_of_customer,most_bought_category_of_user,is_coupon_category1_same_as_customer_most_bought_category,is_brand1_same_as_most_favourite_brand_of_user_in_category1,is_brand2_same_as_second_most_favourite_brand_of_user_in_category1,is_brand2_same_as_most_favourite_brand_of_user_in_category1,is_brand1_same_as_second_most_favourite_brand_of_user_in_category1,are_the_brands_customers_favourite_brands_for_category1
50226,0,4,0,0.0,1,0,5.0,1,124,1,1,3,0,1105,1636,0,0.711382,0.0,0.003298,0.001451,22.507003,0.0,61.319363,70.109160,214,0,0,0,103,0,0,0,1,0,0,0,1,0,0,0,75.475914,-1.0,68.848024,0.0,0,1,47,1,0,0,0,0,57120.75,1,89.05,2,1,1,0,0,0,0,0
50227,0,3,1,0.0,2,0,3.0,1,2,1,1,3,0,56,-1,1,0.711382,0.0,0.264804,0.000000,22.507003,0.0,18.967506,0.000000,238,0,91,0,160,0,59,0,1,0,1,0,1,0,1,0,50.601550,-1.0,63.827453,0.0,1,1,47,1,0,1,0,1,90185.10,2,1237.79,4,1,1,1,0,0,0,1
50228,1,4,1,0.0,2,0,7.0,1,66,1,1,6,0,560,-1,0,0.117204,0.0,0.000129,0.000000,11.490816,0.0,47.960747,0.000000,86,0,0,0,23,0,0,0,1,0,0,0,1,0,0,0,233.863843,-1.0,127.284608,0.0,0,1,32,1,1,0,0,0,117461.66,3,2145.72,5,1,0,0,0,0,0,0
50229,0,-1,-1,-1.0,-1,-1,-1.0,0,3,1,1,3,0,611,-1,0,0.711382,0.0,0.000390,0.000000,22.507003,0.0,16.618030,0.000000,185,0,0,0,99,0,0,0,1,0,0,0,1,0,0,0,74.420565,-1.0,68.157042,0.0,0,1,47,1,0,0,0,0,23291.42,1,178.10,2,1,1,0,0,0,0,0
50230,0,4,1,0.0,2,0,3.0,1,31,1,1,3,0,1558,-1,0,0.711382,0.0,0.002604,0.000000,22.507003,0.0,24.645294,0.000000,404,0,0,0,205,0,0,0,1,0,0,0,1,0,0,0,146.231028,-1.0,47.515457,0.0,0,1,48,1,1,0,0,0,67797.49,2,265.01,3,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128590,0,6,1,0.0,2,0,1.0,1,3,1,1,3,0,56,-1,1,0.711382,0.0,0.264804,0.000000,22.507003,0.0,18.967506,0.000000,173,0,50,0,91,0,25,0,1,0,1,0,1,0,1,0,50.601550,-1.0,89.835000,0.0,1,1,48,1,0,0,0,0,53058.23,1,0.00,0,1,1,1,0,0,0,1
128591,0,6,1,0.0,2,0,2.0,1,6,1,1,3,0,2188,2179,0,0.711382,0.0,0.000989,0.000000,22.507003,0.0,10.752388,49.284644,165,0,0,0,84,0,0,0,1,0,0,0,1,0,0,0,97.851208,-1.0,61.067310,0.0,0,1,46,1,0,0,0,0,22570.47,1,89.05,2,1,1,0,0,0,0,0
128592,0,-1,-1,-1.0,-1,-1,-1.0,0,82,1,1,3,0,864,-1,0,0.711382,0.0,0.001802,0.000000,22.507003,0.0,61.623552,0.000000,1324,0,2,0,725,0,2,0,1,0,1,0,1,0,1,0,118.686482,-1.0,68.054327,0.0,0,1,48,1,1,0,0,0,195328.75,4,260.03,3,1,1,0,0,0,0,0
128593,0,3,1,0.0,5,3,4.0,1,6,1,1,3,0,11,56,1,0.711382,0.0,0.000000,0.264804,22.507003,0.0,14.411116,18.967506,788,0,1,101,589,0,0,57,1,0,1,1,1,0,0,1,51.438845,-1.0,57.893748,0.0,1,1,47,1,0,0,0,0,81141.98,2,1325.04,4,1,1,0,0,1,0,1


In [10]:
meta_df = pd.DataFrame({'true_values': Y})
meta_df.reset_index(inplace=True)
meta_df

Unnamed: 0,index,true_values
0,50226,0
1,50227,0
2,50228,0
3,50229,0
4,50230,0
...,...,...
78364,128590,0
78365,128591,0
78366,128592,0
78367,128593,0


In [11]:
X

Unnamed: 0,campaign_type,customer_age_range,customer_marital_status,is_customer_rented,customer_family_size,customer_no_of_children,customer_income_bracket,is_common_demographic_info_available_for_customer,no_of_items_coupon_is_available_for,is_coupon_available_for_multiple_item_ids,no_of_categories_of_items_available_under_coupon,coupon_available_on_category_1,coupon_available_on_category_2,coupon_available_on_brand_1,coupon_available_on_brand_2,coupon_more_focussed_on_brand_type,popularity_of_category1,popularity_of_category2,popularity_of_brand1,popularity_of_brand2,mean_discount_on_category1,mean_discount_on_category2,mean_discount_on_brand1,mean_discount_on_brand2,no_of_times_customer_bought_category1,no_of_times_customer_bought_category2,no_of_times_customer_bought_brand1,no_of_times_customer_bought_brand2,no_of_times_customer_bought_category1_in_discount,no_of_times_customer_bought_category2_in_discount,no_of_times_customer_bought_brand1_in_discount,no_of_times_customer_bought_brand2_in_discount,has_customer_ever_bought_category1,has_customer_ever_bought_category2,has_customer_ever_bought_brand1,has_customer_ever_bought_brand2,has_customer_ever_bought_category1_in_discount,has_customer_ever_bought_category2_in_discount,has_customer_ever_bought_brand1_in_discount,has_customer_ever_bought_brand2_in_discount,average_selling_price_per_unit_of_category1_brand1_combination,average_selling_price_per_unit_of_category2_brand2_combination,average_transaction_amount_for_customer_per_item_of_category1,average_transaction_amount_for_customer_per_item_of_category2,is_customers_spending_habit_on_category1_in_favour_of_category1_brand1_combo,is_customers_spending_habit_on_category1_in_favour_of_category2_brand2_combo,campaign_duration_in_days,was_user_active_during_campaign_period,does_user_significantly_prefer_focussed_brand_type,does_user_prefer_category1_from_brand1,does_user_prefer_category1_from_brand2,does_user_prefer_any_of_brand1_brand2_for_category1,total_spends_of_customer,total_spends_category_of_customer,total_coupon_discount_claimed_by_customer,coupon_claiming_category_of_customer,most_bought_category_of_user,is_coupon_category1_same_as_customer_most_bought_category,is_brand1_same_as_most_favourite_brand_of_user_in_category1,is_brand2_same_as_second_most_favourite_brand_of_user_in_category1,is_brand2_same_as_most_favourite_brand_of_user_in_category1,is_brand1_same_as_second_most_favourite_brand_of_user_in_category1,are_the_brands_customers_favourite_brands_for_category1
50226,0,4,0,0.0,1,0,5.0,1,124,1,1,3,0,1105,1636,0,0.711382,0.0,0.003298,0.001451,22.507003,0.0,61.319363,70.109160,214,0,0,0,103,0,0,0,1,0,0,0,1,0,0,0,75.475914,-1.0,68.848024,0.0,0,1,47,1,0,0,0,0,57120.75,1,89.05,2,1,1,0,0,0,0,0
50227,0,3,1,0.0,2,0,3.0,1,2,1,1,3,0,56,-1,1,0.711382,0.0,0.264804,0.000000,22.507003,0.0,18.967506,0.000000,238,0,91,0,160,0,59,0,1,0,1,0,1,0,1,0,50.601550,-1.0,63.827453,0.0,1,1,47,1,0,1,0,1,90185.10,2,1237.79,4,1,1,1,0,0,0,1
50228,1,4,1,0.0,2,0,7.0,1,66,1,1,6,0,560,-1,0,0.117204,0.0,0.000129,0.000000,11.490816,0.0,47.960747,0.000000,86,0,0,0,23,0,0,0,1,0,0,0,1,0,0,0,233.863843,-1.0,127.284608,0.0,0,1,32,1,1,0,0,0,117461.66,3,2145.72,5,1,0,0,0,0,0,0
50229,0,-1,-1,-1.0,-1,-1,-1.0,0,3,1,1,3,0,611,-1,0,0.711382,0.0,0.000390,0.000000,22.507003,0.0,16.618030,0.000000,185,0,0,0,99,0,0,0,1,0,0,0,1,0,0,0,74.420565,-1.0,68.157042,0.0,0,1,47,1,0,0,0,0,23291.42,1,178.10,2,1,1,0,0,0,0,0
50230,0,4,1,0.0,2,0,3.0,1,31,1,1,3,0,1558,-1,0,0.711382,0.0,0.002604,0.000000,22.507003,0.0,24.645294,0.000000,404,0,0,0,205,0,0,0,1,0,0,0,1,0,0,0,146.231028,-1.0,47.515457,0.0,0,1,48,1,1,0,0,0,67797.49,2,265.01,3,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128590,0,6,1,0.0,2,0,1.0,1,3,1,1,3,0,56,-1,1,0.711382,0.0,0.264804,0.000000,22.507003,0.0,18.967506,0.000000,173,0,50,0,91,0,25,0,1,0,1,0,1,0,1,0,50.601550,-1.0,89.835000,0.0,1,1,48,1,0,0,0,0,53058.23,1,0.00,0,1,1,1,0,0,0,1
128591,0,6,1,0.0,2,0,2.0,1,6,1,1,3,0,2188,2179,0,0.711382,0.0,0.000989,0.000000,22.507003,0.0,10.752388,49.284644,165,0,0,0,84,0,0,0,1,0,0,0,1,0,0,0,97.851208,-1.0,61.067310,0.0,0,1,46,1,0,0,0,0,22570.47,1,89.05,2,1,1,0,0,0,0,0
128592,0,-1,-1,-1.0,-1,-1,-1.0,0,82,1,1,3,0,864,-1,0,0.711382,0.0,0.001802,0.000000,22.507003,0.0,61.623552,0.000000,1324,0,2,0,725,0,2,0,1,0,1,0,1,0,1,0,118.686482,-1.0,68.054327,0.0,0,1,48,1,1,0,0,0,195328.75,4,260.03,3,1,1,0,0,0,0,0
128593,0,3,1,0.0,5,3,4.0,1,6,1,1,3,0,11,56,1,0.711382,0.0,0.000000,0.264804,22.507003,0.0,14.411116,18.967506,788,0,1,101,589,0,0,57,1,0,1,1,1,0,0,1,51.438845,-1.0,57.893748,0.0,1,1,47,1,0,0,0,0,81141.98,2,1325.04,4,1,1,0,0,1,0,1


In [12]:
test

Unnamed: 0,campaign_id,coupon_id,customer_id,id,is_test,redemption_status,campaign_type,campaign_start_date,campaign_end_date,coupon_available_on_item_ids,customer_age_range,customer_marital_status,is_customer_rented,customer_family_size,customer_no_of_children,customer_income_bracket,is_common_demographic_info_available_for_customer,no_of_items_coupon_is_available_for,is_coupon_available_for_multiple_item_ids,no_of_categories_of_items_available_under_coupon,coupon_available_on_category_1,coupon_available_on_category_2,coupon_available_on_brand_1,coupon_available_on_brand_2,coupon_more_focussed_on_brand_type,popularity_of_category1,popularity_of_category2,popularity_of_brand1,popularity_of_brand2,mean_discount_on_category1,mean_discount_on_category2,mean_discount_on_brand1,mean_discount_on_brand2,no_of_times_customer_bought_category1,no_of_times_customer_bought_category2,no_of_times_customer_bought_brand1,no_of_times_customer_bought_brand2,no_of_times_customer_bought_category1_in_discount,no_of_times_customer_bought_category2_in_discount,no_of_times_customer_bought_brand1_in_discount,no_of_times_customer_bought_brand2_in_discount,has_customer_ever_bought_category1,has_customer_ever_bought_category2,has_customer_ever_bought_brand1,has_customer_ever_bought_brand2,has_customer_ever_bought_category1_in_discount,has_customer_ever_bought_category2_in_discount,has_customer_ever_bought_brand1_in_discount,has_customer_ever_bought_brand2_in_discount,average_selling_price_per_unit_of_category1_brand1_combination,average_selling_price_per_unit_of_category2_brand2_combination,average_transaction_amount_for_customer_per_item_of_category1,average_transaction_amount_for_customer_per_item_of_category2,is_customers_spending_habit_on_category1_in_favour_of_category1_brand1_combo,is_customers_spending_habit_on_category1_in_favour_of_category2_brand2_combo,campaign_duration_in_days,was_user_active_during_campaign_period,does_user_significantly_prefer_focussed_brand_type,does_user_prefer_category1_from_brand1,does_user_prefer_category1_from_brand2,does_user_prefer_any_of_brand1_brand2_for_category1,total_spends_of_customer,total_spends_category_of_customer,total_coupon_discount_claimed_by_customer,coupon_claiming_category_of_customer,most_bought_category_of_user,is_coupon_category1_same_as_customer_most_bought_category,is_brand1_same_as_most_favourite_brand_of_user_in_category1,is_brand2_same_as_second_most_favourite_brand_of_user_in_category1,is_brand2_same_as_most_favourite_brand_of_user_in_category1,is_brand1_same_as_second_most_favourite_brand_of_user_in_category1,are_the_brands_customers_favourite_brands_for_category1
0,22,869,967,3,1,,0,2013-09-16,2013-10-18,"1124,1185,39835,46911,3514,1033,45271,56179,40...",3,0,0.0,1,0,5.0,1,71,1,1,3,0,1075,-1,0,0.711382,0.000000,0.003300,0.000000,22.507003,0.000000,37.102000,0.000000,980,0,15,0,543,0,14,0,1,0,1,0,1,0,1,0,108.805335,-1.0,90.982959,0.000000,0,1,32,0,1,0,0,0,156733.77,4,2111.86,5,1,1,0,0,0,0,0
1,20,389,1566,4,1,,1,2013-09-07,2013-11-16,"47705,45063,19431,56517,1479,68534,56523,1144,...",2,1,0.0,2,0,9.0,1,32,1,1,6,0,57,1009,0,0.117204,0.000000,0.001626,0.000304,11.490816,0.000000,17.556861,13.948629,622,0,11,0,123,0,3,0,1,0,1,0,1,0,1,0,134.793144,-1.0,103.423798,0.000000,0,1,70,0,1,0,0,0,297070.87,5,1489.97,4,1,0,0,0,0,0,0
2,22,981,510,5,1,,0,2013-09-16,2013-10-18,"1690,7061,58004,22631,6577,2398,41769,21773,33...",2,1,0.0,2,1,1.0,1,18,1,1,3,0,1335,-1,0,0.711382,0.000000,0.000987,0.000000,22.507003,0.000000,17.113015,0.000000,1236,0,0,0,700,0,0,0,1,0,0,0,1,0,0,0,79.543017,-1.0,55.211986,0.000000,0,1,32,0,1,0,0,0,161811.27,4,2384.75,5,1,1,0,0,0,0,0
3,25,1069,361,8,1,,1,2013-10-21,2013-11-22,"10284,70009,57994,11281,44510,35356,29737,6994...",1,0,0.0,1,0,3.0,1,73,1,1,3,0,1996,-1,0,0.711382,0.000000,0.002861,0.000000,22.507003,0.000000,58.302656,0.000000,454,0,0,0,204,0,0,0,1,0,0,0,1,0,0,0,99.960960,-1.0,58.620148,0.000000,0,1,32,0,0,0,0,0,49149.16,1,0.00,0,1,1,0,0,0,0,0
4,17,498,811,10,1,,1,2013-07-29,2013-08-30,"45021,54805,14259,37463,48668,16481,4078,19852...",-1,-1,-1.0,-1,-1,-1.0,0,17,1,1,3,0,209,-1,0,0.711382,0.000000,0.006342,0.000000,22.507003,0.000000,21.023715,0.000000,994,0,8,0,484,0,5,0,1,0,1,0,1,0,1,0,68.246478,-1.0,52.986002,0.000000,0,1,32,0,0,0,0,0,117044.09,3,89.04,2,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50221,20,843,501,128584,1,,1,2013-09-07,2013-11-16,6609466031658956590573540658946607865854,4,0,0.0,1,0,5.0,1,7,1,1,3,0,1009,57,0,0.711382,0.000000,0.000304,0.001626,22.507003,0.000000,13.948629,17.556861,1219,0,6,7,840,0,5,5,1,0,1,1,1,0,1,1,141.539074,-1.0,61.246401,0.000000,0,1,70,0,1,0,0,0,154702.14,4,241.85,3,1,1,0,0,0,0,0
50222,25,415,481,128588,1,,1,2013-10-21,2013-11-22,"53917,38541,53519,44431,47604,37040,23296,1619...",3,1,1.0,4,2,2.0,1,106,1,1,3,0,89,-1,0,0.711382,0.000000,0.017482,0.000000,22.507003,0.000000,38.099900,0.000000,1600,0,79,0,659,0,73,0,1,0,1,0,1,0,1,0,82.935282,-1.0,44.249305,0.000000,0,1,32,0,0,0,0,0,111537.83,3,1354.98,4,1,1,0,1,0,1,1
50223,25,596,1336,128591,1,,1,2013-10-21,2013-11-22,"3556,48890,42515,105,44440,1001,36451,46855,68...",3,1,0.0,5,3,9.0,1,68,1,2,3,1,487,-1,0,0.711382,0.015450,0.023221,0.000000,22.507003,15.867442,14.869981,0.000000,1573,3,87,0,874,2,38,0,1,1,1,0,1,1,1,0,61.157145,-1.0,80.561195,112.676667,1,1,32,0,1,0,0,0,331724.69,5,5195.86,5,1,1,0,1,0,1,1
50224,22,518,748,128593,1,,0,2013-09-16,2013-10-18,"34936,44407,42927,3264,3544,979,65282,23340,14...",4,1,0.0,3,1,8.0,1,69,1,1,3,0,278,-1,0,0.711382,0.000000,0.018100,0.000000,22.507003,0.000000,24.820366,0.000000,1810,0,24,0,1159,0,16,0,1,0,1,0,1,0,1,0,72.980151,-1.0,71.128754,0.000000,0,1,32,0,1,0,0,0,308723.95,5,10752.81,5,1,1,0,0,0,0,0


# Training the final XGB

In [13]:
%%time

param_grid = {
    'n_estimators': [2000],
    'learning_rate': [0.3],
    'max_depth' : [3],
    'min_child_weight' : [0.5]
    
}
xgb = XGBClassifier(objective='binary:logistic', nthreads= -1, seed=27, verbosity=2)

clf = GridSearchCV(xgb, param_grid=param_grid, cv=skf, n_jobs=-1, verbose=3, scoring='roc_auc').fit(X, Y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] learning_rate=0.3, max_depth=3, min_child_weight=0.5, n_estimators=2000 
[CV] learning_rate=0.3, max_depth=3, min_child_weight=0.5, n_estimators=2000 
[CV] learning_rate=0.3, max_depth=3, min_child_weight=0.5, n_estimators=2000 
[CV]  learning_rate=0.3, max_depth=3, min_child_weight=0.5, n_estimators=2000, score=0.9655686899332786, total=312.5min
[CV]  learning_rate=0.3, max_depth=3, min_child_weight=0.5, n_estimators=2000, score=0.9670133124709803, total=312.5min
[CV]  learning_rate=0.3, max_depth=3, min_child_weight=0.5, n_estimators=2000, score=0.9656576570559914, total=312.5min


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 312.7min finished


CPU times: user 8min 46s, sys: 1.44 s, total: 8min 48s
Wall time: 16h 8min 38s


In [14]:
roc_auc_score( Y.values,clf.predict_proba((X))[:,1])

0.9999996289723807

In [15]:
sum(clf.predict(test[features]))

104

### Saving the submission

In [16]:
c='final'
test['redemption_status'] =clf.predict_proba(test[features])[:,1]
submission = test.loc[:,['id', 'redemption_status']]
name = "../data/submissions/submission_after_modeling_xgb"+str(c)+".csv"
submission.to_csv( name,index=False)
print("File saved!")

meta_df['xgb']= pd.Series(clf.predict_proba(X)[:,1])
meta_df

File saved!


Unnamed: 0,index,true_values,xgb
0,50226,0,2.122677e-11
1,50227,0,5.017998e-08
2,50228,0,9.317077e-07
3,50229,0,2.148611e-08
4,50230,0,9.347421e-10
...,...,...,...
78364,128590,0,6.275490e-10
78365,128591,0,1.272463e-10
78366,128592,0,5.492687e-05
78367,128593,0,2.759433e-07


# Training the final LGBM model

In [17]:
%%time


param_grid = {
    'n_estimators': [4000],
    'learning_rate': [0.2],
    'max_depth': [5],
    'min_data_in_leaf' : [5],
    'num_leaves': [30],
    'feature_fraction': [0.3],
    'drop_rate': [0.2]
    
}
lgbm = LGB.LGBMClassifier(objective='binary', nthreads=-1, seed=27, verbosity=2, is_imbalance=True, metric='auc')

clf = GridSearchCV(lgbm, param_grid=param_grid, cv=skf, n_jobs=-1, verbose=3, scoring='roc_auc').fit(X, Y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] drop_rate=0.2, feature_fraction=0.3, learning_rate=0.2, max_depth=5, min_data_in_leaf=5, n_estimators=4000, num_leaves=30 
[CV] drop_rate=0.2, feature_fraction=0.3, learning_rate=0.2, max_depth=5, min_data_in_leaf=5, n_estimators=4000, num_leaves=30 
[CV] drop_rate=0.2, feature_fraction=0.3, learning_rate=0.2, max_depth=5, min_data_in_leaf=5, n_estimators=4000, num_leaves=30 
[CV]  drop_rate=0.2, feature_fraction=0.3, learning_rate=0.2, max_depth=5, min_data_in_leaf=5, n_estimators=4000, num_leaves=30, score=0.9673125727479154, total= 6.1min
[CV]  drop_rate=0.2, feature_fraction=0.3, learning_rate=0.2, max_depth=5, min_data_in_leaf=5, n_estimators=4000, num_leaves=30, score=0.9623412266809142, total= 6.5min
[CV]  drop_rate=0.2, feature_fraction=0.3, learning_rate=0.2, max_depth=5, min_data_in_leaf=5, n_estimators=4000, num_leaves=30, score=0.963556395138054, total= 6.7min


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  7.9min finished


CPU times: user 54.2 s, sys: 920 ms, total: 55.1 s
Wall time: 8min 54s


In [18]:
roc_auc_score( Y.values,clf.predict_proba((X))[:,1])

0.9999996819763263

In [19]:
sum(clf.predict(test[features]))

56

### Saving the submission

In [20]:
c='final'
test['redemption_status'] =clf.predict_proba(test[features])[:,1]
submission = test.loc[:,['id', 'redemption_status']]
name = "../data/submissions/submission_after_modeling_lgb_"+str(c)+".csv"
submission.to_csv( name,index=False)
print("File saved!")
test['redemption_status']

meta_df['lgbm']= pd.Series(clf.predict_proba(X)[:,1])
meta_df

File saved!


Unnamed: 0,index,true_values,xgb,lgbm
0,50226,0,2.122677e-11,9.728650e-21
1,50227,0,5.017998e-08,4.032008e-13
2,50228,0,9.317077e-07,2.834561e-11
3,50229,0,2.148611e-08,5.806925e-14
4,50230,0,9.347421e-10,6.840081e-20
...,...,...,...,...
78364,128590,0,6.275490e-10,5.759423e-19
78365,128591,0,1.272463e-10,6.497514e-23
78366,128592,0,5.492687e-05,1.293827e-09
78367,128593,0,2.759433e-07,1.129985e-15


# Training all weak models

### Training Log-reg

In [21]:
%%time
param_grid = {
    'C' : [1],
    'max_iter' : [500],
    'penalty': ['l1']
}

lr = LogisticRegression(verbose=2, random_state=27, class_weight='balanced')

clf = GridSearchCV(lr, param_grid=param_grid, cv=skf, n_jobs=-1, verbose=3, scoring='roc_auc').fit(X, Y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] C=1, max_iter=500, penalty=l1 ...................................
[CV] C=1, max_iter=500, penalty=l1 ...................................
[CV] C=1, max_iter=500, penalty=l1 ...................................
[LibLinear][CV]  C=1, max_iter=500, penalty=l1, score=0.9466574121777624, total= 3.2min
[LibLinear][CV]  C=1, max_iter=500, penalty=l1, score=0.94301556407859, total= 3.4min
[LibLinear][CV]  C=1, max_iter=500, penalty=l1, score=0.9489743736523746, total= 5.3min


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  5.3min finished


[LibLinear]CPU times: user 5min 58s, sys: 1.2 s, total: 5min 59s
Wall time: 11min 34s


In [22]:
roc_auc_score( Y.values,clf.predict_proba((X))[:,1])

0.9528923723081947

In [23]:
sum(clf.predict(test[features]))

211

### Saving the submission

In [24]:
c='final'
test['redemption_status'] =clf.predict_proba(test[features])[:,1]
submission = test.loc[:,['id', 'redemption_status']]
name = "../data/submissions/submission_after_modeling_log_reg_"+str(c)+".csv"
submission.to_csv( name,index=False)
print("File saved!")
test['redemption_status']

meta_df['log_reg']= pd.Series(clf.predict_proba(X)[:,1])
meta_df

File saved!


Unnamed: 0,index,true_values,xgb,lgbm,log_reg
0,50226,0,2.122677e-11,9.728650e-21,0.000048
1,50227,0,5.017998e-08,4.032008e-13,0.169548
2,50228,0,9.317077e-07,2.834561e-11,0.000273
3,50229,0,2.148611e-08,5.806925e-14,0.000062
4,50230,0,9.347421e-10,6.840081e-20,0.000271
...,...,...,...,...,...
78364,128590,0,6.275490e-10,5.759423e-19,0.001113
78365,128591,0,1.272463e-10,6.497514e-23,0.000057
78366,128592,0,5.492687e-05,1.293827e-09,0.753502
78367,128593,0,2.759433e-07,1.129985e-15,0.000125


### Training BernoulliNB

In [25]:
%%time
param_grid = {
    'alpha' : [0.001]
}

bnb = BernoulliNB()

clf = GridSearchCV(bnb, param_grid=param_grid, cv=skf, n_jobs=-1, verbose=3, scoring='roc_auc').fit(X, Y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] alpha=0.001 .....................................................
[CV] ............ alpha=0.001, score=0.9002439241577143, total=   0.6s
[CV] ............ alpha=0.001, score=0.8992504182011309, total=   0.6s
[CV] ............ alpha=0.001, score=0.8795811310193931, total=   0.7s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


CPU times: user 505 ms, sys: 197 ms, total: 702 ms
Wall time: 1.87 s


In [26]:
roc_auc_score( Y.values,clf.predict_proba((X))[:,1])

0.8945174750475092

In [27]:
sum(clf.predict(test[features]))

631

### Saving the submission

In [28]:
c='final'
test['redemption_status'] =clf.predict_proba(test[features])[:,1]
submission = test.loc[:,['id', 'redemption_status']]
name = "../data/submissions/submission_after_modeling_BernoulliNB_"+str(c)+".csv"
submission.to_csv( name,index=False)
print("File saved!")
test['redemption_status']

meta_df['BernoulliNB']= pd.Series(clf.predict_proba(X)[:,1])
meta_df

File saved!


Unnamed: 0,index,true_values,xgb,lgbm,log_reg,BernoulliNB
0,50226,0,2.122677e-11,9.728650e-21,0.000048,3.790410e-17
1,50227,0,5.017998e-08,4.032008e-13,0.169548,2.284327e-03
2,50228,0,9.317077e-07,2.834561e-11,0.000273,3.235746e-18
3,50229,0,2.148611e-08,5.806925e-14,0.000062,5.998107e-20
4,50230,0,9.347421e-10,6.840081e-20,0.000271,2.058993e-17
...,...,...,...,...,...,...
78364,128590,0,6.275490e-10,5.759423e-19,0.001113,4.970198e-13
78365,128591,0,1.272463e-10,6.497514e-23,0.000057,2.382794e-17
78366,128592,0,5.492687e-05,1.293827e-09,0.753502,5.622530e-03
78367,128593,0,2.759433e-07,1.129985e-15,0.000125,3.087308e-05


### Training RF

In [29]:
%%time
param_grid = {
    'n_estimators' : [1500]
}

rf = RandomForestClassifier(oob_score =True,n_jobs = 1,random_state =1)

clf = GridSearchCV(rf, param_grid=param_grid, cv=skf, n_jobs=-1, verbose=3, scoring='roc_auc').fit(X, Y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_estimators=1500 ...............................................
[CV] n_estimators=1500 ...............................................
[CV] n_estimators=1500 ...............................................
[CV] ....... n_estimators=1500, score=0.961961029379027, total= 3.3min
[CV] ......... n_estimators=1500, score=0.9646585061792, total= 3.3min
[CV] ...... n_estimators=1500, score=0.9718708855687217, total= 3.4min


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.7min finished


CPU times: user 2min 52s, sys: 4.07 s, total: 2min 56s
Wall time: 6min 42s


In [30]:
roc_auc_score( Y.values,clf.predict_proba((X))[:,1])

0.9999996819763264

In [31]:
sum(clf.predict(test[features]))

91

### Saving the submission

In [32]:
c='final'
test['redemption_status'] =clf.predict_proba(test[features])[:,1]
submission = test.loc[:,['id', 'redemption_status']]
name = "../data/submissions/submission_after_modeling_RF_"+str(c)+".csv"
submission.to_csv( name,index=False)
print("File saved!")
test['redemption_status']

meta_df['RF']= pd.Series(clf.predict_proba(X)[:,1])
meta_df

File saved!


Unnamed: 0,index,true_values,xgb,lgbm,log_reg,BernoulliNB,RF
0,50226,0,2.122677e-11,9.728650e-21,0.000048,3.790410e-17,0.000000
1,50227,0,5.017998e-08,4.032008e-13,0.169548,2.284327e-03,0.000000
2,50228,0,9.317077e-07,2.834561e-11,0.000273,3.235746e-18,0.000000
3,50229,0,2.148611e-08,5.806925e-14,0.000062,5.998107e-20,0.000000
4,50230,0,9.347421e-10,6.840081e-20,0.000271,2.058993e-17,0.000000
...,...,...,...,...,...,...,...
78364,128590,0,6.275490e-10,5.759423e-19,0.001113,4.970198e-13,0.000000
78365,128591,0,1.272463e-10,6.497514e-23,0.000057,2.382794e-17,0.000000
78366,128592,0,5.492687e-05,1.293827e-09,0.753502,5.622530e-03,0.015333
78367,128593,0,2.759433e-07,1.129985e-15,0.000125,3.087308e-05,0.000667


### Training Extra Trees

In [33]:
%%time
param_grid = {
    'n_estimators' : [1500],
    'max_features' : [9]
    
}

etc = ExtraTreesClassifier(oob_score =True, bootstrap=True,n_jobs = 1,random_state =1)

clf = GridSearchCV(etc, param_grid=param_grid, cv=skf, n_jobs=-1, verbose=3, scoring='roc_auc').fit(X, Y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] max_features=9, n_estimators=1500 ...............................
[CV] max_features=9, n_estimators=1500 ...............................
[CV] max_features=9, n_estimators=1500 ...............................
[CV]  max_features=9, n_estimators=1500, score=0.9633317114125978, total= 3.0min
[CV]  max_features=9, n_estimators=1500, score=0.9606239783489482, total= 3.0min
[CV]  max_features=9, n_estimators=1500, score=0.9629343408323315, total= 3.1min


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.5min finished


CPU times: user 2min 34s, sys: 4.84 s, total: 2min 38s
Wall time: 6min 40s


In [34]:
roc_auc_score( Y.values,clf.predict_proba((X))[:,1])

0.9999996819763264

In [35]:
sum(clf.predict(test[features]))

83

### Saving the submission

In [36]:
c='final'
test['redemption_status'] =clf.predict_proba(test[features])[:,1]
submission = test.loc[:,['id', 'redemption_status']]
name = "../data/submissions/submission_after_modeling_ExtraTrees_"+str(c)+".csv"
submission.to_csv( name,index=False)
print("File saved!")
test['redemption_status']

meta_df['ExtraTrees']= pd.Series(clf.predict_proba(X)[:,1])
meta_df

File saved!


Unnamed: 0,index,true_values,xgb,lgbm,log_reg,BernoulliNB,RF,ExtraTrees
0,50226,0,2.122677e-11,9.728650e-21,0.000048,3.790410e-17,0.000000,0.000000
1,50227,0,5.017998e-08,4.032008e-13,0.169548,2.284327e-03,0.000000,0.000000
2,50228,0,9.317077e-07,2.834561e-11,0.000273,3.235746e-18,0.000000,0.000000
3,50229,0,2.148611e-08,5.806925e-14,0.000062,5.998107e-20,0.000000,0.000000
4,50230,0,9.347421e-10,6.840081e-20,0.000271,2.058993e-17,0.000000,0.000000
...,...,...,...,...,...,...,...,...
78364,128590,0,6.275490e-10,5.759423e-19,0.001113,4.970198e-13,0.000000,0.000000
78365,128591,0,1.272463e-10,6.497514e-23,0.000057,2.382794e-17,0.000000,0.000000
78366,128592,0,5.492687e-05,1.293827e-09,0.753502,5.622530e-03,0.015333,0.010667
78367,128593,0,2.759433e-07,1.129985e-15,0.000125,3.087308e-05,0.000667,0.002667


# Ensembling all submissions

In [37]:
xgb = pd.read_csv("../data/submissions/submission_after_modeling_xgbfinal.csv")
lgbm = pd.read_csv("../data/submissions/submission_after_modeling_lgb_final.csv")
log_reg = pd.read_csv("../data/submissions/submission_after_modeling_log_reg_final.csv")
BernoulliNB = pd.read_csv("../data/submissions/submission_after_modeling_BernoulliNB_final.csv")
RF = pd.read_csv("../data/submissions/submission_after_modeling_RF_final.csv")
ExtraTrees = pd.read_csv("../data/submissions/submission_after_modeling_ExtraTrees_final.csv")



In [38]:
ensembled = xgb.copy()

In [39]:
ensembled['redemption_status'] = ensembled['redemption_status']*0.25
ensembled['redemption_status'] = ensembled['redemption_status']+lgbm['redemption_status']*0.25+log_reg['redemption_status']*0.1+BernoulliNB['redemption_status']*0.1+RF['redemption_status']*0.15+ExtraTrees['redemption_status']*0.15

In [40]:
ensembled

Unnamed: 0,id,redemption_status
0,3,1.684453e-01
1,4,6.622005e-03
2,5,4.129939e-04
3,8,1.000035e-04
4,10,5.575142e-04
...,...,...
50221,128584,1.258649e-02
50222,128588,3.887333e-02
50223,128591,1.567186e-01
50224,128593,6.683353e-02


### Saving the ensembled solution

In [41]:
c='final'
name = "../data/submissions/submission_after_ensembling_lgb_xgb_weak_learners"+str(c)+".csv"
ensembled.to_csv( name,index=False)
print("File saved!")

File saved!


# Generating a meta-model on top of these

In [42]:
meta_df_X = meta_df[['xgb', 'lgbm', 'log_reg', 'BernoulliNB', 'RF', 'ExtraTrees']]
meta_df_Y = meta_df.true_values

In [43]:
meta_df_X

Unnamed: 0,xgb,lgbm,log_reg,BernoulliNB,RF,ExtraTrees
0,2.122677e-11,9.728650e-21,0.000048,3.790410e-17,0.000000,0.000000
1,5.017998e-08,4.032008e-13,0.169548,2.284327e-03,0.000000,0.000000
2,9.317077e-07,2.834561e-11,0.000273,3.235746e-18,0.000000,0.000000
3,2.148611e-08,5.806925e-14,0.000062,5.998107e-20,0.000000,0.000000
4,9.347421e-10,6.840081e-20,0.000271,2.058993e-17,0.000000,0.000000
...,...,...,...,...,...,...
78364,6.275490e-10,5.759423e-19,0.001113,4.970198e-13,0.000000,0.000000
78365,1.272463e-10,6.497514e-23,0.000057,2.382794e-17,0.000000,0.000000
78366,5.492687e-05,1.293827e-09,0.753502,5.622530e-03,0.015333,0.010667
78367,2.759433e-07,1.129985e-15,0.000125,3.087308e-05,0.000667,0.002667


In [44]:
meta_df_Y

0        0
1        0
2        0
3        0
4        0
        ..
78364    0
78365    0
78366    0
78367    0
78368    0
Name: true_values, Length: 78369, dtype: int64

In [45]:
xgb = pd.read_csv("../data/submissions/submission_after_modeling_xgbfinal.csv")
lgbm = pd.read_csv("../data/submissions/submission_after_modeling_lgb_final.csv")
log_reg = pd.read_csv("../data/submissions/submission_after_modeling_log_reg_final.csv")
BernoulliNB = pd.read_csv("../data/submissions/submission_after_modeling_BernoulliNB_final.csv")
RF = pd.read_csv("../data/submissions/submission_after_modeling_RF_final.csv")
ExtraTrees = pd.read_csv("../data/submissions/submission_after_modeling_ExtraTrees_final.csv")


meta_df_test = xgb.copy()
meta_df_test['xgb'] = xgb['redemption_status']
meta_df_test['lgbm'] = lgbm['redemption_status']

meta_df_test['log_reg'] = log_reg['redemption_status']

meta_df_test['BernoulliNB'] = BernoulliNB['redemption_status']

meta_df_test['RF'] = RF['redemption_status']

meta_df_test['ExtraTrees'] = ExtraTrees['redemption_status']



In [46]:
meta_df_test

Unnamed: 0,id,redemption_status,xgb,lgbm,log_reg,BernoulliNB,RF,ExtraTrees
0,3,1.900340e-01,1.900340e-01,5.127886e-06,5.933143e-01,4.138579e-05,0.214667,0.196000
1,4,1.709778e-06,1.709778e-06,5.068966e-13,1.808893e-02,1.268511e-04,0.025333,0.006667
2,5,1.338428e-06,1.338428e-06,1.806996e-13,1.265925e-04,4.156611e-21,0.002667,0.000000
3,8,3.151632e-12,3.151632e-12,1.203761e-20,3.469336e-08,9.165226e-33,0.000667,0.000000
4,10,1.067086e-07,1.067086e-07,2.091126e-15,3.574779e-03,9.646847e-08,0.000000,0.001333
...,...,...,...,...,...,...,...,...
50221,128584,1.563736e-05,1.563736e-05,1.098843e-13,1.584358e-02,2.698226e-02,0.035667,0.019667
50222,128588,1.066246e-03,1.066246e-03,2.659870e-09,1.088376e-01,2.300842e-04,0.094667,0.090000
50223,128591,2.033639e-04,2.033639e-04,8.762505e-14,3.241256e-01,8.933856e-01,0.152000,0.080778
50224,128593,5.304777e-03,5.304777e-03,1.808059e-11,3.399291e-01,1.442395e-04,0.101333,0.108667


In [47]:
meta_df_test_X = meta_df_test[['xgb', 'lgbm', 'log_reg', 'BernoulliNB', 'RF', 'ExtraTrees']]
meta_df_test_X

Unnamed: 0,xgb,lgbm,log_reg,BernoulliNB,RF,ExtraTrees
0,1.900340e-01,5.127886e-06,5.933143e-01,4.138579e-05,0.214667,0.196000
1,1.709778e-06,5.068966e-13,1.808893e-02,1.268511e-04,0.025333,0.006667
2,1.338428e-06,1.806996e-13,1.265925e-04,4.156611e-21,0.002667,0.000000
3,3.151632e-12,1.203761e-20,3.469336e-08,9.165226e-33,0.000667,0.000000
4,1.067086e-07,2.091126e-15,3.574779e-03,9.646847e-08,0.000000,0.001333
...,...,...,...,...,...,...
50221,1.563736e-05,1.098843e-13,1.584358e-02,2.698226e-02,0.035667,0.019667
50222,1.066246e-03,2.659870e-09,1.088376e-01,2.300842e-04,0.094667,0.090000
50223,2.033639e-04,8.762505e-14,3.241256e-01,8.933856e-01,0.152000,0.080778
50224,5.304777e-03,1.808059e-11,3.399291e-01,1.442395e-04,0.101333,0.108667


## Training a LGBM over this

In [48]:
%%time


param_grid = {
    'n_estimators': [500],
    'learning_rate': [0.1],
    'max_depth': [5]
    
}
lgbm = LGB.LGBMClassifier(objective='binary', nthreads=-1, seed=27, verbosity=2, is_imbalance=True, metric='auc')

clf = GridSearchCV(lgbm, param_grid=param_grid, cv=skf, n_jobs=-1, verbose=3, scoring='roc_auc').fit(meta_df_X, meta_df_Y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, score=0.9999963427277526, total=   8.3s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, score=0.9999993639526525, total=   9.5s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, score=1.0, total=  11.2s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   12.4s finished


CPU times: user 3.29 s, sys: 211 ms, total: 3.5 s
Wall time: 16.8 s


In [49]:
roc_auc_score( meta_df_Y.values,clf.predict_proba(meta_df_X)[:,1])

0.9999996819763264

In [50]:
sum(clf.predict(meta_df_test_X))

117

### Saving the submission

In [51]:
c='final'
test['redemption_status'] =clf.predict_proba(meta_df_test_X)[:,1]
submission = test.loc[:,['id', 'redemption_status']]
name = "../data/submissions/meta_lgbm_"+str(c)+".csv"
submission.to_csv( name,index=False)
print("File saved!")
test['redemption_status']

File saved!


0        1.765791e-08
1        1.111059e-08
2        1.124907e-08
3        1.124908e-08
4        1.124907e-08
             ...     
50221    1.103801e-08
50222    1.110983e-08
50223    1.106066e-08
50224    1.110931e-08
50225    1.124908e-08
Name: redemption_status, Length: 50226, dtype: float64

In [52]:
test[test['redemption_status'] > 0.5]

Unnamed: 0,campaign_id,coupon_id,customer_id,id,is_test,redemption_status,campaign_type,campaign_start_date,campaign_end_date,coupon_available_on_item_ids,customer_age_range,customer_marital_status,is_customer_rented,customer_family_size,customer_no_of_children,customer_income_bracket,is_common_demographic_info_available_for_customer,no_of_items_coupon_is_available_for,is_coupon_available_for_multiple_item_ids,no_of_categories_of_items_available_under_coupon,coupon_available_on_category_1,coupon_available_on_category_2,coupon_available_on_brand_1,coupon_available_on_brand_2,coupon_more_focussed_on_brand_type,popularity_of_category1,popularity_of_category2,popularity_of_brand1,popularity_of_brand2,mean_discount_on_category1,mean_discount_on_category2,mean_discount_on_brand1,mean_discount_on_brand2,no_of_times_customer_bought_category1,no_of_times_customer_bought_category2,no_of_times_customer_bought_brand1,no_of_times_customer_bought_brand2,no_of_times_customer_bought_category1_in_discount,no_of_times_customer_bought_category2_in_discount,no_of_times_customer_bought_brand1_in_discount,no_of_times_customer_bought_brand2_in_discount,has_customer_ever_bought_category1,has_customer_ever_bought_category2,has_customer_ever_bought_brand1,has_customer_ever_bought_brand2,has_customer_ever_bought_category1_in_discount,has_customer_ever_bought_category2_in_discount,has_customer_ever_bought_brand1_in_discount,has_customer_ever_bought_brand2_in_discount,average_selling_price_per_unit_of_category1_brand1_combination,average_selling_price_per_unit_of_category2_brand2_combination,average_transaction_amount_for_customer_per_item_of_category1,average_transaction_amount_for_customer_per_item_of_category2,is_customers_spending_habit_on_category1_in_favour_of_category1_brand1_combo,is_customers_spending_habit_on_category1_in_favour_of_category2_brand2_combo,campaign_duration_in_days,was_user_active_during_campaign_period,does_user_significantly_prefer_focussed_brand_type,does_user_prefer_category1_from_brand1,does_user_prefer_category1_from_brand2,does_user_prefer_any_of_brand1_brand2_for_category1,total_spends_of_customer,total_spends_category_of_customer,total_coupon_discount_claimed_by_customer,coupon_claiming_category_of_customer,most_bought_category_of_user,is_coupon_category1_same_as_customer_most_bought_category,is_brand1_same_as_most_favourite_brand_of_user_in_category1,is_brand2_same_as_second_most_favourite_brand_of_user_in_category1,is_brand2_same_as_most_favourite_brand_of_user_in_category1,is_brand1_same_as_second_most_favourite_brand_of_user_in_category1,are_the_brands_customers_favourite_brands_for_category1
361,20,671,1534,926,1,0.999990,1,2013-09-07,2013-11-16,"3463,27649,32873,1336,4313,16943,43894,18366,4...",3,1,0.0,3,1,5.0,1,69,1,1,3,0,676,-1,0,0.711382,0.000000,0.003105,0.000000,22.507003,0.000000,12.129654,0.000000,1238,0,17,0,900,0,10,0,1,0,1,0,1,0,1,0,72.914597,-1.0,68.685689,0.000000,0,1,70,0,1,0,0,0,187362.68,4,1194.24,4,1,1,0,0,0,0,0
667,19,689,389,1704,1,1.000000,1,2013-08-26,2013-09-27,"8610,33329,69233,24211,30016,33957,43057,17023...",2,1,0.0,4,2,6.0,1,72,1,1,6,0,686,-1,0,0.117204,0.000000,0.015547,0.000000,11.490816,0.000000,13.093350,0.000000,246,0,26,0,74,0,21,0,1,0,1,0,1,0,1,0,217.065908,-1.0,115.700294,0.000000,0,1,32,0,1,0,0,0,131248.71,3,1904.21,4,1,0,0,0,0,0,0
790,18,31,1070,1978,1,0.999957,0,2013-08-10,2013-10-04,"33596,41939,42796,56157,36896,71488,40715,3257...",5,0,0.0,1,0,6.0,1,4872,1,3,3,4,56,133,0,0.711382,0.072634,0.264804,0.006412,22.507003,27.890491,18.967506,31.657791,650,44,164,7,356,20,104,3,1,1,1,1,1,1,1,1,50.601550,-1.0,88.683308,128.038413,1,1,55,0,1,0,0,0,162285.14,4,3979.38,5,1,1,1,0,0,0,1
2085,18,869,1208,5224,1,0.608645,0,2013-08-10,2013-10-04,"1124,1185,39835,46911,3514,1033,45271,56179,40...",5,1,0.0,2,0,9.0,1,71,1,1,3,0,1075,-1,0,0.711382,0.000000,0.003300,0.000000,22.507003,0.000000,37.102000,0.000000,1009,0,41,0,412,0,36,0,1,0,1,0,1,0,1,0,108.805335,-1.0,90.223604,0.000000,0,1,55,0,1,0,0,0,272845.96,5,5923.47,5,1,1,0,0,0,0,0
2645,19,456,1341,6577,1,0.999684,1,2013-08-26,2013-09-27,"72024,21441,39789,37849,38877,15533,17648,3701...",2,1,0.0,4,2,5.0,1,33,1,1,6,0,158,-1,0,0.117204,0.000000,0.006479,0.000000,11.490816,0.000000,8.537463,0.000000,373,0,175,0,118,0,73,0,1,0,1,0,1,0,1,0,36.185270,-1.0,69.148016,0.000000,1,1,32,0,0,1,0,1,178909.07,4,2493.36,5,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47746,18,526,864,122098,1,0.999999,0,2013-08-10,2013-10-04,"72126,38808,1022,2153,43361,20805,35124,32462,...",5,0,0.0,1,0,5.0,1,46,1,1,3,0,278,-1,0,0.711382,0.000000,0.018100,0.000000,22.507003,0.000000,24.820366,0.000000,1257,0,77,0,786,0,62,0,1,0,1,0,1,0,1,0,72.980151,-1.0,72.945636,0.000000,0,1,55,0,1,0,0,0,249159.25,5,3726.15,5,1,1,0,1,0,1,1
47770,18,705,1563,122161,1,0.999677,0,2013-08-10,2013-10-04,"49007,31215,73066,49430,50633,49698,43585,2017...",-1,-1,-1.0,-1,-1,-1.0,0,146,1,2,3,5,714,-1,0,0.711382,0.002414,0.011422,0.000000,22.507003,2.868119,24.554538,0.000000,659,0,24,0,404,0,19,0,1,0,1,0,1,0,1,0,89.589351,-1.0,93.356435,0.000000,1,1,55,0,1,0,0,0,101664.08,2,1079.29,4,1,1,0,0,0,0,0
48146,18,468,1506,123136,1,1.000000,0,2013-08-10,2013-10-04,"42980,41399,1358,68357,12653,25000,51495,33821...",4,0,0.0,1,0,5.0,1,115,1,1,3,0,172,-1,0,0.711382,0.000000,0.011728,0.000000,22.507003,0.000000,32.479936,0.000000,652,0,7,0,463,0,7,0,1,0,1,0,1,0,1,0,87.438256,-1.0,42.485068,0.000000,0,1,55,0,0,0,0,0,79473.74,2,1659.91,4,1,1,0,0,0,0,0
48503,16,688,1441,124134,1,0.999684,1,2013-07-15,2013-08-16,"8081,35408,70765,61082,45494,37131,61051,5988,...",3,1,0.0,3,1,2.0,1,47,1,1,6,0,686,-1,0,0.117204,0.000000,0.015547,0.000000,11.490816,0.000000,13.093350,0.000000,631,0,74,0,186,0,58,0,1,0,1,0,1,0,1,0,217.065908,-1.0,131.990230,0.000000,0,1,32,0,1,0,0,0,254683.03,5,3073.24,5,1,0,0,0,0,0,0


## Training a XGB over this

In [53]:
%%time


param_grid = {
    'n_estimators': [500],
    'learning_rate': [0.1],
    'max_depth': [5]
    
}
lgbm = LGB.LGBMClassifier(objective='binary', nthreads=-1, seed=27, verbosity=2, is_imbalance=True, metric='auc')

clf = GridSearchCV(lgbm, param_grid=param_grid, cv=skf, n_jobs=-1, verbose=3, scoring='roc_auc').fit(meta_df_X, meta_df_Y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, score=0.9999963427277527, total=   7.6s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, score=0.9999993639526525, total=   8.1s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, score=1.0, total=   9.6s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   10.6s finished


CPU times: user 3.24 s, sys: 124 ms, total: 3.36 s
Wall time: 14.4 s


In [54]:
roc_auc_score( meta_df_Y.values,clf.predict_proba(meta_df_X)[:,1])

0.9999996819763264

In [55]:
sum(clf.predict(meta_df_test_X))

117

### Saving the submission

In [56]:
c='final'
test['redemption_status'] =clf.predict_proba(meta_df_test_X)[:,1]
submission = test.loc[:,['id', 'redemption_status']]
name = "../data/submissions/meta_xgb_"+str(c)+".csv"
submission.to_csv( name,index=False)
print("File saved!")
test['redemption_status']

File saved!


0        1.765791e-08
1        1.111059e-08
2        1.124907e-08
3        1.124908e-08
4        1.124907e-08
             ...     
50221    1.103801e-08
50222    1.110983e-08
50223    1.106066e-08
50224    1.110931e-08
50225    1.124908e-08
Name: redemption_status, Length: 50226, dtype: float64

# Creating an ensemble of XGB, LGB, Meta XGB, Meta LGBM, RF and Extra Trees

In [57]:
xgb = pd.read_csv("../data/submissions/submission_after_modeling_xgbfinal.csv")
lgbm = pd.read_csv("../data/submissions/submission_after_modeling_lgb_final.csv")
meta_xgb = pd.read_csv("../data/submissions/meta_xgb_final.csv")
meta_lgbm = pd.read_csv("../data/submissions/meta_lgbm_final.csv")
RF = pd.read_csv("../data/submissions/submission_after_modeling_RF_final.csv")
ExtraTrees = pd.read_csv("../data/submissions/submission_after_modeling_ExtraTrees_final.csv")



In [58]:
ensembled = meta_xgb.copy()

In [59]:
ensembled['redemption_status'] = ensembled['redemption_status']*0.30
ensembled['redemption_status'] = ensembled['redemption_status']+meta_lgbm['redemption_status']*0.30+xgb['redemption_status']*0.1+lgbm['redemption_status']*0.1+RF['redemption_status']*0.1+ExtraTrees['redemption_status']*0.1

In [60]:
ensembled[ensembled.redemption_status > 0.5]

Unnamed: 0,id,redemption_status
361,926,0.807855
667,1704,0.940069
790,1978,0.738516
2085,5224,0.519196
2645,6577,0.864933
...,...,...
46605,119197,0.973712
47746,122098,0.726629
47770,122161,0.748001
48146,123136,0.944556


### Saving the ensembled solution

In [61]:
c='final'
name = "../data/submissions/submission_stacked_and_ensembled_"+str(c)+".csv"
ensembled.to_csv( name,index=False)
print("File saved!")

File saved!
