In [31]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

import itertools
import joblib

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
columns = [
    # unique id
    u'reservation_id', 
    # dates
    u'booking_date', u'checkin_date', u'checkout_date',
    # traveller related info
    u'memberid', u'numberofadults', u'numberofchildren', u'total_pax', u'persontravellingid', u'member_age_buckets', 
    u'state_code_residence',
    # resort info
    u'resort_id', u'resort_region_code', u'resort_type_code', u'state_code_resort', u'cluster_code',
    # booking info
    u'channel_code', u'booking_type_code', u'room_type_booked_code', u'roomnights', u'season_holidayed_code', 
    u'reservationstatusid_code', 
    # product purchased
    u'main_product_code',
    # target 
    u'amount_spent_per_room_night_scaled'
]


In [3]:
train = pd.read_csv('train.csv')
print train.shape
test = pd.read_csv('test.csv')
print test.shape

(341424, 24)
(146765, 23)


In [4]:
df = pd.concat([train, test], axis=0, sort=False)
df.reset_index(drop=True, inplace=True)
df.shape

(488189, 24)

In [5]:
df.booking_date = pd.to_datetime(df.booking_date, format='%d/%m/%y')
df.checkin_date = pd.to_datetime(df.checkin_date, format='%d/%m/%y')
df.checkout_date = pd.to_datetime(df.checkout_date, format='%d/%m/%y')

In [6]:
df = df[columns]
df.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346


In [7]:
df['booking_in_advance_days'] = (df.checkin_date - df.booking_date).dt.days
df['booking_roomnights'] = (df.checkout_date - df.checkin_date).dt.days
df['total_persons_travelling'] = df.numberofadults + df.numberofchildren

In [8]:
# roomnights, numberofadults, numberofchildren, total_pax

In [9]:
def create_date_variables(df, date_key, week=False, month=False, year=False, dayofweek=False, dayofmonth=False, dayofyear=False):
    if week:
        df['{}_{}'.format(date_key, 'week')] = df[date_key].dt.week
    if month:
        df['{}_{}'.format(date_key, 'month')] = df[date_key].dt.month
    if year:
        df['{}_{}'.format(date_key, 'year')] = df[date_key].dt.year
    if dayofweek:
        df['{}_{}'.format(date_key, 'dayofweek')] = df[date_key].dt.dayofweek
    if dayofmonth:
        df['{}_{}'.format(date_key, 'dayofmonth')] = df[date_key].dt.day
    if dayofyear:
        df['{}_{}'.format(date_key, 'dayofyear')] = df[date_key].dt.dayofyear
    return df 

def num_div_interactions(df, num_cols):
    for col1, col2 in itertools.combinations(num_cols,2):
        df["{}_div_{}".format(col1, col2)] = (df[col1]/df[col2]).replace([np.inf, -np.inf, np.nan], 0)
    return df

def num_dif_interactions(df, num_cols):
    for col1, col2 in itertools.combinations(num_cols,2):
        df["{}_dif_{}".format(col1, col2)] = df[col1] - df[col2]
    return df
    
def cat_interactions(df, cat_cols):
    for col1, col2 in itertools.combinations(cat_cols, 2):
        print '{}_{}'.format(col1, col2) 
        df['{}_{}'.format(col1, col2)] = df[col1].astype(str) + '_' + df[col2].astype(str)
    return df

    
def create_group_variables(df, group_col, num_col):
    #     count, mean, std, min, max, x-mean, median, x-median, iqr 
    if isinstance(group_col, str):
        group_col = [group_col]
    if isinstance(num_col, str):
        num_col = [num_col]
        
    df_group = df.groupby(group_col)
    
    functions = {
        'mean':'mean',
        'max':'max',
        'min':'min',
        'std':'std',
        'median':'median',
#         'min-max': lambda x: x.max()- x.min()
    }
    final_columns, apply_dict = [], {}
    prefix = 'GRP_' + '_'.join(map(str, group_col))
    for col in num_col:
        methods = []            
        for key, val in functions.iteritems():
            methods.append(val)
            final_columns.append("{}#{}#{}".format(prefix , col, key)) 
        apply_dict[col] = methods
        
    df_var = df_group.agg(apply_dict)
    df_var.columns = final_columns
    df_var.reset_index(inplace=True)
    
    df = df.merge(df_var, how='left', on=group_col)

    for col in num_col:
        df["{}#{}#{}".format(prefix , col, 'min_mean')] = df[col] - df["{}#{}#{}".format(prefix , col, 'mean')]
        
    return df


In [10]:
# df.head()
df = create_date_variables(df, 'checkin_date',  week=True, month=True, year=True, dayofweek=True, dayofmonth=True, dayofyear=True)
df = create_date_variables(df, 'checkout_date',  week=True, dayofweek=True)
df = create_date_variables(df, 'booking_date',  week=True, month=True, year=True, dayofyear=True)


# df = create_date_variables(df, 'checkout_date')
# df = create_date_variables(df, 'booking_date')

In [11]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_month', u'booking_date_year', u'booking_date_dayofyear'],
      dtype='object')

In [12]:
df = cat_interactions(df, ['resort_id', 'checkin_date'])
df = cat_interactions(df, ['resort_id', 'checkout_date'])
df = cat_interactions(df, ['resort_id', 'booking_date'])


df = cat_interactions(df, ['resort_id', 'checkin_date_year'])
df = cat_interactions(df, ['resort_id', 'checkin_date_month'])
df = cat_interactions(df, ['resort_id', 'checkin_date_week'])
df = cat_interactions(df, ['resort_id', 'checkin_date_dayofweek'])
df = cat_interactions(df, ['resort_id', 'checkin_date_dayofyear'])


df = cat_interactions(df, ['resort_id', 'channel_code'])
df = cat_interactions(df, ['resort_id', 'booking_type_code'])
df = cat_interactions(df, ['resort_id', 'reservationstatusid_code'])
df = cat_interactions(df, ['resort_id', 'resort_type_code'])
df = cat_interactions(df, ['resort_id', 'cluster_code'])
df = cat_interactions(df, ['resort_id', 'room_type_booked_code'])


df = cat_interactions(df, ['persontravellingid', 'member_age_buckets', 'state_code_residence', 'state_code_resort'])
df = cat_interactions(df, ['resort_id', 'memberid'])

df = cat_interactions(df, ['memberid', 'channel_code'])
df = cat_interactions(df, ['memberid', 'booking_type_code'])
df = cat_interactions(df, ['memberid', 'reservationstatusid_code'])
df = cat_interactions(df, ['memberid', 'resort_type_code'])
df = cat_interactions(df, ['memberid', 'cluster_code'])
df = cat_interactions(df, ['memberid', 'room_type_booked_code'])

df = cat_interactions(df, ['memberid', 'checkin_date'])
df = cat_interactions(df, ['memberid', 'checkout_date'])
df = cat_interactions(df, ['memberid', 'booking_date'])

df = cat_interactions(df, ['memberid', 'checkin_date_year'])
df = cat_interactions(df, ['memberid', 'checkin_date_month'])
df = cat_interactions(df, ['memberid', 'checkin_date_week'])
df = cat_interactions(df, ['memberid', 'checkin_date_dayofweek'])
df = cat_interactions(df, ['memberid', 'checkin_date_dayofyear'])

df = cat_interactions(df, ['resort_id', 'booking_roomnights'])
df = cat_interactions(df, ['memberid', 'booking_roomnights'])

resort_id_checkin_date
resort_id_checkout_date
resort_id_booking_date
resort_id_checkin_date_year
resort_id_checkin_date_month
resort_id_checkin_date_week
resort_id_checkin_date_dayofweek
resort_id_checkin_date_dayofyear
resort_id_channel_code
resort_id_booking_type_code
resort_id_reservationstatusid_code
resort_id_resort_type_code
resort_id_cluster_code
resort_id_room_type_booked_code
persontravellingid_member_age_buckets
persontravellingid_state_code_residence
persontravellingid_state_code_resort
member_age_buckets_state_code_residence
member_age_buckets_state_code_resort
state_code_residence_state_code_resort
resort_id_memberid
memberid_channel_code
memberid_booking_type_code
memberid_reservationstatusid_code
memberid_resort_type_code
memberid_cluster_code
memberid_room_type_booked_code
memberid_checkin_date
memberid_checkout_date
memberid_booking_date
memberid_checkin_date_year
memberid_checkin_date_month
memberid_checkin_date_week
memberid_checkin_date_dayofweek
memberid_checkin_d

In [13]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_month', u'booking_date_year', u'booking_date_dayofyear',
       u'resort_id_checkin

In [14]:
df = num_div_interactions(df, num_cols=['numberofadults', 'numberofchildren','total_pax',  
                                        'roomnights', 'booking_in_advance_days', 'booking_roomnights', 
                                        'total_persons_travelling'])

df = num_dif_interactions(df, num_cols=[ 'roomnights', 'booking_roomnights'])
df = num_dif_interactions(df, num_cols=[ 'total_persons_travelling', 'total_pax'])
df = num_dif_interactions(df, num_cols=[ 'checkout_date_week', 'checkin_date_week', 'booking_date_week'])
df.shape
                                        

(488189, 102)

In [15]:
for c in [u'booking_date', u'checkin_date', u'checkout_date', u'memberid', u'resort_id',
        u'resort_id_checkin_date', u'resort_id_checkout_date',
         ]:
    print c
    df = create_group_variables(df, group_col=[c], num_col=[
        u'numberofadults', u'numberofchildren', u'total_pax',u'roomnights', u'booking_in_advance_days',
        'total_persons_travelling', 'booking_roomnights', 
        u'total_persons_travelling_dif_total_pax', u'roomnights_dif_booking_roomnights'
    ])
    print df.shape

booking_date
(488189, 156)
checkin_date
(488189, 210)
checkout_date
(488189, 264)
memberid
(488189, 318)
resort_id
(488189, 372)
resort_id_checkin_date
(488189, 426)
resort_id_checkout_date
(488189, 480)


In [16]:
df.shape
df.head()

(488189, 480)

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled,booking_in_advance_days,booking_roomnights,total_persons_travelling,checkin_date_week,checkin_date_month,checkin_date_year,checkin_date_dayofweek,checkin_date_dayofmonth,checkin_date_dayofyear,checkout_date_week,checkout_date_dayofweek,booking_date_week,booking_date_month,booking_date_year,booking_date_dayofyear,resort_id_checkin_date,resort_id_checkout_date,resort_id_booking_date,resort_id_checkin_date_year,resort_id_checkin_date_month,resort_id_checkin_date_week,resort_id_checkin_date_dayofweek,resort_id_checkin_date_dayofyear,resort_id_channel_code,resort_id_booking_type_code,resort_id_reservationstatusid_code,...,GRP_resort_id_checkout_date#numberofadults#mean,GRP_resort_id_checkout_date#numberofchildren#std,GRP_resort_id_checkout_date#numberofchildren#max,GRP_resort_id_checkout_date#numberofchildren#min,GRP_resort_id_checkout_date#numberofchildren#median,GRP_resort_id_checkout_date#numberofchildren#mean,GRP_resort_id_checkout_date#total_pax#std,GRP_resort_id_checkout_date#total_pax#max,GRP_resort_id_checkout_date#total_pax#min,GRP_resort_id_checkout_date#total_pax#median,GRP_resort_id_checkout_date#total_pax#mean,GRP_resort_id_checkout_date#roomnights#std,GRP_resort_id_checkout_date#roomnights#max,GRP_resort_id_checkout_date#roomnights#min,GRP_resort_id_checkout_date#roomnights#median,GRP_resort_id_checkout_date#roomnights#mean,GRP_resort_id_checkout_date#booking_in_advance_days#std,GRP_resort_id_checkout_date#booking_in_advance_days#max,GRP_resort_id_checkout_date#booking_in_advance_days#min,GRP_resort_id_checkout_date#booking_in_advance_days#median,GRP_resort_id_checkout_date#booking_in_advance_days#mean,GRP_resort_id_checkout_date#total_persons_travelling#std,GRP_resort_id_checkout_date#total_persons_travelling#max,GRP_resort_id_checkout_date#total_persons_travelling#min,GRP_resort_id_checkout_date#total_persons_travelling#median,GRP_resort_id_checkout_date#total_persons_travelling#mean,GRP_resort_id_checkout_date#booking_roomnights#std,GRP_resort_id_checkout_date#booking_roomnights#max,GRP_resort_id_checkout_date#booking_roomnights#min,GRP_resort_id_checkout_date#booking_roomnights#median,GRP_resort_id_checkout_date#booking_roomnights#mean,GRP_resort_id_checkout_date#total_persons_travelling_dif_total_pax#std,GRP_resort_id_checkout_date#total_persons_travelling_dif_total_pax#max,GRP_resort_id_checkout_date#total_persons_travelling_dif_total_pax#min,GRP_resort_id_checkout_date#total_persons_travelling_dif_total_pax#median,GRP_resort_id_checkout_date#total_persons_travelling_dif_total_pax#mean,GRP_resort_id_checkout_date#roomnights_dif_booking_roomnights#std,GRP_resort_id_checkout_date#roomnights_dif_booking_roomnights#max,GRP_resort_id_checkout_date#roomnights_dif_booking_roomnights#min,GRP_resort_id_checkout_date#roomnights_dif_booking_roomnights#median,GRP_resort_id_checkout_date#roomnights_dif_booking_roomnights#mean,GRP_resort_id_checkout_date#numberofadults#min_mean,GRP_resort_id_checkout_date#numberofchildren#min_mean,GRP_resort_id_checkout_date#total_pax#min_mean,GRP_resort_id_checkout_date#roomnights#min_mean,GRP_resort_id_checkout_date#booking_in_advance_days#min_mean,GRP_resort_id_checkout_date#total_persons_travelling#min_mean,GRP_resort_id_checkout_date#booking_roomnights#min_mean,GRP_resort_id_checkout_date#total_persons_travelling_dif_total_pax#min_mean,GRP_resort_id_checkout_date#roomnights_dif_booking_roomnights#min_mean
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428,0,1,2,14,4,2018,3,5,95,14,4,14,4,2018,95,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,...,-0.8,1.140175,5,2,4.0,3.6,0.547723,1,0,0.0,0.4,0.0,0,0,0.0,0.0,37.19543,93,0,15.0,29.0,0.83666,3,1,2.0,1.8,1.095445,4,2,2.0,2.8,0.83666,3,1,2.0,2.2,1.095445,4,2,2.0,2.8,2.8,-3.6,2.6,1.0,-29.0,0.2,-1.8,-3.2,-2.8
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563,78,5,2,15,4,2015,5,11,101,16,3,4,1,2015,23,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,...,0.5,0.755929,4,2,2.0,2.5,1.511858,4,0,0.0,1.0,0.707107,2,0,0.0,0.25,32.780656,102,15,92.0,77.5,1.069045,5,2,3.0,3.0,1.035098,4,2,2.0,2.75,1.85164,8,2,3.5,4.0,1.069045,4,2,3.0,3.0,1.5,-2.5,1.0,4.75,0.5,-1.0,2.25,-4.0,-3.0
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602,4,4,2,5,2,2015,6,1,32,6,3,5,1,2015,28,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,...,0.571429,0.48795,3,2,2.0,2.285714,1.253566,3,0,0.0,0.714286,0.377964,1,0,0.0,0.142857,47.088569,93,4,92.0,55.0,0.57735,4,2,3.0,3.0,0.95119,4,2,2.0,2.714286,1.112697,6,3,3.0,3.714286,0.899735,4,2,3.0,2.857143,1.428571,-2.285714,1.285714,3.857143,-51.0,-1.0,1.285714,-3.714286,-2.857143
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943,40,5,4,24,6,2015,3,11,162,25,1,18,5,2015,122,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,...,-0.074074,1.217161,6,2,3.0,3.407407,2.292686,8,0,0.0,1.222222,0.733799,2,0,0.0,0.333333,30.394744,103,0,41.0,43.925926,1.983551,8,1,3.0,3.62963,1.240347,6,2,3.0,3.0,2.970414,14,1,4.0,4.851852,1.176697,6,2,3.0,3.333333,2.074074,-1.407407,0.777778,4.666667,-3.925926,0.37037,2.0,-2.851852,-3.333333
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346,103,5,2,51,12,2015,0,14,348,51,5,36,9,2015,245,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,...,0.361702,1.062784,6,1,3.0,2.851064,1.829201,8,0,0.0,0.957447,0.649486,2,0,0.0,0.276596,41.120837,106,0,19.0,45.893617,1.392792,6,1,3.0,3.12766,1.258091,6,1,2.0,2.93617,2.611261,14,1,3.0,4.085106,1.249977,6,1,3.0,3.212766,1.638298,-2.851064,1.042553,4.723404,57.106383,-1.12766,2.06383,-4.085106,-3.212766


In [17]:
unique_id = 'reservation_id'

date_cols = []

target_col = 'amount_spent_per_room_night_scaled'

cat_cols = [
    'memberid',
    'persontravellingid', 'member_age_buckets', 'state_code_residence', 
    'resort_id', 'resort_region_code', 'resort_type_code', 'state_code_resort', 'cluster_code', 'channel_code',
    'booking_type_code', 'room_type_booked_code', 'season_holidayed_code', 'reservationstatusid_code', 'main_product_code',
    u'checkin_date', u'checkout_date', u'booking_date',
    
 u'resort_id_checkin_date', u'resort_id_checkout_date',
u'resort_id_booking_date', u'resort_id_checkin_date_year',
u'resort_id_checkin_date_month', u'resort_id_checkin_date_week',
u'resort_id_checkin_date_dayofweek',
u'resort_id_checkin_date_dayofyear', u'resort_id_channel_code',
u'resort_id_booking_type_code', u'resort_id_reservationstatusid_code',
u'resort_id_resort_type_code', u'resort_id_cluster_code',
u'resort_id_room_type_booked_code',
u'persontravellingid_member_age_buckets',
u'persontravellingid_state_code_residence',
u'persontravellingid_state_code_resort',
u'member_age_buckets_state_code_residence',
u'member_age_buckets_state_code_resort',
u'state_code_residence_state_code_resort', u'resort_id_memberid',
u'memberid_channel_code', u'memberid_booking_type_code',
u'memberid_reservationstatusid_code', u'memberid_resort_type_code',
u'memberid_cluster_code', u'memberid_room_type_booked_code',
u'memberid_checkin_date', u'memberid_checkout_date',
u'memberid_booking_date', u'memberid_checkin_date_year',
u'memberid_checkin_date_month', u'memberid_checkin_date_week',
u'memberid_checkin_date_dayofweek', u'memberid_checkin_date_dayofyear',
u'resort_id_booking_roomnights', u'memberid_booking_roomnights'
]

columns_to_drop = []


In [18]:
from ml_modules.encoding import FreqeuncyEncoding

In [19]:
fE = FreqeuncyEncoding(categorical_columns=cat_cols, return_df=True)

In [20]:
%%time
df1 = fE.fit_transform(df)

CPU times: user 1min 42s, sys: 1min 23s, total: 3min 6s
Wall time: 1min 22s


In [21]:
df1.shape

(488189, 480)

In [22]:
df1.drop(columns=date_cols +[unique_id, target_col] + columns_to_drop, inplace=True)

In [23]:
x_train, y_train = df1[:341424].values, train[target_col].values
x_train.shape, y_train.shape

((341424, 478), (341424,))

In [24]:
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

In [25]:
from ml_modules.custom_estimator import Estimator
from ml_modules.custom_fold_generator import FoldScheme

In [26]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

####  lgb numleaves 100 k5

In [45]:
est = Estimator(model=LGBMRegressor(**{
        'n_estimators': 20000, 
        'learning_rate': 0.01,
        'num_leaves': 100, 
        'colsample_bytree': 0.5000000000000001, 
        'subsample': 1.0, 
        'min_child_weight': 150, 
        'boosting_type': 'gbdt'
}), early_stopping_rounds=200, random_state=50, validation_scheme=FoldScheme.KFold, 
                eval_metric='rmse', task_type='regression', scoring_metric=rmse, n_splits=5)

In [46]:
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.00524	valid_0's l2: 1.01052	valid_1's rmse: 0.997861	valid_1's l2: 0.995727
[200]	valid_0's rmse: 0.984361	valid_0's l2: 0.968968	valid_1's rmse: 0.973021	valid_1's l2: 0.946769
[300]	valid_0's rmse: 0.976951	valid_0's l2: 0.954433	valid_1's rmse: 0.961448	valid_1's l2: 0.924383
[400]	valid_0's rmse: 0.973205	valid_0's l2: 0.947129	valid_1's rmse: 0.953339	valid_1's l2: 0.908855
[500]	valid_0's rmse: 0.971103	valid_0's l2: 0.943042	valid_1's rmse: 0.946704	valid_1's l2: 0.896249
[600]	valid_0's rmse: 0.969864	valid_0's l2: 0.940636	valid_1's rmse: 0.940886	valid_1's l2: 0.885267
[700]	valid_0's rmse: 0.968987	valid_0's l2: 0.938935	valid_1's rmse: 0.935489	valid_1's l2: 0.875141
[800]	valid_0's rmse: 0.968354	valid_0's l2: 0.937709	valid_1's rmse: 0.930324	valid_1's l2: 0.865503
[900]	valid_0's rmse: 0.96791	valid_0's l2: 0.936851	valid_1's rmse: 0.925548	valid_1's l2: 0.85664
[1000]	valid_0's rmse: 

[300]	valid_0's rmse: 0.978011	valid_0's l2: 0.956506	valid_1's rmse: 0.960883	valid_1's l2: 0.923295
[400]	valid_0's rmse: 0.97449	valid_0's l2: 0.949631	valid_1's rmse: 0.952702	valid_1's l2: 0.907641
[500]	valid_0's rmse: 0.972564	valid_0's l2: 0.945881	valid_1's rmse: 0.946053	valid_1's l2: 0.895016
[600]	valid_0's rmse: 0.971395	valid_0's l2: 0.943608	valid_1's rmse: 0.940262	valid_1's l2: 0.884094
[700]	valid_0's rmse: 0.970541	valid_0's l2: 0.941949	valid_1's rmse: 0.934838	valid_1's l2: 0.873922
[800]	valid_0's rmse: 0.969908	valid_0's l2: 0.940722	valid_1's rmse: 0.929699	valid_1's l2: 0.864341
[900]	valid_0's rmse: 0.96945	valid_0's l2: 0.939833	valid_1's rmse: 0.924891	valid_1's l2: 0.855423
[1000]	valid_0's rmse: 0.969235	valid_0's l2: 0.939417	valid_1's rmse: 0.920356	valid_1's l2: 0.847056
[1100]	valid_0's rmse: 0.969025	valid_0's l2: 0.939009	valid_1's rmse: 0.916075	valid_1's l2: 0.839193
[1200]	valid_0's rmse: 0.968912	valid_0's l2: 0.93879	valid_1's rmse: 0.911961	val

In [47]:
np.mean(est.cv_scores), np.std(est.cv_scores), est.cv_scores

(0.9650585884109836,
 0.0025375069624602223,
 [0.9669465307326046,
  0.9648962398870645,
  0.9610235445438625,
  0.9640160804302255,
  0.9684105464611609])

In [49]:
joblib.dump(est, 'lgb-480-lve100-k5.pkl')

['lgb-480-lve100-k5.pkl']

In [50]:
x_test = df1[341424:].values
test_preds = est.transform(x_test)

In [51]:
pd.DataFrame(zip(test[unique_id], test_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-480-lve100-k5-test.csv', index=False)

pd.DataFrame(zip(train[unique_id], train_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-480-lve100-k5-train.csv', index=False)

####  lgb numleaves 48 k5

In [36]:
est = Estimator(model=LGBMRegressor(**{
        'n_estimators': 20000, 
        'learning_rate': 0.01,
        'num_leaves': 48, 
        'colsample_bytree': 0.5000000000000001, 
        'subsample': 1.0, 
        'min_child_weight': 150, 
        'boosting_type': 'gbdt'
}), early_stopping_rounds=200, random_state=50, validation_scheme=FoldScheme.KFold, 
                eval_metric='rmse', task_type='regression', scoring_metric=rmse, n_splits=5)

In [37]:
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.00983	valid_0's l2: 1.01976	valid_1's rmse: 1.0042	valid_1's l2: 1.00841
[200]	valid_0's rmse: 0.98829	valid_0's l2: 0.976717	valid_1's rmse: 0.98111	valid_1's l2: 0.962577
[300]	valid_0's rmse: 0.979848	valid_0's l2: 0.960101	valid_1's rmse: 0.9711	valid_1's l2: 0.943035
[400]	valid_0's rmse: 0.975585	valid_0's l2: 0.951766	valid_1's rmse: 0.965013	valid_1's l2: 0.931249
[500]	valid_0's rmse: 0.973017	valid_0's l2: 0.946762	valid_1's rmse: 0.960519	valid_1's l2: 0.922597
[600]	valid_0's rmse: 0.971475	valid_0's l2: 0.943763	valid_1's rmse: 0.956915	valid_1's l2: 0.915686
[700]	valid_0's rmse: 0.970381	valid_0's l2: 0.94164	valid_1's rmse: 0.953695	valid_1's l2: 0.909533
[800]	valid_0's rmse: 0.96959	valid_0's l2: 0.940105	valid_1's rmse: 0.950744	valid_1's l2: 0.903914
[900]	valid_0's rmse: 0.968992	valid_0's l2: 0.938946	valid_1's rmse: 0.947951	valid_1's l2: 0.898611
[1000]	valid_0's rmse: 0.96855

[1500]	valid_0's rmse: 0.960721	valid_0's l2: 0.922984	valid_1's rmse: 0.935536	valid_1's l2: 0.875227
[1600]	valid_0's rmse: 0.960666	valid_0's l2: 0.922879	valid_1's rmse: 0.933453	valid_1's l2: 0.871335
[1700]	valid_0's rmse: 0.960617	valid_0's l2: 0.922785	valid_1's rmse: 0.931418	valid_1's l2: 0.867539
[1800]	valid_0's rmse: 0.960544	valid_0's l2: 0.922644	valid_1's rmse: 0.929423	valid_1's l2: 0.863827
[1900]	valid_0's rmse: 0.960561	valid_0's l2: 0.922678	valid_1's rmse: 0.927448	valid_1's l2: 0.86016
[2000]	valid_0's rmse: 0.960536	valid_0's l2: 0.922629	valid_1's rmse: 0.925471	valid_1's l2: 0.856496
[2100]	valid_0's rmse: 0.960537	valid_0's l2: 0.922631	valid_1's rmse: 0.923517	valid_1's l2: 0.852884
[2200]	valid_0's rmse: 0.960535	valid_0's l2: 0.922628	valid_1's rmse: 0.921641	valid_1's l2: 0.849422
Early stopping, best iteration is:
[2045]	valid_0's rmse: 0.960511	valid_0's l2: 0.922581	valid_1's rmse: 0.924576	valid_1's l2: 0.854841
Training until validation scores don't 

In [38]:
np.mean(est.cv_scores), np.std(est.cv_scores), est.cv_scores

(0.9648628181531865,
 0.0026018158207809705,
 [0.9667743478930488,
  0.964811701885951,
  0.9605108666738996,
  0.9640902578338101,
  0.968126916479223])

In [39]:
joblib.dump(est, 'lgb-480-lve48-k5.pkl')

['lgb-480-lve48-k5.pkl']

In [40]:
x_test = df1[341424:].values
test_preds = est.transform(x_test)

In [41]:
pd.DataFrame(zip(test[unique_id], test_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-480-lve48-k5-test.csv', index=False)

pd.DataFrame(zip(train[unique_id], train_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-480-lve48-k5-train.csv', index=False)

####  lgb numleaves 48 k10

In [27]:
est = Estimator(model=LGBMRegressor(**{
        'n_estimators': 20000, 
        'learning_rate': 0.01,
        'num_leaves': 48, 
        'colsample_bytree': 0.5000000000000001, 
        'subsample': 1.0, 
        'min_child_weight': 150, 
        'boosting_type': 'gbdt'
}), early_stopping_rounds=200, random_state=50, validation_scheme=FoldScheme.KFold, 
                eval_metric='rmse', task_type='regression', scoring_metric=rmse, n_splits=10)

In [28]:
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.01224	valid_0's l2: 1.02464	valid_1's rmse: 1.00436	valid_1's l2: 1.00873
[200]	valid_0's rmse: 0.990678	valid_0's l2: 0.981443	valid_1's rmse: 0.981502	valid_1's l2: 0.963346
[300]	valid_0's rmse: 0.982233	valid_0's l2: 0.964781	valid_1's rmse: 0.971738	valid_1's l2: 0.944274
[400]	valid_0's rmse: 0.977671	valid_0's l2: 0.955841	valid_1's rmse: 0.965816	valid_1's l2: 0.9328
[500]	valid_0's rmse: 0.975062	valid_0's l2: 0.950745	valid_1's rmse: 0.961515	valid_1's l2: 0.924511
[600]	valid_0's rmse: 0.973476	valid_0's l2: 0.947656	valid_1's rmse: 0.958111	valid_1's l2: 0.917978
[700]	valid_0's rmse: 0.972296	valid_0's l2: 0.94536	valid_1's rmse: 0.955097	valid_1's l2: 0.91221
[800]	valid_0's rmse: 0.971375	valid_0's l2: 0.943569	valid_1's rmse: 0.95232	valid_1's l2: 0.906913
[900]	valid_0's rmse: 0.970691	valid_0's l2: 0.94224	valid_1's rmse: 0.949758	valid_1's l2: 0.90204
[1000]	valid_0's rmse: 0.97023

[1700]	valid_0's rmse: 0.964711	valid_0's l2: 0.930668	valid_1's rmse: 0.933622	valid_1's l2: 0.87165
[1800]	valid_0's rmse: 0.964636	valid_0's l2: 0.930522	valid_1's rmse: 0.931813	valid_1's l2: 0.868276
[1900]	valid_0's rmse: 0.964569	valid_0's l2: 0.930394	valid_1's rmse: 0.930017	valid_1's l2: 0.864931
[2000]	valid_0's rmse: 0.96451	valid_0's l2: 0.93028	valid_1's rmse: 0.928222	valid_1's l2: 0.861596
[2100]	valid_0's rmse: 0.964481	valid_0's l2: 0.930223	valid_1's rmse: 0.926495	valid_1's l2: 0.858393
[2200]	valid_0's rmse: 0.96441	valid_0's l2: 0.930087	valid_1's rmse: 0.924771	valid_1's l2: 0.855202
[2300]	valid_0's rmse: 0.964339	valid_0's l2: 0.92995	valid_1's rmse: 0.923071	valid_1's l2: 0.852061
[2400]	valid_0's rmse: 0.964272	valid_0's l2: 0.929821	valid_1's rmse: 0.921369	valid_1's l2: 0.84892
[2500]	valid_0's rmse: 0.964228	valid_0's l2: 0.929735	valid_1's rmse: 0.919696	valid_1's l2: 0.84584
[2600]	valid_0's rmse: 0.964185	valid_0's l2: 0.929653	valid_1's rmse: 0.918037	

[3400]	valid_0's rmse: 0.958269	valid_0's l2: 0.918279	valid_1's rmse: 0.905647	valid_1's l2: 0.820196
[3500]	valid_0's rmse: 0.958306	valid_0's l2: 0.91835	valid_1's rmse: 0.904057	valid_1's l2: 0.817319
[3600]	valid_0's rmse: 0.958336	valid_0's l2: 0.918408	valid_1's rmse: 0.90248	valid_1's l2: 0.81447
Early stopping, best iteration is:
[3408]	valid_0's rmse: 0.958267	valid_0's l2: 0.918276	valid_1's rmse: 0.905522	valid_1's l2: 0.81997
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.00375	valid_0's l2: 1.00752	valid_1's rmse: 1.00518	valid_1's l2: 1.0104
[200]	valid_0's rmse: 0.982361	valid_0's l2: 0.965033	valid_1's rmse: 0.982354	valid_1's l2: 0.965019
[300]	valid_0's rmse: 0.973921	valid_0's l2: 0.948523	valid_1's rmse: 0.97263	valid_1's l2: 0.946009
[400]	valid_0's rmse: 0.969534	valid_0's l2: 0.939997	valid_1's rmse: 0.96668	valid_1's l2: 0.93447
[500]	valid_0's rmse: 0.967053	valid_0's l2: 0.935191	valid_1's rmse: 0.962334	valid_1's l2: 0

[1500]	valid_0's rmse: 0.957558	valid_0's l2: 0.916918	valid_1's rmse: 0.938172	valid_1's l2: 0.880167
[1600]	valid_0's rmse: 0.957435	valid_0's l2: 0.916682	valid_1's rmse: 0.936256	valid_1's l2: 0.876576
[1700]	valid_0's rmse: 0.957318	valid_0's l2: 0.916459	valid_1's rmse: 0.934411	valid_1's l2: 0.873124
[1800]	valid_0's rmse: 0.957247	valid_0's l2: 0.916323	valid_1's rmse: 0.93259	valid_1's l2: 0.869724
[1900]	valid_0's rmse: 0.957162	valid_0's l2: 0.916159	valid_1's rmse: 0.930804	valid_1's l2: 0.866396
[2000]	valid_0's rmse: 0.95706	valid_0's l2: 0.915965	valid_1's rmse: 0.929029	valid_1's l2: 0.863094
[2100]	valid_0's rmse: 0.956996	valid_0's l2: 0.915841	valid_1's rmse: 0.927282	valid_1's l2: 0.859851
[2200]	valid_0's rmse: 0.956906	valid_0's l2: 0.915669	valid_1's rmse: 0.925521	valid_1's l2: 0.85659
[2300]	valid_0's rmse: 0.956884	valid_0's l2: 0.915628	valid_1's rmse: 0.923777	valid_1's l2: 0.853364
[2400]	valid_0's rmse: 0.956799	valid_0's l2: 0.915465	valid_1's rmse: 0.922

[2500]	valid_0's rmse: 0.972943	valid_0's l2: 0.946618	valid_1's rmse: 0.918818	valid_1's l2: 0.844227
[2600]	valid_0's rmse: 0.972919	valid_0's l2: 0.946572	valid_1's rmse: 0.917129	valid_1's l2: 0.841126
[2700]	valid_0's rmse: 0.972912	valid_0's l2: 0.946558	valid_1's rmse: 0.915488	valid_1's l2: 0.838118
[2800]	valid_0's rmse: 0.972878	valid_0's l2: 0.946491	valid_1's rmse: 0.913814	valid_1's l2: 0.835056
[2900]	valid_0's rmse: 0.972843	valid_0's l2: 0.946423	valid_1's rmse: 0.912158	valid_1's l2: 0.832033
[3000]	valid_0's rmse: 0.97284	valid_0's l2: 0.946418	valid_1's rmse: 0.910554	valid_1's l2: 0.829109
[3100]	valid_0's rmse: 0.97289	valid_0's l2: 0.946516	valid_1's rmse: 0.908959	valid_1's l2: 0.826207
[3200]	valid_0's rmse: 0.972799	valid_0's l2: 0.946337	valid_1's rmse: 0.907321	valid_1's l2: 0.823232
[3300]	valid_0's rmse: 0.972799	valid_0's l2: 0.946337	valid_1's rmse: 0.905751	valid_1's l2: 0.820384
[3400]	valid_0's rmse: 0.972836	valid_0's l2: 0.94641	valid_1's rmse: 0.904

In [29]:
np.mean(est.cv_scores), np.std(est.cv_scores), est.cv_scores

(0.9640385177560402,
 0.004801121733570834,
 [0.9683171968660389,
  0.963444278622478,
  0.9641447557267104,
  0.9632503426307046,
  0.9582672080224637,
  0.9616439773114717,
  0.9698166517819894,
  0.956287882922335,
  0.9624463158824099,
  0.9727665677938])

In [32]:
joblib.dump(est, 'lgb-480-lve48-k10.pkl')

['lgb-480-lve48-k10.pkl']

In [33]:
x_test = df1[341424:].values
test_preds = est.transform(x_test)

  return np.mean(np.column_stack((est.predict(x) for est in self.fitted_models)), axis=1)


In [34]:
pd.DataFrame(zip(test[unique_id], test_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-480-lve48-k10-test.csv', index=False)

pd.DataFrame(zip(train[unique_id], train_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-480-lve48-k10-train.csv', index=False)