In [1]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

import itertools
import joblib

In [2]:
# Columns kept from the raw train/test data (applied below via `df[columns]`),
# grouped by role. The last entry is the regression target.
columns = [
    # unique id
    u'reservation_id', 
    # dates
    u'booking_date', u'checkin_date', u'checkout_date',
    # traveller related info
    u'memberid', u'numberofadults', u'numberofchildren', u'total_pax', u'persontravellingid', u'member_age_buckets', 
    u'state_code_residence',
    # resort info
    u'resort_id', u'resort_region_code', u'resort_type_code', u'state_code_resort', u'cluster_code',
    # booking info
    u'channel_code', u'booking_type_code', u'room_type_booked_code', u'roomnights', u'season_holidayed_code', 
    u'reservationstatusid_code', 
    # product purchased
    u'main_product_code',
    # target (regression label)
    u'amount_spent_per_room_night_scaled'
]


In [3]:
# Load the labelled training data and the unlabelled test data from the
# working directory and report their shapes.
# print() with a single argument behaves identically on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
train = pd.read_csv('train.csv')
print(train.shape)
test = pd.read_csv('test.csv')
print(test.shape)

(341424, 24)
(146765, 23)


In [4]:
# Stack train on top of test so every feature is computed over both sets at
# once; the combined frame gets a fresh 0..n-1 index.
df = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)
df.shape

(488189, 24)

In [5]:
# Parse the three date columns, which are stored as day/month/two-digit-year text.
for date_col in ['booking_date', 'checkin_date', 'checkout_date']:
    df[date_col] = pd.to_datetime(df[date_col], format='%d/%m/%y')

In [6]:
# Keep only the columns listed in `columns`. The explicit copy makes the result
# an independent frame, so the feature columns assigned to it in later cells
# never write into (or warn about) a slice of the concatenated frame.
df = df[columns].copy()
df.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346


In [7]:
# Days between booking and check-in.
df['booking_in_advance_days'] = (df['checkin_date'] - df['booking_date']).dt.days
# Length of stay in days, derived from the check-in/check-out dates.
df['booking_roomnights'] = (df['checkout_date'] - df['checkin_date']).dt.days
# Head count computed from the adult and child counts.
df['total_persons_travelling'] = df['numberofadults'] + df['numberofchildren']

In [8]:
# roomnights, numberofadults, numberofchildren, total_pax

In [9]:
def _iso_week(dates):
    """Return the ISO week-of-year of a datetime Series on old and new pandas."""
    accessor = dates.dt
    if hasattr(accessor, 'isocalendar'):
        # `.dt.week` is deprecated (and removed in pandas 2.x) in favour of
        # `.dt.isocalendar().week`, which yields a nullable UInt32 column.
        # Cast back to the plain dtypes the old accessor produced: int64, or
        # float64 with NaN when some dates are missing.
        weeks = accessor.isocalendar().week
        return weeks.astype('float64') if weeks.isna().any() else weeks.astype('int64')
    # pandas releases that predate `isocalendar()` only offer `.dt.week`.
    return accessor.week


def create_date_variables(df, date_key, week=False, month=False, year=False, dayofweek=False, dayofmonth=False, dayofyear=False):
    """Add calendar-part columns derived from the datetime column ``date_key``.

    Each enabled flag adds one column named ``<date_key>_<part>``; columns are
    added in the order week, month, year, dayofweek, dayofmonth, dayofyear.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``date_key``. It is modified in place.
    date_key : str
        Name of a datetime column (it must support the ``.dt`` accessor).
    week, month, year, dayofweek, dayofmonth, dayofyear : bool
        Select which parts to extract. ``week`` is the ISO week of the year.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the requested columns added.
    """
    if week:
        df['{}_{}'.format(date_key, 'week')] = _iso_week(df[date_key])

    # (requested?, column suffix, `.dt` attribute), listed in output order.
    simple_parts = [
        (month, 'month', 'month'),
        (year, 'year', 'year'),
        (dayofweek, 'dayofweek', 'dayofweek'),
        (dayofmonth, 'dayofmonth', 'day'),
        (dayofyear, 'dayofyear', 'dayofyear'),
    ]
    for requested, suffix, attribute in simple_parts:
        if requested:
            df['{}_{}'.format(date_key, suffix)] = getattr(df[date_key].dt, attribute)
    return df

def num_div_interactions(df, num_cols):
    """Add one ratio column per unordered pair of ``num_cols``.

    For each pair ``(a, b)``, taken in the order given, a column
    ``a_div_b`` holding ``df[a] / df[b]`` is added. Results that are
    infinite or NaN (for example from a zero denominator) are stored as 0.
    ``df`` is modified in place and also returned.
    """
    undefined_results = [np.inf, -np.inf, np.nan]
    for numerator, denominator in itertools.combinations(num_cols, 2):
        ratio = df[numerator] / df[denominator]
        df["{}_div_{}".format(numerator, denominator)] = ratio.replace(undefined_results, 0)
    return df

def num_dif_interactions(df, num_cols):
    """Add one difference column per unordered pair of ``num_cols``.

    For each pair ``(a, b)``, taken in the order given, a column
    ``a_dif_b`` holding ``df[a] - df[b]`` is added. ``df`` is modified in
    place and also returned.
    """
    for minuend, subtrahend in itertools.combinations(num_cols, 2):
        difference_name = "{}_dif_{}".format(minuend, subtrahend)
        df[difference_name] = df[minuend].sub(df[subtrahend])
    return df
    
def cat_interactions(df, cat_cols):
    """Add one combined-category column per unordered pair of ``cat_cols``.

    For each pair ``(a, b)``, taken in the order given, a column ``a_b`` is
    added whose values are the string forms of the two source values joined by
    an underscore. The name of every created column is printed as progress
    output. ``df`` is modified in place and also returned.
    """
    for col1, col2 in itertools.combinations(cat_cols, 2):
        name = '{}_{}'.format(col1, col2)
        # print() with a single argument works on both Python 2 and Python 3;
        # the bare `print name` statement is a SyntaxError on Python 3.
        print(name)
        df[name] = df[col1].astype(str) + '_' + df[col2].astype(str)
    return df

    
def _as_column_list(cols):
    """Wrap a single column name in a list; turn any other iterable into a list."""
    try:
        string_types = basestring  # Python 2: covers both str and unicode names
    except NameError:
        string_types = str  # Python 3
    return [cols] if isinstance(cols, string_types) else list(cols)


def create_group_variables(df, group_col, num_col):
    """Attach per-group summary statistics of numeric columns to every row.

    For each column in ``num_col`` the mean, max, min, std and median are
    computed within the groups defined by ``group_col`` and merged back onto
    ``df``. The new columns are named ``GRP_<group cols>#<col>#<stat>``, where
    ``<group cols>`` is the group column names joined with ``_``. One more
    column per numeric column, ``...#min_mean``, holds the row value minus its
    group mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_col : str or list of str
        Column name(s) to group by.
    num_col : str or list of str
        Numeric column name(s) to summarise.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` left-merged with the group statistics.
    """
    group_col = _as_column_list(group_col)
    num_col = _as_column_list(num_col)

    # Statistic names double as the pandas aggregation names.
    stats = ['mean', 'max', 'min', 'std', 'median']
    prefix = 'GRP_' + '_'.join(map(str, group_col))

    df_var = df.groupby(group_col)[num_col].agg(stats)
    # Name each output from the (column, statistic) pair pandas reports for it,
    # rather than from an assumed position, so a label always matches its values.
    df_var.columns = ["{}#{}#{}".format(prefix, col, stat) for col, stat in df_var.columns]
    df_var = df_var.reset_index()

    df = df.merge(df_var, how='left', on=group_col)

    for col in num_col:
        df["{}#{}#{}".format(prefix, col, 'min_mean')] = df[col] - df["{}#{}#{}".format(prefix, col, 'mean')]

    return df


In [10]:
# Calendar features: every available part for the check-in date, but only a
# subset for the check-out date (week, day of week) and the booking date
# (week, day of year).
df = create_date_variables(df, 'checkin_date',  week=True, month=True, year=True, dayofweek=True, dayofmonth=True, dayofyear=True)
df = create_date_variables(df, 'checkout_date',  week=True, dayofweek=True)
df = create_date_variables(df, 'booking_date',  week=True, dayofyear=True)


# df = create_date_variables(df, 'checkout_date')
# df = create_date_variables(df, 'booking_date')

In [11]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_dayofyear'],
      dtype='object')

In [12]:
df.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled,booking_in_advance_days,booking_roomnights,total_persons_travelling,checkin_date_week,checkin_date_month,checkin_date_year,checkin_date_dayofweek,checkin_date_dayofmonth,checkin_date_dayofyear,checkout_date_week,checkout_date_dayofweek,booking_date_week,booking_date_dayofyear
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428,0,1,2,14,4,2018,3,5,95,14,4,14,95
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563,78,5,2,15,4,2015,5,11,101,16,3,4,23
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602,4,4,2,5,2,2015,6,1,32,6,3,5,28
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943,40,5,4,24,6,2015,3,11,162,25,1,18,122
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346,103,5,2,51,12,2015,0,14,348,51,5,36,245


In [13]:
# Column groups to cross; cat_interactions adds one combined column for every
# pair inside a group. The groups are processed in the order listed.
interaction_groups = [
    # resort x dates / member / states
    ['resort_id', 'checkin_date'],
    ['resort_id', 'checkout_date'],
    ['resort_id', 'booking_date'],
    ['resort_id', 'memberid'],
    ['resort_id', 'state_code_resort'],
    ['resort_id', 'state_code_residence'],
    # all pairs among traveller / state attributes
    ['persontravellingid', 'member_age_buckets', 'state_code_residence', 'state_code_resort'],
    # member x booking attributes
    ['memberid', 'channel_code'],
    ['memberid', 'booking_type_code'],
    ['memberid', 'reservationstatusid_code'],
    ['memberid', 'resort_type_code'],
    ['memberid', 'cluster_code'],
    ['memberid', 'room_type_booked_code'],
    # member x dates
    ['memberid', 'checkin_date'],
    ['memberid', 'checkout_date'],
    ['memberid', 'booking_date'],
    # member x calendar parts
    ['memberid', 'checkin_date_month'],
    ['memberid', 'checkin_date_week'],
    ['memberid', 'checkin_date_dayofweek'],
    ['memberid', 'checkout_date_week'],
    ['memberid', 'checkout_date_dayofweek'],
    # resort / member x length of stay
    ['resort_id', 'booking_roomnights'],
    ['memberid', 'booking_roomnights'],
]

# Currently disabled pairs:
#   resort_id x booking_type_code, channel_code, season_holidayed_code, member_age_buckets
#   checkin_date x season_holidayed_code
#   resort_id x checkin_date_week, checkin_date_dayofweek, checkout_date_week, checkout_date_dayofweek

for interaction_group in interaction_groups:
    df = cat_interactions(df, interaction_group)



resort_id_checkin_date
resort_id_checkout_date
resort_id_booking_date
resort_id_memberid
resort_id_state_code_resort
resort_id_state_code_residence
persontravellingid_member_age_buckets
persontravellingid_state_code_residence
persontravellingid_state_code_resort
member_age_buckets_state_code_residence
member_age_buckets_state_code_resort
state_code_residence_state_code_resort
memberid_channel_code
memberid_booking_type_code
memberid_reservationstatusid_code
memberid_resort_type_code
memberid_cluster_code
memberid_room_type_booked_code
memberid_checkin_date
memberid_checkout_date
memberid_booking_date
memberid_checkin_date_month
memberid_checkin_date_week
memberid_checkin_date_dayofweek
memberid_checkout_date_week
memberid_checkout_date_dayofweek
resort_id_booking_roomnights
memberid_booking_roomnights


In [14]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_dayofyear', u'resort_id_checkin_date',
       u'resort_id_checkout_date', u'resort_

In [15]:
# Pairwise ratios between the count and duration columns; inside
# num_div_interactions, infinite/NaN ratios (e.g. zero denominators) become 0.
df = num_div_interactions(df, num_cols=['numberofadults', 'numberofchildren','total_pax',  
                                        'roomnights', 'booking_in_advance_days', 'booking_roomnights', 
                                        'total_persons_travelling'])

# Differences between the reported and the date-derived length of stay, and
# between the computed head count and total_pax.
df = num_dif_interactions(df, num_cols=[ 'roomnights', 'booking_roomnights'])
df = num_dif_interactions(df, num_cols=[ 'total_persons_travelling', 'total_pax'])
# NOTE(review): these are differences of week-of-year numbers, so a pair of
# dates that straddles a year boundary does not yield a true week gap.
df = num_dif_interactions(df, num_cols=[ 'checkout_date_week', 'checkin_date_week', 'booking_date_week'])
df.shape
                                        

(488189, 91)

In [16]:
df.shape
df.head()

(488189, 91)

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled,booking_in_advance_days,booking_roomnights,total_persons_travelling,checkin_date_week,checkin_date_month,checkin_date_year,checkin_date_dayofweek,checkin_date_dayofmonth,checkin_date_dayofyear,checkout_date_week,checkout_date_dayofweek,booking_date_week,booking_date_dayofyear,resort_id_checkin_date,resort_id_checkout_date,resort_id_booking_date,resort_id_memberid,resort_id_state_code_resort,resort_id_state_code_residence,persontravellingid_member_age_buckets,persontravellingid_state_code_residence,persontravellingid_state_code_resort,member_age_buckets_state_code_residence,member_age_buckets_state_code_resort,state_code_residence_state_code_resort,memberid_channel_code,memberid_booking_type_code,memberid_reservationstatusid_code,memberid_resort_type_code,memberid_cluster_code,memberid_room_type_booked_code,memberid_checkin_date,memberid_checkout_date,memberid_booking_date,memberid_checkin_date_month,memberid_checkin_date_week,memberid_checkin_date_dayofweek,memberid_checkout_date_week,memberid_checkout_date_dayofweek,resort_id_booking_roomnights,memberid_booking_roomnights,numberofadults_div_numberofchildren,numberofadults_div_total_pax,numberofadults_div_roomnights,numberofadults_div_booking_in_advance_days,numberofadults_div_booking_roomnights,numberofadults_div_total_persons_travelling,numberofchildren_div_total_pax,numberofchildren_div_roomnights,numberofchildren_div_booking_in_advance_days,numberofchildren_div_booking_roomnights,numberofchildren_div_total_persons_travelling,total_pax_div_roomnights,total_pax_div_booking_in_advance_days,total_pax_div_bo
oking_roomnights,total_pax_div_total_persons_travelling,roomnights_div_booking_in_advance_days,roomnights_div_booking_roomnights,roomnights_div_total_persons_travelling,booking_in_advance_days_div_booking_roomnights,booking_in_advance_days_div_total_persons_travelling,booking_roomnights_div_total_persons_travelling,roomnights_dif_booking_roomnights,total_persons_travelling_dif_total_pax,checkout_date_week_dif_checkin_date_week,checkout_date_week_dif_booking_date_week,checkin_date_week_dif_booking_date_week
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428,0,1,2,14,4,2018,3,5,95,14,4,14,95,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,46_F,46_7.0,46_3,F_7.0,F_3,7.0_3,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,0.0,0.666667,2.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,1.5,0.0,1.0,0.5,0.0,0.0,0.5,0,-1,0,0,0
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563,78,5,2,15,4,2015,5,11,101,16,3,4,23,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,46_F,46_7.0,46_5,F_7.0,F_5,7.0_5,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,0.0,1.0,0.4,0.025641,0.4,1.0,0.0,0.0,0.0,0.0,0.0,0.4,0.025641,0.4,1.0,0.064103,1.0,2.5,15.6,39.0,2.5,0,0,1,12,11
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602,4,4,2,5,2,2015,6,1,32,6,3,5,28,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,47_F,47_7.0,47_1,F_7.0,F_1,7.0_1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,0.0,1.0,0.5,0.5,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,1.0,1.0,1.0,2.0,1.0,2.0,2.0,0,0,1,1,0
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943,40,5,4,24,6,2015,3,11,162,25,1,18,122,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,46_2,F_7.0,F_2,7.0_2,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,1.0,1.0,0.4,0.05,0.4,0.5,1.0,0.4,0.05,0.4,0.5,0.4,0.05,0.4,0.5,0.125,1.0,1.25,8.0,10.0,1.25,0,2,1,7,6
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346,103,5,2,51,12,2015,0,14,348,51,5,36,245,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,46_2,F_7.0,F_2,7.0_2,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,0.0,1.0,0.4,0.019417,0.4,1.0,0.0,0.0,0.0,0.0,0.0,0.4,0.019417,0.4,1.0,0.048544,1.0,2.5,20.6,51.5,2.5,0,0,0,15,15


In [17]:
# Column roles used by the encoding and training cells below.

# Row identifier; dropped before the feature matrix is built.
unique_id = 'reservation_id'

# Empty: the raw date columns are listed in cat_cols instead of being dropped.
date_cols = []

# Regression target; dropped from the feature frame and read from `train`.
target_col = 'amount_spent_per_room_night_scaled'

# Columns handed to the frequency encoder as categorical: the raw categorical
# codes, the three raw date columns, and every combined column created by
# cat_interactions.
cat_cols = [
    'memberid',
    'persontravellingid', 'member_age_buckets', 'state_code_residence', 
    'resort_id', 'resort_region_code', 'resort_type_code', 'state_code_resort', 'cluster_code', 'channel_code',
    'booking_type_code', 'room_type_booked_code', 'season_holidayed_code', 'reservationstatusid_code', 'main_product_code',
    u'checkin_date', u'checkout_date', u'booking_date',
    
u'resort_id_checkin_date',
u'resort_id_checkout_date', u'resort_id_booking_date',
u'resort_id_memberid', u'resort_id_state_code_resort',
u'resort_id_state_code_residence',
u'persontravellingid_member_age_buckets',
u'persontravellingid_state_code_residence',
u'persontravellingid_state_code_resort',
u'member_age_buckets_state_code_residence',
u'member_age_buckets_state_code_resort',
u'state_code_residence_state_code_resort', u'memberid_channel_code',
u'memberid_booking_type_code', u'memberid_reservationstatusid_code',
u'memberid_resort_type_code', u'memberid_cluster_code',
u'memberid_room_type_booked_code', u'memberid_checkin_date',
u'memberid_checkout_date', u'memberid_booking_date',
u'memberid_checkin_date_month', u'memberid_checkin_date_week',
u'memberid_checkin_date_dayofweek', u'memberid_checkout_date_week',
u'memberid_checkout_date_dayofweek', u'resort_id_booking_roomnights',
u'memberid_booking_roomnights'
    
]

# Extra columns to exclude from the feature matrix (none at present).
columns_to_drop = []


In [18]:
from ml_modules.encoding import FreqeuncyEncoding

In [19]:
fE = FreqeuncyEncoding(categorical_columns=cat_cols, return_df=True)

In [20]:
%%time
df1 = fE.fit_transform(df)

CPU times: user 1min 35s, sys: 1min 35s, total: 3min 11s
Wall time: 49.6 s


In [21]:
df1.shape

(488189, 91)

In [22]:
# Remove the id, the target and any columns named in date_cols / columns_to_drop,
# leaving only model features. The drop is in place, so re-running this cell
# alone fails once the columns are already gone.
df1.drop(columns=date_cols +[unique_id, target_col] + columns_to_drop, inplace=True)

In [23]:
# `df` was built as train rows followed by test rows with a fresh 0..n-1 index,
# so the first len(train) rows of the encoded frame are the training rows.
# Deriving the split point from `train` keeps it correct if the input changes,
# instead of hard-coding the row count.
# NOTE(review): assumes fE.fit_transform preserves row order - confirm.
n_train = len(train)
x_train, y_train = df1[:n_train].values, train[target_col].values
x_train.shape, y_train.shape

((341424, 89), (341424,))

In [24]:
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

In [25]:
from ml_modules.custom_estimator import Estimator
from ml_modules.custom_fold_generator import FoldScheme

In [26]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of scikit-learn's ``mean_squared_error``;
    passed to ``Estimator`` below as its ``scoring_metric``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

#### LGBM NumLeaves 48, k10

In [27]:
# LightGBM regressor (48 leaves) cross-validated with a 10-split KFold scheme.
# n_estimators is only an upper bound: early stopping after 200 rounds without
# improvement decides the number of trees actually kept in each fold.
est = Estimator(
    model=LGBMRegressor(
        boosting_type='gbdt',
        n_estimators=20000,
        learning_rate=0.01,
        num_leaves=48,
        min_child_weight=150,
        colsample_bytree=0.5000000000000001,
        subsample=1.0
    ),
    task_type='regression',
    validation_scheme=FoldScheme.KFold,
    n_splits=10,
    random_state=50,
    early_stopping_rounds=200,
    eval_metric='rmse',
    scoring_metric=rmse
)

In [28]:
# Cross-validated fit on the training matrix; the per-fold LightGBM log is printed below.
# NOTE(review): train_preds is presumably the out-of-fold prediction for each training
# row — confirm in ml_modules.custom_estimator.
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.0172	valid_0's l2: 1.03469	valid_1's rmse: 1.00959	valid_1's l2: 1.01927
[200]	valid_0's rmse: 0.994958	valid_0's l2: 0.989942	valid_1's rmse: 0.986609	valid_1's l2: 0.973397
[300]	valid_0's rmse: 0.985467	valid_0's l2: 0.971144	valid_1's rmse: 0.976363	valid_1's l2: 0.953285
[400]	valid_0's rmse: 0.980383	valid_0's l2: 0.96115	valid_1's rmse: 0.970411	valid_1's l2: 0.941698
[500]	valid_0's rmse: 0.977342	valid_0's l2: 0.955198	valid_1's rmse: 0.966401	valid_1's l2: 0.933931
[600]	valid_0's rmse: 0.975206	valid_0's l2: 0.951026	valid_1's rmse: 0.963286	valid_1's l2: 0.927919
[700]	valid_0's rmse: 0.973741	valid_0's l2: 0.948171	valid_1's rmse: 0.960718	valid_1's l2: 0.92298
[800]	valid_0's rmse: 0.972635	valid_0's l2: 0.94602	valid_1's rmse: 0.958511	valid_1's l2: 0.918744
[900]	valid_0's rmse: 0.971971	valid_0's l2: 0.944727	valid_1's rmse: 0.956613	valid_1's l2: 0.915109
[1000]	valid_0's rmse: 0.97

[700]	valid_0's rmse: 0.968898	valid_0's l2: 0.938763	valid_1's rmse: 0.961203	valid_1's l2: 0.923912
[800]	valid_0's rmse: 0.968014	valid_0's l2: 0.937052	valid_1's rmse: 0.958989	valid_1's l2: 0.919661
[900]	valid_0's rmse: 0.967443	valid_0's l2: 0.935947	valid_1's rmse: 0.957062	valid_1's l2: 0.915967
[1000]	valid_0's rmse: 0.967011	valid_0's l2: 0.935111	valid_1's rmse: 0.955332	valid_1's l2: 0.912659
[1100]	valid_0's rmse: 0.966643	valid_0's l2: 0.934399	valid_1's rmse: 0.953737	valid_1's l2: 0.909615
[1200]	valid_0's rmse: 0.966387	valid_0's l2: 0.933904	valid_1's rmse: 0.95225	valid_1's l2: 0.906781
[1300]	valid_0's rmse: 0.966137	valid_0's l2: 0.93342	valid_1's rmse: 0.950836	valid_1's l2: 0.904089
[1400]	valid_0's rmse: 0.965956	valid_0's l2: 0.933072	valid_1's rmse: 0.949505	valid_1's l2: 0.90156
[1500]	valid_0's rmse: 0.965816	valid_0's l2: 0.9328	valid_1's rmse: 0.948206	valid_1's l2: 0.899094
[1600]	valid_0's rmse: 0.965704	valid_0's l2: 0.932585	valid_1's rmse: 0.946916	v

[1200]	valid_0's rmse: 0.960645	valid_0's l2: 0.922838	valid_1's rmse: 0.952903	valid_1's l2: 0.908023
[1300]	valid_0's rmse: 0.960511	valid_0's l2: 0.92258	valid_1's rmse: 0.951517	valid_1's l2: 0.905384
[1400]	valid_0's rmse: 0.960425	valid_0's l2: 0.922416	valid_1's rmse: 0.950166	valid_1's l2: 0.902816
[1500]	valid_0's rmse: 0.960368	valid_0's l2: 0.922307	valid_1's rmse: 0.948868	valid_1's l2: 0.90035
[1600]	valid_0's rmse: 0.960305	valid_0's l2: 0.922186	valid_1's rmse: 0.947603	valid_1's l2: 0.897951
[1700]	valid_0's rmse: 0.960211	valid_0's l2: 0.922005	valid_1's rmse: 0.946355	valid_1's l2: 0.895588
[1800]	valid_0's rmse: 0.960116	valid_0's l2: 0.921822	valid_1's rmse: 0.945144	valid_1's l2: 0.893297
[1900]	valid_0's rmse: 0.96006	valid_0's l2: 0.921715	valid_1's rmse: 0.943964	valid_1's l2: 0.891068
[2000]	valid_0's rmse: 0.959995	valid_0's l2: 0.92159	valid_1's rmse: 0.942789	valid_1's l2: 0.888851
[2100]	valid_0's rmse: 0.959974	valid_0's l2: 0.921551	valid_1's rmse: 0.9416

Early stopping, best iteration is:
[2448]	valid_0's rmse: 0.971296	valid_0's l2: 0.943416	valid_1's rmse: 0.936404	valid_1's l2: 0.876853
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.00467	valid_0's l2: 1.00937	valid_1's rmse: 1.01098	valid_1's l2: 1.02208
[200]	valid_0's rmse: 0.982192	valid_0's l2: 0.964702	valid_1's rmse: 0.987897	valid_1's l2: 0.97594
[300]	valid_0's rmse: 0.972865	valid_0's l2: 0.946467	valid_1's rmse: 0.977583	valid_1's l2: 0.955668
[400]	valid_0's rmse: 0.967945	valid_0's l2: 0.936918	valid_1's rmse: 0.971664	valid_1's l2: 0.944131
[500]	valid_0's rmse: 0.965141	valid_0's l2: 0.931497	valid_1's rmse: 0.967626	valid_1's l2: 0.9363
[600]	valid_0's rmse: 0.963355	valid_0's l2: 0.928052	valid_1's rmse: 0.964514	valid_1's l2: 0.930286
[700]	valid_0's rmse: 0.962068	valid_0's l2: 0.925575	valid_1's rmse: 0.961945	valid_1's l2: 0.925339
[800]	valid_0's rmse: 0.961212	valid_0's l2: 0.923928	valid_1's rmse: 0.959721	valid_1's l2:

[800]	valid_0's rmse: 0.977311	valid_0's l2: 0.955136	valid_1's rmse: 0.958002	valid_1's l2: 0.917768
[900]	valid_0's rmse: 0.976709	valid_0's l2: 0.95396	valid_1's rmse: 0.956079	valid_1's l2: 0.914087
[1000]	valid_0's rmse: 0.976267	valid_0's l2: 0.953097	valid_1's rmse: 0.954306	valid_1's l2: 0.910699
[1100]	valid_0's rmse: 0.975902	valid_0's l2: 0.952385	valid_1's rmse: 0.952721	valid_1's l2: 0.907677
[1200]	valid_0's rmse: 0.975659	valid_0's l2: 0.95191	valid_1's rmse: 0.951259	valid_1's l2: 0.904894
[1300]	valid_0's rmse: 0.975452	valid_0's l2: 0.951507	valid_1's rmse: 0.949861	valid_1's l2: 0.902237
[1400]	valid_0's rmse: 0.975373	valid_0's l2: 0.951352	valid_1's rmse: 0.948547	valid_1's l2: 0.899741
[1500]	valid_0's rmse: 0.975269	valid_0's l2: 0.951149	valid_1's rmse: 0.947262	valid_1's l2: 0.897305
[1600]	valid_0's rmse: 0.975185	valid_0's l2: 0.950985	valid_1's rmse: 0.946005	valid_1's l2: 0.894925
[1700]	valid_0's rmse: 0.975116	valid_0's l2: 0.950852	valid_1's rmse: 0.9447

In [30]:
# Cross-validation summary: mean score, its standard deviation, then the per-fold values.
fold_scores = est.cv_scores
np.mean(fold_scores), np.std(fold_scores), fold_scores

(0.9655818868688781,
 0.0047812399925004885,
 [0.9692755255855224,
  0.9659458280360866,
  0.9647994605902984,
  0.9643063826580002,
  0.9597187907926836,
  0.9624845363925991,
  0.971295818064223,
  0.9584657444938799,
  0.9647454176003509,
  0.9747813644751356])

In [29]:
# Persist the fitted Estimator object to disk with joblib (path is relative to the
# notebook's working directory).
joblib.dump(est, 'lgb-91-lve48-k10.pkl')

['lgb-91-lve48-k10.pkl']

In [31]:
# Test rows are everything after the first len(train) rows of df1 (df was built as
# concat([train, test]) with a fresh index); slice by length, not by a literal count.
x_test = df1.iloc[len(train):].values
assert len(x_test) == len(test), "test slice of df1 does not match the test set size"
test_preds = est.transform(x_test)

  return np.mean(np.column_stack((est.predict(x) for est in self.fitted_models)), axis=1)


In [32]:
# Export predictions keyed by reservation id: one CSV for the test rows and one for
# the training rows (train_preds is whatever est.fit_transform returned).
output_columns = ['reservation_id', 'amount_spent_per_room_night_scaled']

test_frame = pd.DataFrame(zip(test[unique_id], test_preds), columns=output_columns)
test_frame.to_csv('lgb-91-lve48-k10-test.csv', index=False)

train_frame = pd.DataFrame(zip(train[unique_id], train_preds), columns=output_columns)
train_frame.to_csv('lgb-91-lve48-k10-train.csv', index=False)

#### lgbm numleaves48 k5 

In [35]:
# Same LightGBM configuration (48 leaves) as the 10-fold run, but n_splits is not
# passed, so the Estimator's default fold count applies (the recorded cv_scores
# for this run hold five values).
est = Estimator(
    model=LGBMRegressor(
        boosting_type='gbdt',
        n_estimators=20000,
        learning_rate=0.01,
        num_leaves=48,
        min_child_weight=150,
        colsample_bytree=0.5000000000000001,
        subsample=1.0
    ),
    task_type='regression',
    validation_scheme=FoldScheme.KFold,
    random_state=50,
    early_stopping_rounds=200,
    eval_metric='rmse',
    scoring_metric=rmse
)

In [36]:
# Cross-validated fit on the training matrix; the per-fold LightGBM log is printed below.
# NOTE(review): train_preds is presumably the out-of-fold prediction for each training
# row — confirm in ml_modules.custom_estimator.
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.01443	valid_0's l2: 1.02906	valid_1's rmse: 1.00936	valid_1's l2: 1.01881
[200]	valid_0's rmse: 0.992306	valid_0's l2: 0.984672	valid_1's rmse: 0.986171	valid_1's l2: 0.972534
[300]	valid_0's rmse: 0.982952	valid_0's l2: 0.966194	valid_1's rmse: 0.975849	valid_1's l2: 0.952281
[400]	valid_0's rmse: 0.978065	valid_0's l2: 0.956612	valid_1's rmse: 0.969779	valid_1's l2: 0.940471
[500]	valid_0's rmse: 0.975171	valid_0's l2: 0.950959	valid_1's rmse: 0.965634	valid_1's l2: 0.932448
[600]	valid_0's rmse: 0.973256	valid_0's l2: 0.947228	valid_1's rmse: 0.962384	valid_1's l2: 0.926182
[700]	valid_0's rmse: 0.971878	valid_0's l2: 0.944547	valid_1's rmse: 0.959682	valid_1's l2: 0.920989
[800]	valid_0's rmse: 0.970975	valid_0's l2: 0.942793	valid_1's rmse: 0.957409	valid_1's l2: 0.916632
[900]	valid_0's rmse: 0.970322	valid_0's l2: 0.941526	valid_1's rmse: 0.955334	valid_1's l2: 0.912663
[1000]	valid_0's rmse: 

[2000]	valid_0's rmse: 0.961808	valid_0's l2: 0.925074	valid_1's rmse: 0.94076	valid_1's l2: 0.885029
[2100]	valid_0's rmse: 0.961754	valid_0's l2: 0.92497	valid_1's rmse: 0.939497	valid_1's l2: 0.882654
[2200]	valid_0's rmse: 0.961731	valid_0's l2: 0.924926	valid_1's rmse: 0.938223	valid_1's l2: 0.880262
[2300]	valid_0's rmse: 0.961713	valid_0's l2: 0.924892	valid_1's rmse: 0.936985	valid_1's l2: 0.877941
[2400]	valid_0's rmse: 0.961702	valid_0's l2: 0.924871	valid_1's rmse: 0.935781	valid_1's l2: 0.875686
[2500]	valid_0's rmse: 0.961675	valid_0's l2: 0.924818	valid_1's rmse: 0.934603	valid_1's l2: 0.873483
[2600]	valid_0's rmse: 0.961672	valid_0's l2: 0.924813	valid_1's rmse: 0.933432	valid_1's l2: 0.871294
[2700]	valid_0's rmse: 0.961663	valid_0's l2: 0.924796	valid_1's rmse: 0.932253	valid_1's l2: 0.869095
[2800]	valid_0's rmse: 0.96169	valid_0's l2: 0.924847	valid_1's rmse: 0.931069	valid_1's l2: 0.86689
Early stopping, best iteration is:
[2634]	valid_0's rmse: 0.96166	valid_0's l

In [37]:
# Cross-validation summary: mean score, its standard deviation, then the per-fold values.
fold_scores = est.cv_scores
np.mean(fold_scores), np.std(fold_scores), fold_scores

(0.9661848818772618,
 0.002862755969325536,
 [0.9682706106996097,
  0.9654206164421347,
  0.9616601120476985,
  0.9655019821627868,
  0.9700710880340793])

In [38]:
# Persist the fitted Estimator object to disk with joblib (path is relative to the
# notebook's working directory).
joblib.dump(est, 'lgb-91-lve48-k5.pkl')

['lgb-91-lve48-k5.pkl']

In [39]:
# Test rows are everything after the first len(train) rows of df1 (df was built as
# concat([train, test]) with a fresh index); slice by length, not by a literal count.
x_test = df1.iloc[len(train):].values
assert len(x_test) == len(test), "test slice of df1 does not match the test set size"
test_preds = est.transform(x_test)

In [40]:
# Export predictions keyed by reservation id: one CSV for the test rows and one for
# the training rows (train_preds is whatever est.fit_transform returned).
output_columns = ['reservation_id', 'amount_spent_per_room_night_scaled']

test_frame = pd.DataFrame(zip(test[unique_id], test_preds), columns=output_columns)
test_frame.to_csv('lgb-91-lve48-k5-test.csv', index=False)

train_frame = pd.DataFrame(zip(train[unique_id], train_preds), columns=output_columns)
train_frame.to_csv('lgb-91-lve48-k5-train.csv', index=False)

#### lgbm numleaves 100 k5 

In [42]:
# LightGBM regressor with larger trees (100 leaves); every other setting matches the
# 48-leaf runs. n_splits is not passed, so the Estimator's default fold count applies
# (the recorded cv_scores for this run hold five values).
est = Estimator(
    model=LGBMRegressor(
        boosting_type='gbdt',
        n_estimators=20000,
        learning_rate=0.01,
        num_leaves=100,
        min_child_weight=150,
        colsample_bytree=0.5000000000000001,
        subsample=1.0
    ),
    task_type='regression',
    validation_scheme=FoldScheme.KFold,
    random_state=50,
    early_stopping_rounds=200,
    eval_metric='rmse',
    scoring_metric=rmse
)

In [43]:
# Cross-validated fit on the training matrix; the per-fold LightGBM log is printed below.
# NOTE(review): train_preds is presumably the out-of-fold prediction for each training
# row — confirm in ml_modules.custom_estimator.
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.00948	valid_0's l2: 1.01904	valid_1's rmse: 1.00318	valid_1's l2: 1.00637
[200]	valid_0's rmse: 0.987674	valid_0's l2: 0.9755	valid_1's rmse: 0.978845	valid_1's l2: 0.958138
[300]	valid_0's rmse: 0.97921	valid_0's l2: 0.958853	valid_1's rmse: 0.96771	valid_1's l2: 0.936462
[400]	valid_0's rmse: 0.974947	valid_0's l2: 0.950521	valid_1's rmse: 0.960598	valid_1's l2: 0.922748
[500]	valid_0's rmse: 0.972536	valid_0's l2: 0.945827	valid_1's rmse: 0.955261	valid_1's l2: 0.912523
[600]	valid_0's rmse: 0.971022	valid_0's l2: 0.942883	valid_1's rmse: 0.950758	valid_1's l2: 0.90394
[700]	valid_0's rmse: 0.970046	valid_0's l2: 0.940989	valid_1's rmse: 0.946805	valid_1's l2: 0.89644
[800]	valid_0's rmse: 0.969398	valid_0's l2: 0.939733	valid_1's rmse: 0.943235	valid_1's l2: 0.889692
[900]	valid_0's rmse: 0.969003	valid_0's l2: 0.938967	valid_1's rmse: 0.94	valid_1's l2: 0.8836
[1000]	valid_0's rmse: 0.968734	val

[1300]	valid_0's rmse: 0.965702	valid_0's l2: 0.932581	valid_1's rmse: 0.929414	valid_1's l2: 0.863811
[1400]	valid_0's rmse: 0.96567	valid_0's l2: 0.932519	valid_1's rmse: 0.92689	valid_1's l2: 0.859125
[1500]	valid_0's rmse: 0.965643	valid_0's l2: 0.932467	valid_1's rmse: 0.924426	valid_1's l2: 0.854563
[1600]	valid_0's rmse: 0.965622	valid_0's l2: 0.932426	valid_1's rmse: 0.921989	valid_1's l2: 0.850064
[1700]	valid_0's rmse: 0.965584	valid_0's l2: 0.932353	valid_1's rmse: 0.919572	valid_1's l2: 0.845613
[1800]	valid_0's rmse: 0.965636	valid_0's l2: 0.932452	valid_1's rmse: 0.9172	valid_1's l2: 0.841256
Early stopping, best iteration is:
[1695]	valid_0's rmse: 0.965576	valid_0's l2: 0.932337	valid_1's rmse: 0.919693	valid_1's l2: 0.845835
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.01019	valid_0's l2: 1.02049	valid_1's rmse: 1.00294	valid_1's l2: 1.00589
[200]	valid_0's rmse: 0.988915	valid_0's l2: 0.977953	valid_1's rmse: 0.978427	valid_1'

In [44]:
# Cross-validation summary: mean score, its standard deviation, then the per-fold values.
fold_scores = est.cv_scores
np.mean(fold_scores), np.std(fold_scores), fold_scores

(0.9662415813819818,
 0.0027990733128842944,
 [0.9681611726571758,
  0.9655956322334376,
  0.9617876315483747,
  0.9655759506255693,
  0.9700875198453517])

In [45]:
# Persist the fitted Estimator object to disk with joblib (path is relative to the
# notebook's working directory).
joblib.dump(est, 'lgb-91-lve100-k5.pkl')

['lgb-91-lve100-k5.pkl']

In [46]:
# Test rows are everything after the first len(train) rows of df1 (df was built as
# concat([train, test]) with a fresh index); slice by length, not by a literal count.
x_test = df1.iloc[len(train):].values
assert len(x_test) == len(test), "test slice of df1 does not match the test set size"
test_preds = est.transform(x_test)

In [47]:
# Export predictions keyed by reservation id: one CSV for the test rows and one for
# the training rows (train_preds is whatever est.fit_transform returned).
output_columns = ['reservation_id', 'amount_spent_per_room_night_scaled']

test_frame = pd.DataFrame(zip(test[unique_id], test_preds), columns=output_columns)
test_frame.to_csv('lgb-91-lve100-k5-test.csv', index=False)

train_frame = pd.DataFrame(zip(train[unique_id], train_preds), columns=output_columns)
train_frame.to_csv('lgb-91-lve100-k5-train.csv', index=False)

#### xgb k5

In [39]:
# XGBoost regressor under the same KFold / early-stopping harness as the LightGBM runs.
# n_splits is not passed, so the Estimator's default fold count applies (the recorded
# cv_scores for this run hold five values, matching the 'k5' output file names).
est = Estimator(
    model=XGBRegressor(
        n_estimators=10000,
        learning_rate=0.01,
        max_depth=6,
        gamma=0.5,
        min_child_weight=150,
        colsample_bytree=0.5000000000000001,
        subsample=1.0
    ),
    task_type='regression',
    validation_scheme=FoldScheme.KFold,
    random_state=50,
    early_stopping_rounds=200,
    eval_metric='rmse',
    scoring_metric=rmse
)

In [40]:
# Cross-validated fit on the training matrix; the per-fold XGBoost log is printed below.
# NOTE(review): train_preds is presumably the out-of-fold prediction for each training
# row — confirm in ml_modules.custom_estimator.
train_preds = est.fit_transform(x_train, y_train)

[0]	validation_0-rmse:7.23308
Will train until validation_0-rmse hasn't improved in 200 rounds.
[100]	validation_0-rmse:2.8076
[200]	validation_0-rmse:1.38045
[300]	validation_0-rmse:1.04542
[400]	validation_0-rmse:0.988587
[500]	validation_0-rmse:0.978535
[600]	validation_0-rmse:0.975618
[700]	validation_0-rmse:0.97392
[800]	validation_0-rmse:0.972673
[900]	validation_0-rmse:0.971804
[1000]	validation_0-rmse:0.971125
[1100]	validation_0-rmse:0.970635
[1200]	validation_0-rmse:0.970305
[1300]	validation_0-rmse:0.970036
[1400]	validation_0-rmse:0.969814
[1500]	validation_0-rmse:0.969633
[1600]	validation_0-rmse:0.96946
[1700]	validation_0-rmse:0.969302
[1800]	validation_0-rmse:0.969187
[1900]	validation_0-rmse:0.969093
[2000]	validation_0-rmse:0.968978
[2100]	validation_0-rmse:0.968889
[2200]	validation_0-rmse:0.968829
[2300]	validation_0-rmse:0.968756
[2400]	validation_0-rmse:0.968714
[2500]	validation_0-rmse:0.968675
[2600]	validation_0-rmse:0.96862
[2700]	validation_0-rmse:0.968588
[2

In [48]:
# Cross-validation summary: mean score, its standard deviation, then the per-fold values.
fold_scores = est.cv_scores
np.mean(fold_scores), np.std(fold_scores), fold_scores

(0.9662086294035678,
 0.002893112726706158,
 [0.9683551144881409,
  0.9653390830621554,
  0.9616679159836808,
  0.9655445630994086,
  0.9701364703844536])

In [55]:
# joblib.dump(est, 'xgb-91-k5.pkl')

In [49]:
# Test rows are everything after the first len(train) rows of df1 (df was built as
# concat([train, test]) with a fresh index); slice by length, not by a literal count.
x_test = df1.iloc[len(train):].values
assert len(x_test) == len(test), "test slice of df1 does not match the test set size"
test_preds = est.transform(x_test)

  return np.mean(np.column_stack((est.predict(x) for est in self.fitted_models)), axis=1)


In [50]:
# Export predictions keyed by reservation id: one CSV for the test rows and one for
# the training rows (train_preds is whatever est.fit_transform returned).
output_columns = ['reservation_id', 'amount_spent_per_room_night_scaled']

test_frame = pd.DataFrame(zip(test[unique_id], test_preds), columns=output_columns)
test_frame.to_csv('xgb-91-k5-test.csv', index=False)

train_frame = pd.DataFrame(zip(train[unique_id], train_preds), columns=output_columns)
train_frame.to_csv('xgb-91-k5-train.csv', index=False)