#### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from xgboost import XGBRegressor
from xgboost import plot_importance

def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

import time

#### Reading Data

In [2]:
train=pd.read_csv('train.csv',parse_dates=['week'])
test=pd.read_csv('test.csv',parse_dates=['week'])
sub=pd.read_csv('sub.csv')

In [3]:
train.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
0,1,2011-01-17,8091,216418,99.0375,111.8625,0,0,20
1,2,2011-01-17,8091,216419,99.0375,99.0375,0,0,28
2,3,2011-01-17,8091,216425,133.95,133.95,0,0,19
3,4,2011-01-17,8091,216233,133.95,133.95,0,0,44
4,5,2011-01-17,8091,217390,141.075,141.075,0,0,52


In [4]:
train.tail()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
150145,212638,2013-07-09,9984,223245,235.8375,235.8375,0,0,38
150146,212639,2013-07-09,9984,223153,235.8375,235.8375,0,0,30
150147,212642,2013-07-09,9984,245338,357.675,483.7875,1,1,31
150148,212643,2013-07-09,9984,547934,141.7875,191.6625,0,1,12
150149,212644,2013-07-09,9984,679023,234.4125,234.4125,0,0,15


In [5]:
test.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku
0,212645,2013-07-16,8091,216418,108.3,108.3,0,0
1,212646,2013-07-16,8091,216419,109.0125,109.0125,0,0
2,212647,2013-07-16,8091,216425,133.95,133.95,0,0
3,212648,2013-07-16,8091,216233,133.95,133.95,0,0
4,212649,2013-07-16,8091,217390,176.7,176.7,0,0


In [6]:
test.tail()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku
13855,232281,2013-10-01,9984,223245,241.5375,241.5375,0,0
13856,232282,2013-10-01,9984,223153,240.825,240.825,0,0
13857,232285,2013-10-01,9984,245338,382.6125,401.85,1,1
13858,232286,2013-10-01,9984,547934,191.6625,191.6625,0,0
13859,232287,2013-10-01,9984,679023,234.4125,234.4125,0,0


In [7]:
train['units_sold'].describe()

count    150150.000000
mean         51.674206
std          60.207904
min           1.000000
25%          20.000000
50%          35.000000
75%          62.000000
max        2876.000000
Name: units_sold, dtype: float64

#### Concating

In [8]:
train['train_or_test']='train'
test['train_or_test']='test'
df=pd.concat([train,test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


#### Date Based Feature. Week_Number helps to create time based validation on Weeks on laterpart.

In [9]:
epoch= pd.Timestamp("2011-12-23")
def create_date_featues(df):

    df['Year'] = pd.to_datetime(df['week']).dt.year

    df['Month'] = pd.to_datetime(df['week']).dt.month

    df['Day'] = pd.to_datetime(df['week']).dt.dayofweek

    df['woy'] = pd.to_datetime(df['week']).dt.week
    
    df['wom'] = df['week'].apply(lambda d: (d.day-1) // 7 + 1)
    
    df["Week_Number"]=np.where(df.week.astype("datetime64").le(epoch), \
                               df.week.dt.week, \
                               df.week.sub(epoch).dt.days//7+52)

    #df['Quarter'] = pd.to_datetime(df['week']).dt.quarter 

    #df['Is_month_start'] = pd.to_datetime(df['week']).dt.is_month_start

    #df['Is_month_end'] = pd.to_datetime(df['DateTime']).dt.is_month_end

    #df['Is_quarter_start'] = pd.to_datetime(df['DateTime']).dt.is_quarter_start

    #df['Is_quarter_end'] = pd.to_datetime(df['DateTime']).dt.is_quarter_end

    #df['Is_year_start'] = pd.to_datetime(df['week']).dt.is_year_start

    #df['Is_year_end'] = pd.to_datetime(df['Week']).dt.is_year_end

    #df['Semester'] = np.where(df['Quarter'].isin([1,2]),1,2)

    return df

In [10]:
df=create_date_featues(df)

In [11]:
df.head()

Unnamed: 0,base_price,is_display_sku,is_featured_sku,record_ID,sku_id,store_id,total_price,train_or_test,units_sold,week,Year,Month,Day,woy,wom,Week_Number
0,111.8625,0,0,1,216418,8091,99.0375,train,20.0,2011-01-17,2011,1,0,3,3,3
1,99.0375,0,0,2,216419,8091,99.0375,train,28.0,2011-01-17,2011,1,0,3,3,3
2,133.95,0,0,3,216425,8091,133.95,train,19.0,2011-01-17,2011,1,0,3,3,3
3,133.95,0,0,4,216233,8091,133.95,train,44.0,2011-01-17,2011,1,0,3,3,3
4,141.075,0,0,5,217390,8091,141.075,train,52.0,2011-01-17,2011,1,0,3,3,3


In [12]:
df1=df.copy()

In [13]:
df['Unique_sku_per_store']=df.groupby(['store_id'])['sku_id'].transform('nunique')

#### Creates numerical based aggregate features.

In [53]:
def agg_numeric(df, parent_var, df_name):
    """
    Groups and aggregates the numeric values in a child dataframe
    by the parent variable.
    
    Parameters
    --------
        df (dataframe): 
            the child dataframe to calculate the statistics on
        parent_var (string): 
            the parent variable used for grouping and aggregating
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated by the `parent_var` for 
            all numeric columns. Each observation of the parent variable will have 
            one row in the dataframe with the parent variable as the index. 
            The columns are also renamed using the `df_name`. Columns with all duplicate
            values are removed. 
    
    """
    
            
    # Only want the numeric variables
    parent_ids = df[parent_var].copy()
    numeric_df = df.select_dtypes('number').drop(columns={'units_sold','store_id','sku_id','record_ID','Year','Month','wom',#'woy','Week_Number'
                                                         }).copy()
    numeric_df[parent_var] = parent_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Need to create new column names
    columns = []

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        if var != parent_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))
    
    agg.columns = columns
    
    # Remove the columns with all redundant values
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

In [15]:
df['Discount_Price']=df['base_price']-df['total_price']
df['Percentage_Discount']=((df['base_price']-df['total_price'])/df['base_price'])*100

In [16]:
store_agg = agg_numeric(df, 'store_id', 'store_')
print('Previous aggregation shape: ', store_agg.shape)
df=df.merge(store_agg, on ='store_id', how = 'left')

Previous aggregation shape:  (76, 35)


In [17]:
sku_agg = agg_numeric(df1, 'sku_id', 'sku_')
print('Previous aggregation shape: ', sku_agg.shape)
df=df.merge(sku_agg, on ='sku_id', how = 'left')

Previous aggregation shape:  (28, 26)


In [18]:
week_agg = agg_numeric(df1, ['store_id','sku_id'], 'week_')
print('Previous aggregation shape: ', week_agg.shape)
df=df.merge(week_agg, on =['store_id','sku_id'], how = 'left')

Previous aggregation shape:  (1155, 27)


#### Intercation Feat

In [20]:
df['store_id']=df['store_id'].astype(str)
df['sku_id']=df['sku_id'].astype(str)
df['is_display_sku']=df['is_display_sku'].astype(str)

In [21]:
df['store_sku']=df['store_id']+'_'+df['sku_id']
df['sku_disp']=df['sku_id']+'_'+df['is_display_sku']

In [22]:
df['store_id']=df['store_id'].astype(int)
df['sku_id']=df['sku_id'].astype(int)
df['is_display_sku']=df['is_display_sku'].astype(int)

Label Encoding

In [23]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in ['store_sku','sku_disp']:
    df[col]=  df[col].astype('str')
    df[col]= le.fit_transform(df[col]) 

In [24]:
fe_pol = (df.groupby('store_id').size()) / len(df)
df['store_id_fe'] = df['store_id'].apply(lambda x : fe_pol[x])

In [25]:
fe_pol = (df.groupby('sku_id').size()) / len(df)
df['sku_id_fe'] = df['sku_id'].apply(lambda x : fe_pol[x])

In [26]:
train=df.loc[df.train_or_test.isin(['train'])]
test=df.loc[df.train_or_test.isin(['test'])]
train.drop(columns={'train_or_test'},axis=1,inplace=True)
test.drop(columns={'train_or_test'},axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [27]:
train.tail()

Unnamed: 0,base_price,is_display_sku,is_featured_sku,record_ID,sku_id,store_id,total_price,units_sold,week,Year,Month,Day,woy,wom,Week_Number,Unique_sku_per_store,Discount_Price,Percentage_Discount,store__Discount_Price_min,store__Percentage_Discount_min,store__is_display_sku_min,store__is_featured_sku_mean,store__is_display_sku_mean,store__Day_mean,store__is_display_sku_max,store__Week_Number_min,store__Percentage_Discount_mean,store__Discount_Price_mean,store__Unique_sku_per_store_mean,store__woy_mean,store__woy_max,store__total_price_min,store__base_price_min,store__Week_Number_mean,store__Percentage_Discount_max,store__Week_Number_max,store__total_price_mean,store__base_price_mean,store__Discount_Price_max,store__is_featured_sku_sum,store__base_price_max,store__total_price_max,store__is_display_sku_sum,store__Day_sum,store__total_price_count,store__base_price_count,store__Percentage_Discount_sum,store__Discount_Price_sum,store__Unique_sku_per_store_sum,store__woy_sum,...,sku__woy_mean,sku__woy_max,sku__Week_Number_mean,sku__total_price_min,sku__base_price_min,sku__total_price_mean,sku__base_price_mean,sku__base_price_max,sku__total_price_max,sku__Week_Number_max,sku__is_display_sku_sum,sku__is_featured_sku_sum,sku__Day_sum,sku__total_price_count,sku__base_price_count,sku__woy_sum,sku__Week_Number_sum,sku__total_price_sum,sku__base_price_sum,week__is_display_sku_min,week__is_featured_sku_mean,week__is_display_sku_mean,week__Day_mean,week__is_featured_sku_max,week__is_display_sku_max,week__Day_max,week__Week_Number_min,week__is_featured_sku_sum,week__is_display_sku_sum,week__woy_mean,week__woy_max,week__Week_Number_mean,week__Day_sum,week__total_price_min,week__base_price_min,week__total_price_mean,week__base_price_mean,week__total_price_max,week__base_price_max,week__total_price_count,week__base_price_count,week__Week_Number_max,week__woy_sum,week__Week_Number_sum,week__total_price_sum,week__base_price_sum,store_sku,sku_disp,store_id_fe,sku_id_fe
150145,235.8375,0,0,212638,223245,9984,235.8375,38.0,2013-07-09,2013,7,1,28,2,132,17,0.0,0.0,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,60.5625,141.075,205.052152,215.930321,275.025,275.025,144,1358,1134,6059,10366,10366,260829,761901,2125571.0,2238334.0,0,0.112676,0.176056,0.584507,1,1,1,3,16,25,25.161972,52,73.5,83,106.875,191.6625,210.252729,222.841901,241.5375,241.5375,142,142,144,3573,10437,29855.8875,31643.55,1148,26,0.014719,0.063203
150146,235.8375,0,0,212639,223153,9984,235.8375,30.0,2013-07-09,2013,7,1,28,2,132,17,0.0,0.0,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,103.3125,124.6875,206.603718,220.995769,268.6125,268.6125,144,1300,1427,4814,8236,8236,207234,605346,1701588.0,1820121.0,0,0.176056,0.147887,0.584507,1,1,1,3,25,21,25.161972,52,73.5,83,127.5375,213.0375,212.651144,227.30757,240.825,240.825,142,142,144,3573,10437,30196.4625,32277.675,1147,24,0.014719,0.050216
150147,483.7875,1,1,212642,245338,9984,357.675,31.0,2013-07-09,2013,7,1,28,2,132,17,126.1125,26.067747,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,297.1125,355.5375,429.341899,473.4724,533.6625,533.6625,144,2249,2609,5312,9087,9088,228672,667968,3901430.0,4302917.0,0,0.295775,0.232394,0.584507,1,1,1,3,42,33,25.161972,52,73.5,83,318.4875,401.85,422.341901,463.416021,483.7875,483.7875,142,142,144,3573,10437,59972.55,65805.075,1149,29,0.014719,0.055411
150148,191.6625,1,0,212643,547934,9984,141.7875,12.0,2013-07-09,2013,7,1,28,2,132,17,49.875,26.022305,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,71.25,127.5375,167.703109,175.244571,191.6625,191.6625,144,336,0,2573,4402,4402,110763,323547,738229.1,771426.6,0,0.0,0.042254,0.584507,0,1,1,3,0,6,25.161972,52,73.5,83,127.5375,127.5375,165.380282,174.251408,191.6625,191.6625,142,142,144,3573,10437,23484.0,24743.7,1153,49,0.014719,0.02684
150149,234.4125,0,0,212644,679023,9984,234.4125,15.0,2013-07-09,2013,7,1,28,2,132,17,0.0,0.0,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,146.775,177.4125,199.379577,211.24956,234.4125,234.4125,144,338,68,747,1278,1278,32157,93933,254807.1,269976.9,0,0.056338,0.28169,0.584507,1,1,1,3,8,40,25.161972,52,73.5,83,166.0125,177.4125,198.376056,210.373151,234.4125,234.4125,142,142,144,3573,10437,28169.4,29872.9875,1154,54,0.014719,0.007792


In [28]:
test.tail()

Unnamed: 0,base_price,is_display_sku,is_featured_sku,record_ID,sku_id,store_id,total_price,units_sold,week,Year,Month,Day,woy,wom,Week_Number,Unique_sku_per_store,Discount_Price,Percentage_Discount,store__Discount_Price_min,store__Percentage_Discount_min,store__is_display_sku_min,store__is_featured_sku_mean,store__is_display_sku_mean,store__Day_mean,store__is_display_sku_max,store__Week_Number_min,store__Percentage_Discount_mean,store__Discount_Price_mean,store__Unique_sku_per_store_mean,store__woy_mean,store__woy_max,store__total_price_min,store__base_price_min,store__Week_Number_mean,store__Percentage_Discount_max,store__Week_Number_max,store__total_price_mean,store__base_price_mean,store__Discount_Price_max,store__is_featured_sku_sum,store__base_price_max,store__total_price_max,store__is_display_sku_sum,store__Day_sum,store__total_price_count,store__base_price_count,store__Percentage_Discount_sum,store__Discount_Price_sum,store__Unique_sku_per_store_sum,store__woy_sum,...,sku__woy_mean,sku__woy_max,sku__Week_Number_mean,sku__total_price_min,sku__base_price_min,sku__total_price_mean,sku__base_price_mean,sku__base_price_max,sku__total_price_max,sku__Week_Number_max,sku__is_display_sku_sum,sku__is_featured_sku_sum,sku__Day_sum,sku__total_price_count,sku__base_price_count,sku__woy_sum,sku__Week_Number_sum,sku__total_price_sum,sku__base_price_sum,week__is_display_sku_min,week__is_featured_sku_mean,week__is_display_sku_mean,week__Day_mean,week__is_featured_sku_max,week__is_display_sku_max,week__Day_max,week__Week_Number_min,week__is_featured_sku_sum,week__is_display_sku_sum,week__woy_mean,week__woy_max,week__Week_Number_mean,week__Day_sum,week__total_price_min,week__base_price_min,week__total_price_mean,week__base_price_mean,week__total_price_max,week__base_price_max,week__total_price_count,week__base_price_count,week__Week_Number_max,week__woy_sum,week__Week_Number_sum,week__total_price_sum,week__base_price_sum,store_sku,sku_disp,store_id_fe,sku_id_fe
164005,241.5375,0,0,232281,223245,9984,241.5375,,2013-10-01,2013,10,1,40,1,144,17,0.0,0.0,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,60.5625,141.075,205.052152,215.930321,275.025,275.025,144,1358,1134,6059,10366,10366,260829,761901,2125571.0,2238334.0,0,0.112676,0.176056,0.584507,1,1,1,3,16,25,25.161972,52,73.5,83,106.875,191.6625,210.252729,222.841901,241.5375,241.5375,142,142,144,3573,10437,29855.8875,31643.55,1148,26,0.014719,0.063203
164006,240.825,0,0,232282,223153,9984,240.825,,2013-10-01,2013,10,1,40,1,144,17,0.0,0.0,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,103.3125,124.6875,206.603718,220.995769,268.6125,268.6125,144,1300,1427,4814,8236,8236,207234,605346,1701588.0,1820121.0,0,0.176056,0.147887,0.584507,1,1,1,3,25,21,25.161972,52,73.5,83,127.5375,213.0375,212.651144,227.30757,240.825,240.825,142,142,144,3573,10437,30196.4625,32277.675,1147,24,0.014719,0.050216
164007,401.85,1,1,232285,245338,9984,382.6125,,2013-10-01,2013,10,1,40,1,144,17,19.2375,4.787234,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,297.1125,355.5375,429.341899,473.4724,533.6625,533.6625,144,2249,2609,5312,9087,9088,228672,667968,3901430.0,4302917.0,0,0.295775,0.232394,0.584507,1,1,1,3,42,33,25.161972,52,73.5,83,318.4875,401.85,422.341901,463.416021,483.7875,483.7875,142,142,144,3573,10437,59972.55,65805.075,1149,29,0.014719,0.055411
164008,191.6625,0,0,232286,547934,9984,191.6625,,2013-10-01,2013,10,1,40,1,144,17,0.0,0.0,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,71.25,127.5375,167.703109,175.244571,191.6625,191.6625,144,336,0,2573,4402,4402,110763,323547,738229.1,771426.6,0,0.0,0.042254,0.584507,0,1,1,3,0,6,25.161972,52,73.5,83,127.5375,127.5375,165.380282,174.251408,191.6625,191.6625,142,142,144,3573,10437,23484.0,24743.7,1153,48,0.014719,0.02684
164009,234.4125,0,0,232287,679023,9984,234.4125,,2013-10-01,2013,10,1,40,1,144,17,0.0,0.0,-21.375,-10.740741,0,0.081607,0.091964,0.584507,1,3,4.57264,10.24418,17,25.161972,52,66.975,68.4,73.5,60.501567,144,187.471619,197.715798,151.05,197,483.7875,483.7875,222,1411,2414,2414,11038.352161,24729.45,41038,60741,...,25.161972,52,73.5,146.775,177.4125,199.379577,211.24956,234.4125,234.4125,144,338,68,747,1278,1278,32157,93933,254807.1,269976.9,0,0.056338,0.28169,0.584507,1,1,1,3,8,40,25.161972,52,73.5,83,166.0125,177.4125,198.376056,210.373151,234.4125,234.4125,142,142,144,3573,10437,28169.4,29872.9875,1154,54,0.014719,0.007792


#### Time Based Validation


#### Valset1-4 weeks

In [30]:
train1=train[train['Week_Number']<129]
val1 = train[train['Week_Number']>128]

#### valset2-8weeks

In [32]:
train2=train[train['Week_Number']<125]
val2 = train[train['Week_Number']>124]

In [33]:
x_train1=train1.drop(columns={'record_ID','week','units_sold','Week_Number'},axis=1)
y_train1=train1.loc[:,['units_sold']]
#y_train1=y_train1.clip(0,1000)
x_val1=val1.drop(columns={'record_ID','week','units_sold','Week_Number'},axis=1)
y_val1=val1.loc[:,['units_sold']]

In [34]:
x_train2=train2.drop(columns={'record_ID','week','units_sold','Week_Number'},axis=2)
y_train2=train2.loc[:,['units_sold']]
#y_train2=y_train2.clip(0,1000)
x_val2=val2.drop(columns={'record_ID','week','units_sold','Week_Number'},axis=2)
y_val2=val2.loc[:,['units_sold']]

In [35]:
y_train1=np.log(y_train1)
y_val1=np.log(y_val1)

y_train2=np.log(y_train2)
y_val2=np.log(y_val2)

In [36]:
from lightgbm import LGBMRegressor

In [37]:
m = LGBMRegressor(n_estimators=3500, 
                  num_leaves=127, 
                  max_depth=8,
                  min_child_samples=4,
                  learning_rate=0.02,
                  colsample_bytree=0.4,
                  reg_alpha=0.5,
                  reg_lambda=2)
m.fit(x_train1, y_train1,eval_set=[(x_train1,y_train1),(x_val1, y_val1)],early_stopping_rounds=100, verbose=100,eval_metric='rmse')

Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.47494	training's l2: 0.225568	valid_1's rmse: 0.479689	valid_1's l2: 0.230101
[200]	training's rmse: 0.420943	training's l2: 0.177193	valid_1's rmse: 0.43218	valid_1's l2: 0.18678
[300]	training's rmse: 0.395993	training's l2: 0.156811	valid_1's rmse: 0.41929	valid_1's l2: 0.175804
[400]	training's rmse: 0.382832	training's l2: 0.14656	valid_1's rmse: 0.414114	valid_1's l2: 0.17149
[500]	training's rmse: 0.372577	training's l2: 0.138813	valid_1's rmse: 0.410742	valid_1's l2: 0.168709
[600]	training's rmse: 0.365515	training's l2: 0.133601	valid_1's rmse: 0.409186	valid_1's l2: 0.167433
[700]	training's rmse: 0.359051	training's l2: 0.128918	valid_1's rmse: 0.408462	valid_1's l2: 0.166841
[800]	training's rmse: 0.353555	training's l2: 0.125001	valid_1's rmse: 0.407826	valid_1's l2: 0.166322
[900]	training's rmse: 0.34883	training's l2: 0.121682	valid_1's rmse: 0.406688	valid_1's l2: 0.165395
[1000]	tr

LGBMRegressor(colsample_bytree=0.4, learning_rate=0.02, max_depth=8,
              min_child_samples=4, n_estimators=3500, num_leaves=127,
              reg_alpha=0.5, reg_lambda=2)

In [38]:
pred = m.predict(x_val1)

In [39]:
from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error( np.exp(y_val1), np.exp(pred) ))

0.3786919536158003

#### calculates best weight

In [40]:
pred1  = m.predict(x_val1)
print("weight correction")
W=[(0.990+(i/1000)) for i in range(20)]
S =[]
for w in W:
    error = np.sqrt(mean_squared_log_error(np.exp(y_val1), np.exp(pred1*w)))
    print('RMSLE for {:.3f}:{:.6f}'.format(w,error))
    S.append(error)
Score = pd.Series(S,index=W)
Score.plot()
BS = Score[Score.values == Score.values.min()]
print ('Best weight for Score:{}'.format(BS))

weight correction
RMSLE for 0.990:0.387707
RMSLE for 0.991:0.386672
RMSLE for 0.992:0.385667
RMSLE for 0.993:0.384691
RMSLE for 0.994:0.383744
RMSLE for 0.995:0.382827
RMSLE for 0.996:0.381939
RMSLE for 0.997:0.381082
RMSLE for 0.998:0.380255
RMSLE for 0.999:0.379458
RMSLE for 1.000:0.378692
RMSLE for 1.001:0.377957
RMSLE for 1.002:0.377252
RMSLE for 1.003:0.376579
RMSLE for 1.004:0.375938
RMSLE for 1.005:0.375328
RMSLE for 1.006:0.374749
RMSLE for 1.007:0.374203
RMSLE for 1.008:0.373688
RMSLE for 1.009:0.373206
Best weight for Score:1.009    0.373206
dtype: float64


In [41]:
m = LGBMRegressor(n_estimators=3000, 
                  num_leaves=127, 
                  max_depth=8,
                  min_child_samples=4,
                  learning_rate=0.02,
                  colsample_bytree=0.4,
                  reg_alpha=0.5,
                  reg_lambda=2)
m.fit(x_train2, y_train2,eval_set=[(x_train2,y_train2),(x_val2, y_val2)],early_stopping_rounds=200, verbose=200,eval_metric='rmse')

Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.420807	training's l2: 0.177079	valid_1's rmse: 0.442372	valid_1's l2: 0.195693
[400]	training's rmse: 0.382195	training's l2: 0.146073	valid_1's rmse: 0.422235	valid_1's l2: 0.178282
[600]	training's rmse: 0.365113	training's l2: 0.133307	valid_1's rmse: 0.415723	valid_1's l2: 0.172826
[800]	training's rmse: 0.352482	training's l2: 0.124244	valid_1's rmse: 0.41208	valid_1's l2: 0.16981
[1000]	training's rmse: 0.343654	training's l2: 0.118098	valid_1's rmse: 0.411478	valid_1's l2: 0.169314
[1200]	training's rmse: 0.335639	training's l2: 0.112654	valid_1's rmse: 0.411214	valid_1's l2: 0.169097
[1400]	training's rmse: 0.329269	training's l2: 0.108418	valid_1's rmse: 0.410832	valid_1's l2: 0.168783
[1600]	training's rmse: 0.323377	training's l2: 0.104573	valid_1's rmse: 0.410521	valid_1's l2: 0.168527
Early stopping, best iteration is:
[1550]	training's rmse: 0.324693	training's l2: 0.105426	valid_1's rm

LGBMRegressor(colsample_bytree=0.4, learning_rate=0.02, max_depth=8,
              min_child_samples=4, n_estimators=3000, num_leaves=127,
              reg_alpha=0.5, reg_lambda=2)

In [42]:
pred = m.predict(x_val2)

In [43]:
from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error( np.exp(y_val2), np.exp(pred) ))

0.379947109849214

#### Final Model

In [46]:
x=train.drop(columns={'record_ID','week','units_sold','Week_Number'},axis=1)
y=train.loc[:,['units_sold']]
test=test.drop(columns={'record_ID','week','units_sold','Week_Number'},axis=1)

#### Clipping the target data, got ideal clipping value from validation since clipping for 1000 gave best score

In [47]:
y=y.clip(1000)

In [48]:
y=np.log(y)

In [50]:
# m = LGBMRegressor(n_estimators=3000, 
#                   num_leaves=127, 
#                   max_depth=8,
#                   min_child_samples=4,
#                   learning_rate=0.02,
#                   colsample_bytree=0.4,
#                   reg_alpha=0.5,
#                   reg_lambda=2)
# m.fit(x,y)

In [51]:
# pred=m.predict(test)*1.009
# sub['units_sold']=np.exp(pred)
# sub.to_csv('lgbm.csv',index=False)

In [52]:
# sub.head()