In [None]:
import pandas as pd 
import numpy as np
import lightgbm as lgb
import sklearn
import sys, os
from sklearn.metrics import r2_score
from catboost import Pool, CatBoostRegressor, cv
from sklearn.model_selection import KFold 
from tqdm import tqdm_notebook
from  sklearn.preprocessing import LabelEncoder
from calendar import monthrange
from itertools import product, chain 
import gc

import re
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours

import matplotlib.pyplot as plt
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',100)

In [None]:
sales = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
item_cats = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')

We start with a little bit of EDA.

In [None]:
# Quick visual check for outliers: raw daily counts (left panel) and
# raw prices (right panel), plotted against the row index of `sales`.
# Labels are attached via `label=` because passing a bare string to
# `plt.legend` makes every character its own legend entry.
plt.subplot(1, 2, 2)
plt.plot(sales['item_price'], label='price')
plt.legend()

plt.subplot(1, 2, 1)
plt.plot(sales['item_cnt_day'], label='count')
plt.legend();

I decided to remove those outliers.

In [None]:
# Keep only rows below the outlier thresholds chosen from the plots above
# (price < 250000 and daily count < 1000).
sales = sales[sales['item_price'] < 250000]
sales = sales[sales['item_cnt_day'] < 1000]

Since I don't speak Russian, I relied on several Kaggle kernels to extract valuable information from the sales, shop and item data by looking at the names. This one especially: https://www.kaggle.com/kyakovlev/1st-place-solution-part-1-hands-on-data

# **Feature engineering**

SALES FEATURES 

First, some shops appear twice under different ids. Let's merge the duplicates.

In [None]:
# Duplicate shops are remapped to a single id in both the train and test sets.
# Yakutsk, Ordzhonikidze 56                            ## thanks to Denis Larionov ##
sales.loc[sales.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Yakutsk, "Tsentralny" shopping center
sales.loc[sales.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Zhukovsky, Chkalova St. 39m²
sales.loc[sales.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

# Shop 39 is folded into shop 40 as well (presumably another duplicate listing — TODO confirm)
sales.loc[sales.shop_id == 39, 'shop_id'] = 40
test.loc[test.shop_id == 39, 'shop_id'] = 40

We can extract time data.

In [None]:
# Parse the day.month.year date strings, keep only calendar month and year,
# then drop the raw date column.
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')
sales['month'] = sales['date'].dt.month
sales['year'] = sales['date'].dt.year

sales = sales.drop('date', axis = 1)

I merge the test data with the sales data.

In [None]:
# Build placeholder rows for every (shop, item) pair of the test set.
to_append = test[['shop_id', 'item_id']].copy()

# The test rows get the month block right after the last one seen in training;
# year/month are hard-coded to November 2015.
to_append['date_block_num'] = sales['date_block_num'].max() + 1
to_append['year'] = 2015
to_append['month'] = 11
# Unknown target and price are filled with 0 placeholders.
to_append['item_cnt_day'] = 0
to_append['item_price'] = 0

sales = pd.concat([sales, to_append], ignore_index=True, sort=False)

Let's add a holiday feature.

In [None]:
# Calendar month (1-12) -> value stored in `holidays_in_month`.
# NOTE(review): presumably the number of public holidays / days off per month
# in Russia; the source of these counts is not given here — TODO confirm.
holiday_dict = {
    1: 7,
    2: 4,
    3: 3,
    4: 9,
    5: 4,
    6: 4,
    7: 3,
    8: 9,
    9: 5,
    10: 9,
    11: 6,
    12: 5,
}

sales['holidays_in_month'] = sales['month'].map(holiday_dict)

I make a small dataframe containing only time data.

In [None]:
# One row per distinct (date_block_num, year, month, holidays_in_month) combination.
period = sales[['date_block_num','year','month','holidays_in_month']].drop_duplicates().reset_index(drop=True)
# Number of days in each month, taken from the calendar (monthrange returns (weekday, n_days)).
period['days'] = period.apply(lambda r: monthrange(r.year, r.month)[1], axis=1)

SHOP FEATURES

From the shop names we can extract the city and the type of each shop, and then label-encode them.

In [None]:
# The city is taken to be the first space-separated token of the shop name.
cities = shops['shop_name'].str.split(' ').map(lambda row : row[0])
cities.unique()

In [None]:
# City: normalise the one name carrying a stray leading '!' and label-encode it.
shops['shop_city'] = cities.replace('!Якутск', 'Якутск')
ls = LabelEncoder()
shops['shop_city'] = ls.fit_transform(shops['shop_city'])

# Shop type: first marker found in the shop name, checked in this order
# (longer markers first so 'МТРЦ' is not mistaken for 'ТРЦ').
shop_type_markers = ('МТРЦ', 'ТРЦ', 'ТРК', 'ТЦ', 'ТК')
shops['shop_type'] = shops['shop_name'].apply(
    lambda shop_name: next(
        (marker for marker in shop_type_markers if marker in shop_name),
        'NO_DATA',
    )
)
ls1 = LabelEncoder()
shops['shop_type'] = ls1.fit_transform(shops['shop_type'])

ITEM_CAT FEATURES

Again, let's extract a main and a sub category from the name of the category.

In [None]:
# Main category = text before the first '-' in the category name, stripped of whitespace.
main_cat = item_cats['item_category_name'].str.split('-').map(lambda row : row[0].strip())
main_cat.unique()

In [None]:
# Label-encode the main category.
ls = LabelEncoder()
item_cats['main_cat'] = ls.fit_transform(main_cat)

# Sub category = second '-'-separated part of the name; categories without a
# '-' fall back to the first (only) part.
sub_cat = item_cats['item_category_name'].str.split('-').map(lambda row : row[1].strip()
                                                             if len(row) > 1 
                                                             else row[0].strip()) 
# The same encoder object is re-fitted on the sub categories.
item_cats['sub_cat'] = ls.fit_transform(sub_cat)

ITEMS FEATURES 

Different names features can be extracted from the items data.

In [None]:
# name_2: text following the first '[' in the item name (NaN when absent).
# name_3: text following the first '(' in the item name (NaN when absent).
# `.str.get(1)` replaces tuple-unpacking of the `.str` accessor, and `n=1`
# is passed by keyword; both older forms were removed in pandas 2.0.
items['name_2'] = items['item_name'].str.split('[', n=1).str.get(1)
items['name_3'] = items['item_name'].str.split('(', n=1).str.get(1)

# Collapse every run of characters that is not a Latin/Cyrillic letter or a
# digit into one space, then lowercase. `regex=True` is required explicitly:
# since pandas 2.0 `str.replace` treats the pattern as a literal by default.
non_alnum = '[^A-Za-z0-9А-Яа-я]+'
items['name_2'] = items['name_2'].str.replace(non_alnum, ' ', regex=True).str.lower()
items['name_3'] = items['name_3'].str.replace(non_alnum, ' ', regex=True).str.lower()

# Items without a bracketed / parenthesised part get the placeholder '0'.
items = items.fillna('0')

# Label-encode both derived name features (the encoder is re-fitted each time).
ls = LabelEncoder()
items['name_2'] = ls.fit_transform(items['name_2'])
items['name_3'] = ls.fit_transform(items['name_3'])

We can remove the duplicate names by doing the following :

In [None]:
def name_correction(x):
    '''
    Normalise an item name so that near-duplicate names compare equal.

    The name is lowercased, cut at the first '[' and at the first '(',
    every run of characters other than Latin/Cyrillic letters and digits
    is replaced by a single space, and surrounding whitespace is removed.
    '''
    head = x.lower().split('[', 1)[0].split('(', 1)[0]
    cleaned = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', head)
    return cleaned.replace('  ', ' ').strip()

# Replace every item name by its normalised form and report how many
# distinct names remain.
items['item_name'] = items['item_name'].apply(name_correction)
unique_name_count = len(items['item_name'].unique())
print('Unique item names after correction:', unique_name_count)

For simplicity, we first merge the item and item-category tables.

In [None]:
# Attach the encoded category features to every item, then fix the column order.
items = items.merge(item_cats.drop('item_category_name', axis=1),
                    on = ['item_category_id'], how = 'left')
items = items[['item_name','item_id','item_category_id','main_cat','sub_cat','name_2','name_3']]

### MERGING ###

In [None]:
## helper functions ##

def downcast_dtypes(df):
    '''
        Halve the width of 64-bit numeric columns to save memory:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        The frame is modified in place and also returned.
    '''
    # Floats are converted first, then integers.
    for wide_dtype, narrow_dtype in (("float64", np.float32), ("int64", np.int32)):
        wide_cols = [name for name in df if df[name].dtype == wide_dtype]
        df[wide_cols] = df[wide_cols].astype(narrow_dtype)

    return df

def position_colum(df,column_name):
    '''
        Return the integer position of `column_name` among the columns of `df`.

        Raises AssertionError when the column is not present. If the name
        occurs several times, the position of the last occurrence is returned.
    '''
    assert column_name in df, "Column {} not in the dataset".format(column_name)
    matching_positions = [idx for idx, name in enumerate(df.columns)
                          if column_name == name]
    return matching_positions[-1]

def mean_encoding_reg(df, col_to_enc,target_col, nb_split):
    '''
        Return a copy of `df` with an out-of-fold mean encoding of `col_to_enc`.

        df         : input dataframe (left unmodified)
        col_to_enc : name of the categorical column to encode
        target_col : name of the target column whose mean is used
        nb_split   : number of (unshuffled) KFold splits

        For every fold, the target mean per category is estimated on the other
        folds and mapped onto the rows of the held-out fold. Categories unseen
        in the training folds get 0. The new column `<col_to_enc>_mean_enc` is
        inserted at the position of `col_to_enc`, and the result is downcast
        to 32-bit dtypes.
    '''
    df_small = df[[col_to_enc, target_col]]

    # Encoded values are stored by row *position*, so the result is correct
    # even when `df` does not carry a 0..n-1 index (e.g. after filtering rows).
    encoded = np.full(len(df_small), np.nan, dtype=np.float64)

    kf = KFold(n_splits=nb_split, shuffle=False)

    for ind_tr, ind_val in kf.split(df_small):
        mat_tr, mat_val = df_small.iloc[ind_tr], df_small.iloc[ind_val]

        # Category means estimated on the training folds only.
        target_tr_mean_estimate = mat_tr.groupby(col_to_enc)[target_col].mean()
        encoded[ind_val] = mat_val[col_to_enc].map(target_tr_mean_estimate).to_numpy(dtype=np.float64)

    # Categories never seen in the corresponding training folds -> 0.
    encoded[np.isnan(encoded)] = 0

    pos_categorical_feat = position_colum(df,col_to_enc)
    new_df = df.copy()
    # Insert a plain array: no index alignment takes place.
    new_df.insert(pos_categorical_feat, col_to_enc + '_mean_enc', encoded)

    return downcast_dtypes(new_df)

Now we can expand the sales dataset. We first create a grid of all shop/item combinations for every month.
For item_cnt_day, we sum the values to generate item_cnt_month, the target.
For item_price, we average its values to compute item_price_month.

Also, we clip the target value between 0 & 20 as recommended on Kaggle.

In [None]:
## GRID DATA ###

from itertools import product, chain 
import gc

index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = []
for block_num in sales['date_block_num'].unique():
    in_block = sales['date_block_num'] == block_num
    cur_shops = sales.loc[in_block, 'shop_id'].unique()
    cur_items = sales.loc[in_block, 'item_id'].unique()
    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols, dtype=np.int32)

# Groupby data to get shop-item-month aggregates == Target.
# Named aggregation is used because renaming through a nested dict raises
# SpecificationError in pandas >= 1.0; it yields flat column names directly.
gb = sales.groupby(index_cols, as_index=False).agg(
    item_cnt_month=('item_cnt_day', 'sum'),
    item_price_month=('item_price', 'mean'),
)

# Join it to the grid; shop/item pairs without any sale in a month get 0
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)


all_data['item_cnt_month'] = all_data['item_cnt_month'].clip(0,20)  ## CLIP TARGET VALUE ##

We can finally merge every datasets together and save it all.

In [None]:
# Left-join the time, shop and item features onto the monthly grid
# (the raw name columns are dropped before joining).
all_data = all_data.merge(period, on = ['date_block_num'], how = 'left')
all_data = all_data.merge(shops.drop('shop_name', axis=1), on = ['shop_id'], how = 'left')
all_data = all_data.merge(items.drop('item_name', axis=1), on = ['item_id'], how ='left')

In [None]:
# Shrink 64-bit columns to 32-bit before caching the merged dataset on disk.
all_data = downcast_dtypes(all_data)

all_data.to_pickle('all_data.pkl')

# Free the intermediate objects that are no longer needed.
del grid, gb , cur_items
gc.collect();

In [None]:
#all_data = pd.read_pickle('all_data.pkl')

At this step, the all_data dataset contains all the basic information (train and test data) summarized per month.

Since item_price_month and item_cnt_month are built from data only available in the training set, we cannot use them directly for the prediction, but we can still rely on features generated from past data. This means we can, for example, use the price of an item_id over the last 2 or 3 months, or of any combination of features (e.g. item_id and shop_city).

So in order to do this, let's define some functions to generate the lag of a column created by any kind of combination for the  month we want.

In [None]:
def aggregate(data, col_groupby = ['shop_id'],target_col = 'item_cnt_day',
              new_col_name='', agg_function = 'mean'):
    '''
    Add a group-level aggregate of `target_col` to every row of `data`.

    data         : input dataframe (not modified)
    col_groupby  : column(s) defining the groups
    target_col   : column to aggregate
    new_col_name : name of the aggregate column; when empty, the aggregate
                   keeps the name of `target_col`
    agg_function : aggregation passed to `GroupBy.agg` (e.g. 'mean', 'sum')

    Returns `data` left-joined with the aggregate on `col_groupby`; all
    remaining missing values in the result are filled with 0.
    '''
    # Aggregate a single column and rename it afterwards: renaming through a
    # nested dict in `.agg` raises SpecificationError in pandas >= 1.0.
    out_name = new_col_name or target_col
    gb = (data.groupby(col_groupby, as_index=False)[target_col]
              .agg(agg_function)
              .rename(columns={target_col: out_name}))
    return pd.merge(data, gb, how='left', on=col_groupby).fillna(0)

def create_lags(data, index_cols, column_to_lag, shift_range = [1]):
    '''
    Append lagged copies of `column_to_lag` to `data`.

    For every shift `s` in `shift_range`, the distinct
    (index_cols, column_to_lag) rows are moved `s` month blocks forward and
    left-joined back on `index_cols` as `<column_to_lag>_lag_<s>`.
    `index_cols` must contain 'date_block_num'. Missing values in the result
    are filled with 0 after every join.
    '''
    lag_source_cols = index_cols + [column_to_lag]

    for month_shift in tqdm_notebook(shift_range) :
        # Re-selected on every pass because `data` is rebound below.
        lagged = data[lag_source_cols].drop_duplicates().reset_index(drop=True)
        lagged['date_block_num'] += month_shift

        lag_name = '{}_lag_{}'.format(column_to_lag, month_shift)
        lagged = lagged.rename(columns={column_to_lag: lag_name})

        data = data.merge(lagged, on=index_cols, how='left').fillna(0)

    return data

def agg_and_lag(data, col_groupby, target_col, new_col_name, agg_function, shift_range):
    '''
    Compute a group aggregate and keep only its lagged versions.

    Chains `aggregate` and `create_lags`: the aggregate `new_col_name` of
    `target_col` over `col_groupby` is computed, lagged by every shift in
    `shift_range`, and the un-lagged aggregate column is then dropped.
    '''
    aggregated = aggregate(data=data, col_groupby=col_groupby, target_col=target_col,
                           new_col_name=new_col_name, agg_function=agg_function)
    lagged = create_lags(data=aggregated, index_cols=col_groupby,
                         column_to_lag=new_col_name, shift_range=shift_range)
    return lagged.drop(new_col_name, axis = 1)

PRICE AGGREGATE 

First, let's make aggregate based on price :

In [None]:
# Lagged monthly mean price per grouping: (group-by columns, feature name, lags).
# The entries are applied in this order.
price_lag_specs = [
    (['date_block_num'],                       'date_price_mean',               [1]),
    (['item_id','date_block_num'],             'date_item_price_mean',          [1,2]),
    (['shop_id','date_block_num'],             'date_shop_price_mean',          [1,2]),
    (['sub_cat','shop_id','date_block_num'],   'date_shop_subcat_price_mean',   [1]),
    (['item_category_id','date_block_num'],    'date_itemcat_price_mean',       [1]),
    (['main_cat','date_block_num'],            'date_maincat_price_mean',       [1]),
    (['shop_type','date_block_num'],           'date_shop_type_price_mean',     [1]),
    (['item_id','shop_city','date_block_num'], 'date_item_shopcity_price_mean', [1]),
]

for group_cols, feature_name, lags in price_lag_specs:
    all_data = agg_and_lag(all_data, col_groupby=group_cols,
                           target_col='item_price_month',
                           new_col_name=feature_name,
                           agg_function='mean', shift_range=lags)

ITEM_CNT_MONTH AGGREGATE 

Then, the aggregates based on the item_cnt_month feature :

In [None]:
# Lagged monthly mean of the target per grouping: (group-by columns, feature name, lags).
# The entries are applied in this order.
count_lag_specs = [
    (['item_id','date_block_num'],             'date_item_month_mean',          [1,2]),
    (['shop_id','date_block_num'],             'date_shop_month_mean',          [1,2]),
    (['item_category_id','date_block_num'],    'date_itemcat_month_mean',       [1]),
    (['shop_id','item_id','date_block_num'],   'date_item_shop_month_mean',     [1]),
    (['sub_cat','shop_id','date_block_num'],   'date_item_subcat_month_mean',   [1]),
    (['main_cat','shop_id','date_block_num'],  'date_item_maincat_month_mean',  [1]),
    (['item_id','shop_city','date_block_num'], 'date_item_shopcity_month_mean', [1]),
    (['date_block_num'],                       'date_month_mean',               [1]),
]

for group_cols, feature_name, lags in count_lag_specs:
    all_data = agg_and_lag(all_data, col_groupby=group_cols,
                           target_col='item_cnt_month',
                           new_col_name=feature_name,
                           agg_function='mean', shift_range=lags)

Finally, we create lags for the target itself and item_price_month :

In [None]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# Per (shop, item) lags of the target: 1, 2, 3 and 12 months back.
all_data = create_lags(all_data, index_cols=index_cols,
                       column_to_lag='item_cnt_month', shift_range=[1,2,3,12])
# Per (shop, item) lags of the monthly mean price: 1 and 2 months back.
all_data = create_lags(all_data, index_cols=index_cols,
                       column_to_lag='item_price_month', shift_range=[1,2])

In [None]:
# The raw tables are fully merged into all_data; release them.
del sales, item_cats, items

We don't need to use data from the first 3 months since we created lags.

In [None]:
# Drop month blocks 0-2. Note that the 12-month lag created above is still
# zero-filled (not observed) for every block below 12.
all_data = all_data[all_data['date_block_num'] >= 3]  

gc.collect();

In [None]:
# Cache the lag-augmented dataset so the feature engineering can be skipped on re-runs.
all_data.to_pickle('all_data_lags.pkl')

In [None]:
#all_data = pd.read_pickle('all_data_lags.pkl')

I tried mean encoding for categorical variables but I ended up not using it because of more overfitting.

In [None]:
#all_data_enc = mean_encoding_reg(all_data,'shop_city','item_cnt_month', 5)
#all_data_enc = mean_encoding_reg(all_data_enc,'main_cat','item_cnt_month', 5)
#all_data_enc = mean_encoding_reg(all_data_enc,'month','item_cnt_month', 5)
#all_data_enc = mean_encoding_reg(all_data_enc,'item_category_id','item_cnt_month', 5)

#all_data_enc.to_pickle('all_data_enc.pkl')

In [None]:
#all_data_enc = pd.read_pickle('all_data_enc.pkl')

We store the target and date and then delete from all_data.

In [None]:
# Keep the month block (used for the time-based split) and the target aside.
date = all_data['date_block_num']
target = all_data['item_cnt_month']

In [None]:
# Remove the target and the same-month price from the feature matrix:
# only their lagged versions remain as features.
all_data = all_data.drop('item_cnt_month', axis = 1)
all_data = all_data.drop('item_price_month', axis = 1 )

REMOVING FEATURE TOO CORRELATED / USELESS ONE

In [None]:
 def correlatedDropper( df_data, thresh = 0.95):

        """
        Remove highly correlated columns from a ``pandas.DataFrame``.

        For each pair of columns whose pairwise correlation (as computed by
        ``DataFrame.corr``) is strictly greater than ``thresh``, the column
        that comes later in the column order is dropped. Only positive
        correlations above the threshold are considered.

        Parameters
        ----------
        df_data : pandas.DataFrame
            Input data; it is copied, not modified.
        thresh : float
            Correlation threshold above which a column is dropped.

        Returns
        -------
        (pandas.DataFrame, list)
            The reduced copy and the list of dropped column names.
        """
        df = df_data.copy()
        corr_matrix = df.corr()

        # Keep only the strict upper triangle so each pair is inspected once.
        # The builtin `bool` is used: the `np.bool` alias was removed in NumPy 1.24.
        upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper = corr_matrix.where(upper_mask)

        cols_to_drop = [column for column in upper.columns if (upper[column] > thresh).any()]

        df = df.drop(cols_to_drop, axis=1)
        return df, cols_to_drop

In [None]:
# Only `col_drop` is inspected afterwards. The reduced frame gets its own name
# so that it no longer overwrites the `test` DataFrame loaded from test.csv.
all_data_uncorr, col_drop = correlatedDropper(all_data, 0.95)

In [None]:
# Columns flagged as correlated above the 0.95 threshold.
col_drop

After testing, I decided to keep these columns.

 # **Splitting the dataset in train / validation / test set**

The last month (34) corresponds to the test set. Since this is time-series data, we use the month before it (33) as the validation set.

In [None]:
last_block_num = 33

# Time-based split: block 34 is the test month, block 33 the validation
# month, and everything before it is used for training.
X_test = all_data[date == 34]
X_valid = all_data[date == last_block_num]
X_train = all_data[date < last_block_num]

y_train = target.loc[date < last_block_num].values
y_valid = target.loc[date == last_block_num].values


# Positions of all columns whose dtype is not float32.
categorical_feat_ind = np.where(X_train.dtypes != np.float32)[0]
print(categorical_feat_ind)

# Not used: the models below are fitted without an explicit categorical feature list.
#Pool1 = Pool(X_train, y_train, categorical_feat_ind)

#Pool1 = Pool(X_train, y_train, categorical_feat_ind)

In [None]:
# The split frames are all that is needed from here on.
del all_data, target, date

gc.collect()

# **Bayesian optimization for hyperparameters**

I found this idea thanks to Mazoub parpanchi who implemented it for lgb  in his great kernel : https://www.kaggle.com/masoudmzb/bayesian-optimization-for-lgb
Give it a look !        

Instead of using GridSearchCV, I decided to use Bayesian optimization. Bayesian optimization can quickly find good hyperparameters, often in fewer than 10 iterations, using Gaussian processes. To use it, we define an objective function, and the algorithm searches for the parameter values that maximize it.
Here, the objective function trains the CatBoost model and returns its RMSE, and the parameters are the hyperparameters (such as the bagging temperature, l2_leaf_reg, ...).
The library can only maximize the objective, so the function returns -RMSE since we want to minimize the RMSE.
Below is the link to the repo for a more detailed explanation:
https://github.com/fmfn/BayesianOptimization.
Using GPU is of course the best choice.
                                                                                                                        

In [None]:
def catboost_rmse(X_train,y_train,
                  X_valid, y_valid,
                  loss_function='RMSE',
                  iterations=400,
                  random_seed=0,
                  learning_rate=0.15,
                  depth=5,
                  l2_leaf_reg=1,
                  one_hot_max_size=200,
                  min_data_in_leaf = 1,
                  bagging_temperature = 1,
                  border_count = 32,
                  max_ctr_complexity = 2
               ) :

    '''
    Objective for the hyperparameter search.

    Trains a GPU CatBoostRegressor on (X_train, y_train) with the given
    hyperparameters, using (X_valid, y_valid) as the evaluation set, prints
    the best train/validation RMSE and the R2 of the clipped validation
    predictions, and returns the NEGATIVE best validation RMSE (so that a
    maximizer minimizes the RMSE).
    '''

    model_settings = dict(
        # tunable hyperparameters
        loss_function=loss_function,
        iterations=iterations,
        random_seed=random_seed,
        learning_rate=learning_rate,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        one_hot_max_size=one_hot_max_size,
        min_data_in_leaf=min_data_in_leaf,
        bagging_temperature=bagging_temperature,
        border_count=border_count,
        max_ctr_complexity=max_ctr_complexity,
        # fixed settings
        thread_count=-1,
        od_type='Iter',
        od_wait=20,
        task_type='GPU',
        devices='0',
        verbose=100,
    )
    catmodel = CatBoostRegressor(**model_settings)
    catmodel.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=False)

    # Predictions are clipped to the same [0, 20] range as the target.
    valid_pred = catmodel.predict(X_valid).clip(0,20)

    best_scores = catmodel.best_score_
    best_rmse_train = best_scores['learn']['RMSE']
    best_rmse_valid = best_scores['validation']['RMSE']
    print('train_rmse : {:.4f}, test_rmse : {:.4f}'.format(best_rmse_train, best_rmse_valid))

    r2 = r2_score(y_valid, valid_pred)
    print('the R2 score is {:.4f}'.format(r2))

    return - best_rmse_valid

In [None]:
def optimize_catboost(X_train,y_train,X_valid,y_valid, param_probe=None,
                      init_points=5, n_iter=3) :
    '''
    Run a Bayesian optimization of the CatBoost hyperparameters.

    X_train, y_train : training data passed to `catboost_rmse`
    X_valid, y_valid : validation data passed to `catboost_rmse`
    param_probe      : optional dict of hyperparameters evaluated first, in
                       order to guide the optimization
    init_points      : number of random exploration points (default 5)
    n_iter           : number of Bayesian optimization steps (default 3)

    Prints the best result found and returns the optimizer.
    '''
    def catboost_wrapper(learning_rate, depth, one_hot_max_size, 
                         min_data_in_leaf, bagging_temperature,
                         border_count, max_ctr_complexity,
                         loss_function = 'RMSE', l2_leaf_reg = 1, 
                         random_seed = 0,  iterations = 1000) :
        '''
        Objective seen by the optimizer: the optimizer proposes floats, so
        integer-valued hyperparameters are truncated with int() before being
        forwarded to `catboost_rmse`.
        '''

        return catboost_rmse(X_train=X_train, y_train=y_train ,
                             X_valid=X_valid ,y_valid=y_valid, 
                             iterations = int(iterations),
                             random_seed = random_seed, 
                             learning_rate=learning_rate, 
                             depth=int(depth),
                             l2_leaf_reg=l2_leaf_reg,
                             one_hot_max_size= int(one_hot_max_size), 
                             min_data_in_leaf= int(min_data_in_leaf),
                             bagging_temperature = bagging_temperature,
                             border_count = int(border_count), 
                             max_ctr_complexity = int(max_ctr_complexity)
              )

    # Search space: (lower, upper) bound of every tuned hyperparameter.
    optimizer = BayesianOptimization(
        f = catboost_wrapper,
        pbounds= {
            "learning_rate" : (0.01,0.50),
            "depth" : (2,10),
            "l2_leaf_reg" : (1,50),
            "one_hot_max_size" : (2,230),
            "min_data_in_leaf" : (1,10),
            "bagging_temperature" : (0,100),
            "border_count" : (30, 250),
            "max_ctr_complexity" : (1,4)          
        },
        random_state = 63,
        verbose = 30,
    )
    if param_probe is not None :
        # Queue the known-good point; it is evaluated when maximize() starts.
        optimizer.probe(
        params= param_probe,
        lazy=True,
        )

    optimizer.maximize(n_iter=n_iter,
                      init_points=init_points)

    print("Final result:", optimizer.max)
    return optimizer

Since I ran this many times, I found the parameters below. Let's probe them with the optimizer in order to guide the optimization and see if we can find better parameters.

In [None]:
# Best point found in earlier optimization runs, in the optimizer's raw
# (float) parameter space; integer-valued parameters are truncated by the wrapper.
params_probe = {'bagging_temperature': 0.28300538794137076, 
                'border_count': 247.51179202909668, 
                'depth': 8.478936311173683, 
                'l2_leaf_reg': 7.73213368476749,
                'learning_rate': 0.10290800942292017,
                'max_ctr_complexity': 2.9452851946784433, 
                'min_data_in_leaf': 2.383458113416781, 
                'one_hot_max_size': 209.4502766887087}

In [None]:
# Run the search on the full feature set, starting from the probed point.
print(Colours.green(""" -- Optimizing Catboost Parameters -- """))
optimizer_cat = optimize_catboost(X_train=X_train, y_train=y_train, 
                                  X_valid=X_valid, y_valid=y_valid,
                                  param_probe=params_probe)

Not for this run, so let's truncate the integer parameters and add the constant ones. We stop the training after 8 iterations without test RMSE improving. You can of course increase the number of iterations for a better mapping of the objective function.

In [None]:
# Final CatBoost settings: the probed hyperparameters with every
# integer-valued one truncated exactly as the optimization wrapper does via
# int(), plus the constant training settings.
params_catboost =  {'bagging_temperature': 0.28300538794137076,
                    'border_count': 247, 
                    'depth': 8, 
                    'l2_leaf_reg': 7.73213368476749,
                    'learning_rate': 0.10290800942292017,
                    'max_ctr_complexity': 2, 
                    # truncated like the other integer parameters (was left as 2.38...)
                    'min_data_in_leaf': 2, 
                    'one_hot_max_size': 209,
                    'random_seed' : 0,
                    'thread_count' : -1,
                    'task_type' : 'GPU',
                    'devices' : '0', 
                    'od_type' : 'Iter',
                    'od_wait' :  20,
                    'verbose' : 50
                   }

In [None]:
# Baseline model on the full feature set, evaluated on the validation month.
catmodel = CatBoostRegressor(**params_catboost)

catmodel.fit(X_train,y_train,
             eval_set=(X_valid,y_valid),
             plot=False
            )

# Validation predictions are clipped to the target range [0, 20] before scoring.
predict = catmodel.predict(X_valid).clip(0,20)
r2 = r2_score(y_valid,predict)
print( 'r2_score est {}'.format(r2))

 # **Feature Selection **

Since X has a lot of features, we can reduce them in order to reduce overfitting.

In [None]:
def get_feature_imp(catmodel, method, X_train, y_train, X_test, y_test, plot = False):
    '''
    Compute CatBoost feature importances on (X_test, y_test).

    catmodel : fitted CatBoost model
    method   : importance type, either
               - "PredictionValuesChange"
               - "LossFunctionChange"
    X_train, y_train : accepted for interface symmetry; not used
    X_test, y_test   : data on which the importances are evaluated
    plot     : when True, also draw a bar chart of the scores

    Returns a DataFrame with columns 'Feature' and 'Score', sorted by
    decreasing score.
    '''
    eval_pool = Pool(X_test, label=y_test)
    importances = catmodel.get_feature_importance(eval_pool, type=method)

    name_score_pairs = list(zip(X_test.columns, importances))
    feature_score = (
        pd.DataFrame(name_score_pairs, columns=['Feature','Score'])
          .sort_values(by='Score', ascending=False)
    )

    if plot :
        plt.rcParams["figure.figsize"] = (12,7)
        ax = feature_score.plot('Feature', 'Score', kind='bar', color='c')
        ax.set_title("Feature Importance using {}".format(method), fontsize = 14)
        ax.set_xlabel("features")
        plt.show()

    return feature_score

In [None]:
# Importances of the baseline model, measured on the validation month.
feature_pred = get_feature_imp(catmodel=catmodel, method='PredictionValuesChange', 
                               X_train=X_train, y_train=y_train,
                               X_test=X_valid, y_test=y_valid,
                               plot=True)

In [None]:
# Features whose importance score is below 0.01 are considered useless.
feature_useless_pred = feature_pred[feature_pred['Score'] < 0.01]   #0.1
feature_useless_pred.Feature

I decided to remove the feature with virtually no impact on the prediction.

In [None]:
# Drop the near-zero-importance features from all three splits, but always
# keep date_block_num (kept after tests). It is excluded by name rather than
# by assuming it is the last row of `feature_useless_pred`.
useless_cols = [col for col in feature_useless_pred.Feature if col != 'date_block_num']

X_train_pred = X_train.drop(useless_cols, axis = 1)
X_valid_pred = X_valid.drop(useless_cols, axis = 1)
X_test_pred = X_test.drop(useless_cols, axis = 1)

Below is another method for performing feature selection, but I did not use it in the end because it performed worse on the LB.

In [None]:
# Same baseline model, scored with the loss-based importance type.
feature_loss = get_feature_imp(catmodel,'LossFunctionChange',
                               X_train, y_train, 
                               X_valid, y_valid,
                               plot=True)

In [None]:
# Features with a negative LossFunctionChange score.
feature_useless_loss = feature_loss[feature_loss['Score'] < 0.0]
feature_useless_loss.Feature

PREDICTION VALUE CHANGE

Let's run a model based on this shrunken X.

In [None]:
# Same settings as the baseline, trained on the reduced feature set.
catmodel_pred = CatBoostRegressor(**params_catboost)

catmodel_pred.fit(X_train_pred,y_train,
             eval_set=(X_valid_pred,y_valid),
             plot=False
            )

# Validation predictions are clipped to the target range [0, 20] before scoring.
predict = catmodel_pred.predict(X_valid_pred).clip(0,20)
r2 = r2_score(y_valid,predict)
print( 'r2_score est {}'.format(r2))

The test RMSE decreased and the train RMSE increased. That is exactly what we wanted, since it means less overfitting. My LB score improved after this.
We can try to improve it further by running another optimization. For me, even though the RMSE decreased, the LB score did not improve, but you can try it yourself.

In [None]:
# Second hyperparameter search, this time on the reduced feature set.
print(Colours.green(""" -- Optimizing Catboost Parameters -- """))
optimizer_cat_loss = optimize_catboost(X_train=X_train_pred, y_train=y_train,
                                       X_valid=X_valid_pred, y_valid=y_valid,
                                       param_probe=params_probe)

Now, let's remove the old X and save the new one.

In [None]:
# Only the reduced feature sets are used from here on.
del X_train, X_valid

In [None]:
# Cache the reduced train/validation sets on disk.
X_train_pred.to_pickle('X_train_pred.pkl')
X_valid_pred.to_pickle('X_valid_pred.pkl')

In [None]:
#X_train_pred = pd.read_pickle('X_train_pred.pkl')
#X_valid_pred = pd.read_pickle('X_valid_pred.pkl')

To check the size of your variables, if you have memory issues.

# **Ensembling and final prediction**

For ensembling, I use simple bagging, where the models are identical except for the random seed.
After using 10 different seeds, the overall prediction performed worse, so I decided to keep only the best seeds.

In [None]:
def find_top_seeds(X_train,y_train,X_valid,y_valid,params, range_min = 0, range_max = 10):
    '''
    Train one CatBoost model per random seed and rank the seeds.

    X_train, y_train : training data
    X_valid, y_valid : validation data used as eval set
    params           : CatBoostRegressor parameters; the dict is NOT modified,
                       'random_seed' is overridden on a copy for each run
    range_min, range_max : seeds range_min .. range_max - 1 are tried

    Returns a DataFrame with columns 'state' (seed, int32) and 'rmse' (best
    validation RMSE), sorted by increasing rmse so the best seed comes first.
    '''
    records = []

    for seed in range(range_min, range_max) :
        print('--- random state : {} ---\n'.format(seed))

        # Work on a copy so the caller's parameter dict keeps its own seed.
        seed_params = dict(params, random_seed=seed)
        catmodel = CatBoostRegressor(**seed_params)
        catmodel.fit(X_train,y_train,
             eval_set=(X_valid,y_valid),
             plot=False
            )

        records.append({'state': seed,
                        'rmse': catmodel.best_score_['validation']['RMSE']})

    # Build the frame once instead of growing it row by row.
    metric_dict = pd.DataFrame(records, columns=['state','rmse'])
    metric_dict['state'] = metric_dict['state'].astype(np.int32)

    # `ascending` must be a real boolean: the previous string 'False' was
    # truthy (ascending sort) and is rejected by recent pandas versions.
    return metric_dict.sort_values(by='rmse', ascending=True)

In [None]:
# Compare seeds 0-9 on the reduced feature set.
metric = find_top_seeds(X_train_pred,y_train,X_valid_pred, y_valid,
                        params=params_catboost, range_min=0, range_max=10)

In [None]:
# Bar chart of the best validation RMSE obtained with each seed.
plt.rcParams["figure.figsize"] = (7,7)
ax = metric.plot('state', 'rmse', kind='bar', color='c')
ax.set_title("rmse en fonction du shuffling", fontsize = 10)
ax.set_xlabel("state")
plt.show()  

After submitting to kaggle several times, I only kept seeds  4,6,8.

In [None]:
# Bagging: train one model per selected seed and collect its test predictions.
predictions_test = []   # 0.92949 for 2,4,6,8
   
for i in [4,8,6]:
    print('random state {}'.format(i))
    
    # Only the seed changes between the bagged models.
    params_catboost['random_seed'] = i
    
    catmodel_bagging_pred = CatBoostRegressor(**params_catboost)
    catmodel_bagging_pred.fit(X_train_pred,y_train,
             eval_set=(X_valid_pred,y_valid),
             plot=False
            )

    # Validation R2 of this seed, reported for information only.
    predict = catmodel_bagging_pred.predict(X_valid_pred).clip(0,20)
    r2 = r2_score(y_valid,predict)
    print( 'r2_score est {}'.format(r2))
    
    # Test predictions are clipped to the target range [0, 20].
    predictions_test.append(catmodel_bagging_pred.predict(X_test_pred).clip(0,20))

In [None]:
# Final prediction = element-wise mean over the bagged models.
predictions =  np.mean(np.array(predictions_test), axis = 0)

In [None]:
# Write the submission file. The default 0..n-1 index is exported as `ID`.
# NOTE(review): this assumes the rows of X_test_pred are in the same order as
# test.csv, so that the position matches the test ID — TODO confirm.
dt_test_opt = pd.DataFrame()
dt_test_opt['item_cnt_month'] = predictions
dt_test_opt.index.name='ID'
dt_test_opt.to_csv('submission_featurev11_bagging_[4_6_8].csv')