In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
import pickle
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [20]:
transactions    = pd.read_csv('sales_train.csv.gz')
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
test            = pd.read_csv('test.csv.gz')

In [21]:
INDEX_COLS = ['item_id', 'shop_id', 'date_block_num']

In [22]:
transactions = pd.merge(transactions, items, on='item_id', how='left')
transactions = transactions.drop('item_name', axis=1)
transactions = transactions[transactions['date_block_num'] > 12]
transactions.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
1366911,01.02.2014,13,27,15242,699.0,1.0,63
1366912,25.02.2014,13,27,15200,299.0,1.0,69
1366913,19.02.2014,13,27,15279,799.0,1.0,63
1366914,26.02.2014,13,27,15202,299.0,1.0,69
1366915,01.02.2014,13,27,14888,549.0,1.0,55


In [23]:
train_item_ids = transactions['item_id'].unique()
train_shop_ids = transactions['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = transactions['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [24]:
combinations = []
for shop in all_shop_ids:
    #get all article ids ever associated to this shop
    train_ids = transactions[transactions['shop_id'] == shop]['item_id'].unique()
    test_ids = test[test['shop_id'] == shop]['item_id'].unique()
    all_shop = np.unique(np.append(train_ids, test_ids))
    all_shop_combo = [[item, shop, block] for item in all_shop for block in train_blocks]
    for combo in all_shop_combo:
        combinations.append(combo)

In [25]:
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['item_id', 'shop_id', 'date_block_num'])

In [26]:
all_combos = pd.merge(all_combos, items[['item_id', 'item_category_id']], on='item_id', how='left')

In [27]:
all_combos = pd.merge(all_combos, transactions[['item_price', 'item_id', 'shop_id']], on=['item_id', 'shop_id'], how='left')

In [28]:
all_combos.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_category_id,item_price
0,0,54,13,40,58.0
1,0,54,14,40,58.0
2,0,54,15,40,58.0
3,0,54,16,40,58.0
4,0,54,17,40,58.0


In [29]:
transactions['y'] = transactions.groupby(['date_block_num','shop_id','item_id'])['item_cnt_day'].transform(np.sum).clip(0,20)

In [30]:
training = pd.merge(all_combos,transactions,on=['item_id', 'shop_id', 'date_block_num'],how='left')

In [32]:
training.drop('date',inplace=True, axis=1)

In [34]:
training.drop('item_category_id_y', inplace=True, axis=1)

In [33]:
training.drop('item_price_y', inplace=True, axis=1)

In [35]:
training.rename(columns={'item_category_id_x': 'item_category_id'}, inplace=True)

In [36]:
training.rename(columns={'item_price_x': 'item_price'}, inplace=True)

In [37]:
training.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_category_id,item_price,item_cnt_day,y
0,0,54,13,40,58.0,,
1,0,54,14,40,58.0,,
2,0,54,15,40,58.0,,
3,0,54,16,40,58.0,,
4,0,54,17,40,58.0,,


In [67]:
gc.collect()

transactions['item_cnt_sum'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['item_cnt_mean'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['item_cnt_min'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.min)
transactions['item_cnt_max'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.max)
transactions['item_price_mean'] = transactions.groupby(['item_id', 'date_block_num'])['item_price'].transform(np.mean)


transactions['shop_cnt_sum'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['shop_cnt_mean'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['shop_cnt_min'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.min)
transactions['shop_cnt_max'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.max)
transactions['shop_price_mean'] = transactions.groupby(['shop_id', 'date_block_num'])['item_price'].transform(np.mean)

transactions['category_cnt_sum'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['category_cnt_mean'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['category_cnt_min'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.min)
transactions['category_cnt_max'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.max)
transactions['category_price_mean'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_price'].transform(np.mean)


transactions['shop_category_cnt_sum'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['shop_category_cnt_mean'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['shop_category_cnt_min'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.min)
transactions['shop_category_cnt_max'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.max)
transactions['shop_category_price_mean'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_price'].transform(np.mean)

In [65]:
transactions.columns

Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day', 'item_category_id', 'y', 'item_cnt_sum',
       'item_cnt_mean', 'item_price_mean', 'shop_cnt_sum', 'shop_cnt_mean',
       'shop_price_mean', 'category_cnt_sum', 'category_cnt_mean',
       'category_price_mean', 'shop_category_cnt_sum',
       'shop_category_cnt_mean', 'shop_category_price_mean'],
      dtype='object')

In [16]:
training['y'] = training['y'].fillna(0)

In [17]:
training = training.sample(frac=1)

In [18]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

training_int = training.select_dtypes(include=['int'])
converted_int = training_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(training_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([training_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

294.60 MB
95.74 MB


Unnamed: 0,before,after
uint8,,3.0
uint16,,1.0
int64,4.0,


In [19]:
training_float = training.select_dtypes(include=['float'])
converted_float = training_float.apply(pd.to_numeric,downcast='float')

print(mem_usage(training_float))
print(mem_usage(converted_float))

compare_floats = pd.concat([training_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

942.71 MB
500.81 MB


Unnamed: 0,before,after
float32,,15.0
float64,15.0,


In [20]:
optimized_training = training.copy()

optimized_training[converted_int.columns] = converted_int
optimized_training[converted_float.columns] = converted_float

print(mem_usage(training))
print(mem_usage(optimized_training))

1443.04 MB
802.29 MB


In [21]:
training = optimized_training
del optimized_training
gc.collect()

14

In [68]:
lag_columns = training.columns[8:]
lags = [1,2,3,6,12]

In [69]:
lag_columns

Index(['item_cnt_sum', 'item_cnt_mean', 'item_price_mean', 'shop_cnt_sum',
       'shop_cnt_mean', 'shop_price_mean', 'category_cnt_sum',
       'category_cnt_mean', 'category_price_mean', 'shop_category_cnt_sum',
       'shop_category_cnt_mean', 'shop_category_price_mean',
       'item_cnt_sum_lag_1', 'item_cnt_mean_lag_1', 'item_price_mean_lag_1',
       'shop_cnt_sum_lag_1', 'shop_cnt_mean_lag_1', 'shop_price_mean_lag_1',
       'category_cnt_sum_lag_1', 'category_cnt_mean_lag_1',
       'category_price_mean_lag_1', 'shop_category_cnt_sum_lag_1',
       'shop_category_cnt_mean_lag_1', 'shop_category_price_mean_lag_1',
       'item_cnt_sum_lag_2', 'item_cnt_mean_lag_2', 'item_price_mean_lag_2',
       'shop_cnt_sum_lag_2', 'shop_cnt_mean_lag_2', 'shop_price_mean_lag_2',
       'category_cnt_sum_lag_2', 'category_cnt_mean_lag_2',
       'category_price_mean_lag_2', 'shop_category_cnt_sum_lag_2',
       'shop_category_cnt_mean_lag_2', 'shop_category_price_mean_lag_2',
       'item_cnt_

In [24]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

merge_columns = ['lagged_block','item_id','shop_id']

for lag in lags:
    print(lag)
    lagged = transactions.copy()
    lagged.rename(columns={'date_block_num':'lagged_block'},inplace=True)
    training['lagged_block'] = training['date_block_num'] - lag
    lagged_names = [lagged_name(c,lag) for c in lag_columns]
    lag_mapping = dict(zip(lag_columns, lagged_names))
    lagged.rename(columns=lag_mapping,inplace=True)
    training = pd.merge(training,lagged[lagged_names+merge_columns],on=merge_columns,how='left')
    del lagged
    gc.collect()

1
2
3
6
12


In [25]:
len(training)

7722666

In [None]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training[(training['item_id'] == 30) & (training['shop_id'] == 30)].sort_values(by='date_block_num')

In [70]:
training.columns

Index(['item_id', 'shop_id', 'date_block_num', 'item_category_id', 'date',
       'item_price', 'item_cnt_day', 'y', 'item_cnt_sum', 'item_cnt_mean',
       'item_price_mean', 'shop_cnt_sum', 'shop_cnt_mean', 'shop_price_mean',
       'category_cnt_sum', 'category_cnt_mean', 'category_price_mean',
       'shop_category_cnt_sum', 'shop_category_cnt_mean',
       'shop_category_price_mean', 'item_cnt_sum_lag_1', 'item_cnt_mean_lag_1',
       'item_price_mean_lag_1', 'shop_cnt_sum_lag_1', 'shop_cnt_mean_lag_1',
       'shop_price_mean_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'category_price_mean_lag_1',
       'shop_category_cnt_sum_lag_1', 'shop_category_cnt_mean_lag_1',
       'shop_category_price_mean_lag_1', 'item_cnt_sum_lag_2',
       'item_cnt_mean_lag_2', 'item_price_mean_lag_2', 'shop_cnt_sum_lag_2',
       'shop_cnt_mean_lag_2', 'shop_price_mean_lag_2',
       'category_cnt_sum_lag_2', 'category_cnt_mean_lag_2',
       'category_price_mean_lag_2', 'sho

In [27]:
import pickle as pickle

pickle.dump(training, open( "training", "wb"), protocol=4)

#training = pickle.load( open( "training", "rb" ) )

In [None]:
training.drop('lagged_block', inplace=True, axis=1)

In [47]:
lgbm_features = training.columns[17:]
lgbm_features = np.append( ['item_id', 'shop_id', 'item_category_id', 'date_block_num'], lgbm_features)
lgbm_features

array(['item_id', 'shop_id', 'item_category_id', 'date_block_num',
       'shop_category_cnt_sum', 'shop_category_cnt_mean',
       'shop_category_price_mean', 'item_cnt_sum_lag_1',
       'item_cnt_mean_lag_1', 'item_price_mean_lag_1',
       'shop_cnt_sum_lag_1', 'shop_cnt_mean_lag_1',
       'shop_price_mean_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'category_price_mean_lag_1',
       'shop_category_cnt_sum_lag_1', 'shop_category_cnt_mean_lag_1',
       'shop_category_price_mean_lag_1', 'item_cnt_sum_lag_2',
       'item_cnt_mean_lag_2', 'item_price_mean_lag_2',
       'shop_cnt_sum_lag_2', 'shop_cnt_mean_lag_2',
       'shop_price_mean_lag_2', 'category_cnt_sum_lag_2',
       'category_cnt_mean_lag_2', 'category_price_mean_lag_2',
       'shop_category_cnt_sum_lag_2', 'shop_category_cnt_mean_lag_2',
       'shop_category_price_mean_lag_2', 'item_cnt_sum_lag_3',
       'item_cnt_mean_lag_3', 'item_price_mean_lag_3',
       'shop_cnt_sum_lag_3', 'shop_cnt_me

In [54]:
lgbm_features = ['item_id', 'shop_category_cnt_mean', 'date_block_num',
       'shop_category_cnt_sum', 'shop_id', 'item_cnt_mean_lag_1',
       'item_category_id', 'shop_category_price_mean',
       'item_cnt_sum_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'item_price_mean_lag_1']

In [55]:
categorical_features_indices = [0,2,4,6]

In [49]:
x_train = training[training['date_block_num'] < 32]

non_zeros_train = x_train[x_train['y'] != 0]
zeros_train = x_train[x_train['y'] == 0]
zeros_keep_train = zeros_train.sample(int(len(non_zeros_train)*0.1))
x_train = non_zeros_train.append(zeros_keep_train)

y_train = x_train['y']


val = training[training['date_block_num'].isin([32,33])]
non_zeros_val = val[val['y'] != 0]
zeros_val = val[val['y'] == 0]
zeros_keep_val = zeros_val.sample(int(len(non_zeros_val)*0.5))
x_val = non_zeros_val.append(zeros_keep_val)
y_val = x_val['y']


In [45]:
columns = training.columns
percent_missing = training.isna().sum() * 100 / len(training)
pd.DataFrame({'column_name': columns,
                                 'percent_missing': percent_missing})

Unnamed: 0,column_name,percent_missing
item_id,item_id,0.000000
shop_id,shop_id,0.000000
date_block_num,date_block_num,0.000000
item_category_id,item_category_id,0.000000
date,date,88.759322
item_price,item_price,88.759322
item_cnt_day,item_cnt_day,88.759322
y,y,0.000000
item_cnt_sum,item_cnt_sum,88.759322
item_cnt_mean,item_cnt_mean,88.759322


In [56]:
cb_model = CatBoostRegressor(iterations=10000,
                             learning_rate=0.01,
                             eval_metric='RMSE',
                             #thread_count=16,
                             #task_type = "GPU",
                             use_best_model=True,
                             #l2_leaf_reg = 1000,
                            od_type = "Iter",
                            od_wait = 30,
                             #random_strength = 10,
                             #bagging_temperature = 1,
                             #one_hot_max_size = 2,
                             random_seed = 23)


cb_model.fit(x_train[lgbm_features], y_train, cat_features=categorical_features_indices,
             eval_set=(x_val[lgbm_features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

0:	learn: 3.2514231	test: 2.8014817	best: 2.8014817 (0)	total: 736ms	remaining: 2h 2m 39s
1:	learn: 3.2343019	test: 2.7875272	best: 2.7875272 (1)	total: 1.42s	remaining: 1h 58m 11s
2:	learn: 3.2176510	test: 2.7737440	best: 2.7737440 (2)	total: 2.18s	remaining: 2h 59s
3:	learn: 3.2010470	test: 2.7609437	best: 2.7609437 (3)	total: 2.92s	remaining: 2h 1m 31s
4:	learn: 3.1847869	test: 2.7473764	best: 2.7473764 (4)	total: 3.48s	remaining: 1h 55m 58s
5:	learn: 3.1687048	test: 2.7343785	best: 2.7343785 (5)	total: 4.21s	remaining: 1h 56m 45s
6:	learn: 3.1528034	test: 2.7218842	best: 2.7218842 (6)	total: 4.82s	remaining: 1h 54m 47s
7:	learn: 3.1372456	test: 2.7090952	best: 2.7090952 (7)	total: 5.5s	remaining: 1h 54m 26s
8:	learn: 3.1219054	test: 2.6967922	best: 2.6967922 (8)	total: 6.07s	remaining: 1h 52m 21s
9:	learn: 3.1068597	test: 2.6843942	best: 2.6843942 (9)	total: 6.73s	remaining: 1h 52m 6s
10:	learn: 3.0921314	test: 2.6724134	best: 2.6724134 (10)	total: 7.33s	remaining: 1h 50m 53s
11:	l

KeyboardInterrupt: 

In [144]:
lgbm_features[np.argsort(cb_model.get_feature_importance())[::-1]]

array(['item_id', 'shop_category_cnt_mean', 'date_block_num',
       'shop_category_cnt_sum', 'shop_id', 'item_cnt_mean_lag_1',
       'item_category_id', 'shop_category_price_mean',
       'item_cnt_sum_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'item_price_mean_lag_1',
       'category_cnt_sum_lag_3', 'category_cnt_sum_lag_6',
       'category_price_mean_lag_1', 'category_cnt_sum_lag_2',
       'shop_category_cnt_sum_lag_1', 'shop_cnt_sum_lag_6',
       'shop_category_cnt_mean_lag_1', 'lagged_block',
       'category_cnt_mean_lag_6', 'shop_cnt_sum_lag_3',
       'item_cnt_sum_lag_2', 'shop_cnt_sum_lag_1',
       'category_cnt_mean_lag_2', 'item_cnt_sum_lag_3',
       'category_cnt_mean_lag_3', 'shop_category_cnt_mean_lag_3',
       'shop_category_price_mean_lag_1', 'item_cnt_mean_lag_3',
       'shop_cnt_sum_lag_2', 'item_price_mean_lag_2',
       'item_cnt_mean_lag_2', 'shop_cnt_mean_lag_12',
       'shop_category_price_mean_lag_3', 'item_price_mean_lag_3',


In [47]:
len(training.item_category_id.unique())

79

In [None]:
print('Plotting feature importances...')
ax = lgbm.plot_importance(model_lgb, max_num_features=10)
plt.show()

In [None]:
best_features_indices = np.argsort(model_lgb.feature_importance())[::-1][0:5]
lgbm_features[best_features_indices]

In [150]:
test_w_cat_ids = test.set_index('item_id').join(items.set_index('item_id'))
test_w_cat_ids['date_block_num'] = 34

In [156]:
lag_columns

Index(['item_cnt_sum', 'item_cnt_mean', 'item_price_mean', 'shop_cnt_sum',
       'shop_cnt_mean', 'shop_price_mean', 'category_cnt_sum',
       'category_cnt_mean', 'category_price_mean', 'shop_category_cnt_sum',
       'shop_category_cnt_mean', 'shop_category_price_mean'],
      dtype='object')

In [151]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

merge_columns = ['lagged_block','item_id','shop_id']

for lag in lags:
    print(lag)
    lagged = transactions.copy()
    lagged.rename(columns={'date_block_num':'lagged_block'},inplace=True)
    test_w_cat_ids['lagged_block'] = test_w_cat_ids['date_block_num'] - lag
    lagged_names = [lagged_name(c,lag) for c in lag_columns]
    lag_mapping = dict(zip(lag_columns, lagged_names))
    lagged.rename(columns=lag_mapping,inplace=True)
    test_w_cat_ids = pd.merge(test_w_cat_ids,lagged[lagged_names+merge_columns],on=merge_columns,how='left')
    del lagged
    gc.collect()

1
2
3
6
12


In [155]:
test_w_cat_ids.columns

Index(['ID', 'item_id', 'shop_id', 'item_name', 'item_category_id',
       'date_block_num', 'lagged_block', 'item_cnt_sum_lag_1',
       'item_cnt_mean_lag_1', 'item_price_mean_lag_1', 'shop_cnt_sum_lag_1',
       'shop_cnt_mean_lag_1', 'shop_price_mean_lag_1',
       'category_cnt_sum_lag_1', 'category_cnt_mean_lag_1',
       'category_price_mean_lag_1', 'shop_category_cnt_sum_lag_1',
       'shop_category_cnt_mean_lag_1', 'shop_category_price_mean_lag_1',
       'item_cnt_sum_lag_2', 'item_cnt_mean_lag_2', 'item_price_mean_lag_2',
       'shop_cnt_sum_lag_2', 'shop_cnt_mean_lag_2', 'shop_price_mean_lag_2',
       'category_cnt_sum_lag_2', 'category_cnt_mean_lag_2',
       'category_price_mean_lag_2', 'shop_category_cnt_sum_lag_2',
       'shop_category_cnt_mean_lag_2', 'shop_category_price_mean_lag_2',
       'item_cnt_sum_lag_3', 'item_cnt_mean_lag_3', 'item_price_mean_lag_3',
       'shop_cnt_sum_lag_3', 'shop_cnt_mean_lag_3', 'shop_price_mean_lag_3',
       'category_cnt_sum_lag_

In [None]:
len(test_w_cat_ids)

In [None]:
preds = model_lgb.predict(test_w_cat_ids[lgbm_features])
preds.clip(0,20,out=preds)

In [154]:
preds = cb_model.predict(test_w_cat_ids[lgbm_features])
preds.clip(0,20,out=preds)

KeyError: "['shop_category_cnt_mean' 'shop_category_cnt_sum'\n 'shop_category_price_mean'] not in index"

In [60]:
np.sum(preds)

96340.428045824

In [57]:
from sklearn import preprocessing
x_train_scaled = x_train[lgbm_features].copy()
x_train_scaled.fillna(0,inplace=True)
x_val_scaled = x_val[lgbm_features].copy()
x_val_scaled.fillna(0,inplace=True)


x_train_scaled = preprocessing.scale(x_train_scaled)
x_val_scaled = preprocessing.scale(x_val_scaled)

In [60]:
lr_model = LinearRegression().fit(x_train_scaled, y_train)
lr_val_preds = lr_model.predict(x_val_scaled).clip(0,20)
rmse = sqrt(mean_squared_error(y_val, lr_val_preds))
print("rmse: ", rmse)

rmse:  2.205736313122704


In [63]:
rf_model = RandomForestRegressor(random_state=0, n_jobs = 8, verbose=1).fit(x_train_scaled, y_train)

[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:   29.8s remaining:   19.9s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:   37.4s finished


In [64]:
rf_val_preds = rf_model.predict(x_val_scaled).clip(0,20)
rmse = sqrt(mean_squared_error(y_val, rf_val_preds))
print("rmse: ", rmse)

rmse:  3.0751675439681896


[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.1s finished


In [61]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = preds.astype(int)
#submission['item_cnt_month'] = ensemble_preds.astype(int)


submission.to_csv('submission.csv', index=False)