**Import Library**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from itertools import product
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Any results you write to the current directory are saved as output.

In [None]:
!pip install pymorphy2
!pip install pymorphy2-dicts
!pip install DAWG-Python

In [None]:
import pymorphy2

In [None]:
# Directory holding the competition CSV files; change it here when running outside Kaggle.
DATA_DIR="../input/competitive-data-science-predict-future-sales/"

sample_submissions=pd.read_csv(DATA_DIR+"sample_submission.csv")
sales_train_df=pd.read_csv(DATA_DIR+"sales_train.csv")
test_df=pd.read_csv(DATA_DIR+"test.csv")
items_df=pd.read_csv(DATA_DIR+"items.csv")
shops_df=pd.read_csv(DATA_DIR+"shops.csv")
item_categories_df=pd.read_csv(DATA_DIR+"item_categories.csv")

We look at the data in the tables

In [None]:
sample_submissions.head()

In [None]:
sales_train_df.head()

In [None]:
sales_train_df.info()

In [None]:
test_df.head()

In [None]:
test_df.info()

In [None]:
sales_train_df.describe()

In [None]:
test_df.describe()

It can be seen from these tables that in the training set shop_id ranges from 0 to 59, while in the test set it ranges from 2 to 59. Likewise, item_id ranges from 0 to 22169 in the training set and from 30 to 22167 in the test set. The two sets therefore do not cover the same ids. We'll check this more carefully.

In [None]:
print(sales_train_df["shop_id"].nunique())
print(test_df["shop_id"].nunique())

In [None]:
print(sales_train_df["item_id"].nunique())
print(test_df["item_id"].nunique())

In [None]:
plt.figure(figsize=(20,6))
sales_train_df["item_cnt_day"].hist(bins=200)

From this histogram we can see that most daily sales are close to one. But for some items daily sales reach 2,000 or more, which may be an outlier.

In [None]:
plt.figure(figsize=(20,6))
sales_train_df["item_price"].hist(bins=300)

From this histogram we can see that most item prices lie in the range from 0 to about 1000. But there are also items with huge prices, from 10000 to 300000. These may be outliers or simply expensive items.

In [None]:
plt.figure(figsize=(20,6))
sales_train_df["shop_id"].hist(bins=60)

In [None]:
plt.figure(figsize=(20,6))
test_df["shop_id"].hist(bins=60)

In [None]:
plt.figure(figsize=(20,6))
sales_train_df["item_id"].hist(bins=1000)

In [None]:
plt.figure(figsize=(20,6))
test_df["item_id"].hist(bins=1000)

In [None]:
nltk.download("stopwords")

In [None]:
_TEXT_RESOURCES={}

def _text_resources():
  """Return the cached (analyzer, stop-word set) pair, building it on first use."""
  if not _TEXT_RESOURCES:
    # Build the analyzer and the stop-word set once; the original rebuilt both
    # on every call, which is wasteful when the function is applied row by row.
    _TEXT_RESOURCES["morph"]=pymorphy2.MorphAnalyzer()
    _TEXT_RESOURCES["stop_words"]=(frozenset(stopwords.words("russian"))
                                   |frozenset(stopwords.words("english")))
  return _TEXT_RESOURCES["morph"],_TEXT_RESOURCES["stop_words"]

def text_preprocessing(text):
  """Normalise a text into a space-separated string of lower-case lemmas.

  Every run of characters other than Latin or Cyrillic letters is treated as a
  separator. Tokens are lower-cased, Russian and English stop words are
  removed, and each remaining token is replaced by the normal form of its
  first pymorphy2 parse.

  Args:
    text: the raw string to clean.

  Returns:
    The lemmas joined by single spaces ("" when nothing is left).
  """
  morph,stop_words=_text_resources()
  # Ё/ё lie outside the А-Я / а-я code-point ranges, so they are listed explicitly.
  stroka=re.sub('[^A-Za-zА-Яа-яЁё]+',' ',text)
  # split() without an argument drops the empty tokens that leading/trailing
  # separators would otherwise produce; lower-casing happens before the
  # stop-word test so that capitalised stop words are filtered too.
  words=[w.lower() for w in stroka.split()]
  return " ".join(morph.parse(w)[0].normal_form for w in words if w not in stop_words)

In [None]:
'''%%time
items_df["item_name"]=items_df["item_name"].apply(lambda x:text_preprocessing(x))'''

In [None]:
items_df["item_name"]

In [None]:
# Attach shop, item and item-category attributes to every sales record.
sales_train_df=(
    sales_train_df
    .join(shops_df.set_index("shop_id"),on="shop_id",how="left")
    .join(items_df.set_index("item_id"),on="item_id",how="left")
    .join(item_categories_df.set_index("item_category_id"),on="item_category_id",how="left")
)
sales_train_df.head().T

In [None]:
# Attach shop, item and item-category attributes to every test pair.
test_df=(
    test_df
    .join(shops_df.set_index("shop_id"),on="shop_id",how="left")
    .join(items_df.set_index("item_id"),on="item_id",how="left")
    .join(item_categories_df.set_index("item_category_id"),on="item_category_id",how="left")
)
test_df.head().T

In [None]:
plt.figure(figsize=(20,6))
sales_train_df["item_category_id"].hist(bins=84)

In [None]:
plt.figure(figsize=(20,6))
test_df["item_category_id"].hist(bins=84)

The graphs show that the test set was built by taking, for each shop, all the item-shop pairs that are present in the test dataset. You can build such a dataset from the training set by taking, for each month, all the unique item-shop pairs present in the test dataset.

In [None]:
sns.relplot(x="item_price",y="item_cnt_day",height=9,aspect=1,data=sales_train_df)

From this graph we see that in the dataset there are two points that can be outliers. One with sales over 2000. Another with a price of over 50000.

In [None]:
sales_train_df[sales_train_df.item_cnt_day>2000]

In [None]:
sales_train_df[sales_train_df.item_category_id==9]["item_cnt_day"].mean()

In [None]:
sales_train_df[sales_train_df.item_id==11373]["item_cnt_day"].mean()

In [None]:
sales_train_df[(sales_train_df.shop_id==12)&(sales_train_df.item_id==11373)]["item_cnt_day"].mean()

It seems that this point is really an outlier.

In [None]:
# Drop the outlier rows outright. Indexing with the boolean frame returned by
# DataFrame.isin(...) only masks the matching cells with NaN: the rows stay in
# the frame and the integer id columns are turned into floats.
sales_train_df=sales_train_df[~(sales_train_df.item_cnt_day>2000)]

In [None]:
sales_train_df[sales_train_df.item_price>50000]

In [None]:
# Drop the outlier rows outright. Indexing with the boolean frame returned by
# DataFrame.isin(...) only masks the matching cells with NaN: the rows stay in
# the frame and the integer id columns are turned into floats.
sales_train_df=sales_train_df[~(sales_train_df.item_price>50000)]

In [None]:
sns.relplot(x="item_price",y="item_cnt_day",height=9,aspect=1,data=sales_train_df)

In [None]:
sales_train_df[sales_train_df.item_cnt_day>800]

In [None]:
sales_train_df[sales_train_df.item_id==20949]["item_cnt_day"].mean()

In [None]:
sales_train_df[(sales_train_df.shop_id==12)&(sales_train_df.item_id==20949)]["item_cnt_day"].mean()

In [None]:
# Drop the outlier rows outright. Indexing with the boolean frame returned by
# DataFrame.isin(...) only masks the matching cells with NaN: the rows stay in
# the frame and the integer id columns are turned into floats.
sales_train_df=sales_train_df[~(sales_train_df.item_cnt_day>800)]

In [None]:
sns.relplot(x="item_price",y="item_cnt_day",height=9,aspect=1,data=sales_train_df)

You can try to add pairs (item, shop) to each month and encode them using the median of sales for the category to which this item belongs in the corresponding month. To do this, join all the datasets with each other.

In [None]:
item_id_uniq=pd.unique(sales_train_df["item_id"])

In [None]:
test_unique_items=test_df[~test_df["item_id"].isin(item_id_uniq)]
test_unique_items.head()

In [None]:
test_unique_items.shape

In [None]:
test_unique_items["item_id"].nunique()

In [None]:
sales_train_df.shape

In [None]:
%%time
# For each of the 34 date blocks, enumerate every (shop, item) combination of
# the shops and items that had at least one sales record in that block.
col=["date_block_num","shop_id","item_id"]
monthly_grids=[]
for block in range(34):
  block_sales=sales_train_df[sales_train_df.date_block_num==block]
  combos=product([block],block_sales.shop_id.unique(),block_sales.item_id.unique())
  monthly_grids.append(np.array(list(combos),dtype="int16"))
# Stack the per-block grids and store the keys in compact integer dtypes.
train=pd.DataFrame(np.vstack(monthly_grids),columns=col).astype(
    {"date_block_num":np.int8,"shop_id":np.int8,"item_id":np.int16})
print(train.head())

In [None]:
month_df=sales_train_df[["date_block_num","shop_id","item_id","item_cnt_day"]].groupby(["date_block_num","shop_id","item_id"]).sum()
month_df.head()

In [None]:
date_block_total=month_df.reset_index()
date_block_total_group=date_block_total[["date_block_num","item_cnt_day"]].groupby("date_block_num",as_index=False).sum()
plt.figure(figsize=(20,6))
sns.barplot(x="date_block_num",y="item_cnt_day",data=date_block_total_group)

In [None]:
date_block_total["year"]=date_block_total.date_block_num.apply(lambda x:((x//12)+2013))
date_block_total["month"]=date_block_total.date_block_num.apply(lambda x: (x%12))
date_block_total.head()

In [None]:
total_month=date_block_total[["month","item_cnt_day"]].groupby("month",as_index=False).sum()
plt.figure(figsize=(10,10))
sns.barplot(x="month",y="item_cnt_day",data=total_month)

In [None]:
total_year=date_block_total[["year","item_cnt_day"]].groupby("year",as_index=False).sum()
plt.figure(figsize=(6,6))
sns.barplot(x="year",y="item_cnt_day",data=total_year)

These graphs show an upward seasonal trend in sales from the first months of the year to the last, and a downward trend in yearly sales from 2013 to 2015.

In [None]:
# Derive calendar features from the running block index: every 12 blocks
# advance the year by one, counting from 2013.
train["year"]=train.date_block_num.apply(lambda x:((x//12)+2013))
# The month is the block index modulo 12, i.e. zero-based (values 0..11).
train["month"]=train.date_block_num.apply(lambda x: (x%12))
# Store both features in compact integer dtypes.
train["year"]=train["year"].astype(np.int16)
train["month"]=train["month"].astype(np.int8)
train.head()

In [None]:
# Reload the raw test pairs and tag them as date block 34, deriving year and
# month with the same formulas and dtypes that are used for the training grid.
test_df=(
    pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")
    .assign(date_block_num=34,year=(34//12)+2013,month=34%12)
    .astype({"date_block_num":np.int8,"shop_id":np.int8,"item_id":np.int16,
             "year":np.int16,"month":np.int8})
)
test_df.head()

In [None]:
categorical=["year","month"]

Create Target Variable

In [None]:
train=train.join(month_df,on=["date_block_num","shop_id","item_id"],how="left")
train["item_cnt_day"]=train["item_cnt_day"].fillna(0).clip(0,20).astype(np.float16)
train.head()

In [None]:
# Append the test rows below the training grid so that the lag features are
# computed for them as well. The former `keys=col` argument is dropped: it has
# no effect together with ignore_index=True, and its three entries did not
# match the two frames being concatenated.
train=pd.concat([train,test_df],ignore_index=True,sort=False)
# Columns that exist in only one of the two frames are NaN for the other; fill them with 0.
train=train.fillna(0)
train.head()

Create Lag Features For Total Month Cnt

In [None]:
def create_lag_features(df,feature,lags=range(1,7)):
  """Append lagged copies of `feature` to `df`, one column per lag.

  For every lag k, the value of `feature` recorded for the same
  (shop_id, item_id) pair k date blocks earlier is attached as
  "<feature>_lag_<k>". Rows without a record k blocks back get NaN.

  Args:
    df: frame containing "date_block_num", "shop_id", "item_id" and `feature`.
      The frame passed in is not modified.
    feature: name of the column to lag.
    lags: iterable of block offsets; defaults to 1..6.

  Returns:
    A tuple (frame with the lag columns appended, list of the new column names).
  """
  keys=["date_block_num","shop_id","item_id"]
  # Only the keys and the feature are needed for the lookup table, so take that
  # narrow slice once instead of copying the whole (growing) frame per lag.
  base=df[keys+[feature]]
  numeric_features=[]
  for lag in lags:
    lag_name=feature+"_lag_"+str(lag)
    lagged=base.rename(columns={feature:lag_name})
    # Shifting the block index forward by `lag` lines block m up with block m+lag in the join.
    lagged=lagged.assign(date_block_num=lagged["date_block_num"]+lag)
    df=df.join(lagged.set_index(keys),on=keys,how="left")
    numeric_features.append(lag_name)
  return df,numeric_features

In [None]:
%%time
train,n_col=create_lag_features(train,"item_cnt_day")
train=train.fillna(0)
numeric=n_col

In [None]:
train.tail().T

Create Lag Features For Item Id Total Cnt

In [None]:
total_cnt_for_items=sales_train_df[["date_block_num","item_id","item_cnt_day"]].groupby(["date_block_num","item_id"]).sum()
total_cnt_for_items.columns=["total_cnt_for_items"]
total_cnt_for_items.head()

In [None]:
train=train.join(total_cnt_for_items,on=["date_block_num","item_id"],how="left")
train.fillna(0,inplace=True)
train["total_cnt_for_items"]=train["total_cnt_for_items"].astype(np.float16)
train.head().T

In [None]:
%%time
train,n_col=create_lag_features(train,"total_cnt_for_items")
train=train.fillna(0)
train=train.drop(["total_cnt_for_items"],axis=1)
numeric=numeric+n_col

In [None]:
train.tail().T

Create Features For shop_id.

In [None]:
total_cnt_for_shops=sales_train_df[["date_block_num","shop_id","item_cnt_day"]].groupby(["date_block_num","shop_id"]).sum()
total_cnt_for_shops.columns=["total_cnt_for_shops"]
total_cnt_for_shops.head()

In [None]:
sum_cnt_shops=total_cnt_for_shops.reset_index()
cnt_shops=sum_cnt_shops[["shop_id","total_cnt_for_shops"]].groupby("shop_id",as_index=False).mean()
cnt_shops=cnt_shops.join(shops_df.set_index(["shop_id"]),on="shop_id",how="left")
cnt_shops.head()

In [None]:
plt.figure(figsize=(15,15))
sns.barplot(x="total_cnt_for_shops",y="shop_name",orient="h",data=cnt_shops)

In [None]:
# Collapse the two-word city name into one token so that the split on spaces
# below keeps it together.
cnt_shops.loc[cnt_shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
# The first space-separated token of the shop name is used as the city.
cnt_shops['shop_city'] = cnt_shops['shop_name'].str.split(' ').map(lambda x: x[0])
# Remove the stray leading "!" from this city token.
cnt_shops.loc[cnt_shops.shop_city == '!Якутск', 'shop_city'] = 'Якутск'
# The second token is used as the shop type; this assumes every name has at
# least two tokens (x[1] would raise otherwise).
cnt_shops['shop_type'] = cnt_shops['shop_name'].str.split(' ').map(lambda x: x[1])
cnt_shops.head()


In [None]:
sum_cnt_shop_city=cnt_shops[["shop_city","total_cnt_for_shops"]].groupby("shop_city",as_index=False).sum()
plt.figure(figsize=(15,15))
sns.barplot(x="total_cnt_for_shops",y="shop_city",orient="h",data=sum_cnt_shop_city)

In [None]:
sum_cnt_shop_type=cnt_shops[["shop_type","total_cnt_for_shops"]].groupby("shop_type",as_index=False).mean()
plt.figure(figsize=(15,15))
sns.barplot(x="total_cnt_for_shops",y="shop_type",orient="h",data=sum_cnt_shop_type)

In [None]:
train=train.join(cnt_shops[["shop_id","shop_city","shop_type"]].set_index(["shop_id"]),on="shop_id",how="left")
train.fillna(0,inplace=True)
train["shop_city"]=train["shop_city"].astype(str)
train["shop_type"]=train["shop_type"].astype(str)
train.head().T

In [None]:
categorical=categorical+["shop_city","shop_type"]

Create Lag Features For shop_id.

In [None]:
train=train.join(total_cnt_for_shops,on=["date_block_num","shop_id"],how="left")
train.fillna(0,inplace=True)
train["total_cnt_for_shops"]=train["total_cnt_for_shops"].astype(np.float16)
train.head().T

In [None]:
%%time
train,n_col=create_lag_features(train,"total_cnt_for_shops")
train=train.fillna(0)
train=train.drop(["total_cnt_for_shops"],axis=1)
numeric=numeric+n_col

In [None]:
train.tail().T

Create Features For item_category_id.

In [None]:
total_cnt_for_item_categories=sales_train_df[["date_block_num","item_category_id","item_cnt_day"]].groupby(["date_block_num","item_category_id"]).sum()
total_cnt_for_item_categories.columns=["total_cnt_for_item_categories"]
total_cnt_for_item_categories.head()

In [None]:
sum_cnt_for_item_category=total_cnt_for_item_categories.reset_index()
sum_cnt_for_item_category=sum_cnt_for_item_category[["item_category_id","total_cnt_for_item_categories"]].groupby("item_category_id",as_index=False).sum()
sum_cnt_for_item_category=sum_cnt_for_item_category.join(item_categories_df.set_index("item_category_id"),on=["item_category_id"],how="left")
plt.figure(figsize=(15,15))
sns.barplot(x="total_cnt_for_item_categories",y="item_category_name",orient="h",data=sum_cnt_for_item_category)

Two variables, a category and a subcategory, can be extracted from the category name.

In [None]:
# Split the category name on "-": the first piece becomes `category`, the
# second piece `subcategory`. The pieces are not stripped, so they keep the
# spaces that surround the separator (later cells compare against values such
# as "PC ").
# NOTE(review): names without a "-" presumably yield a missing subcategory — confirm.
sum_cnt_for_item_category["category"]=sum_cnt_for_item_category.item_category_name.str.split("-").str[0]
sum_cnt_for_item_category["subcategory"]=sum_cnt_for_item_category.item_category_name.str.split("-").str[1]
sum_cnt_for_item_category.head()

In [None]:
sum_cnt_category=sum_cnt_for_item_category[["category","total_cnt_for_item_categories"]].groupby("category",as_index=False).sum()
plt.figure(figsize=(15,15))
sns.barplot(x="total_cnt_for_item_categories",y="category",orient="h",data=sum_cnt_category)

In [None]:
sum_cnt_subcategory=sum_cnt_for_item_category[["subcategory","total_cnt_for_item_categories"]].groupby("subcategory",as_index=False).sum()
plt.figure(figsize=(15,15))
sns.barplot(x="total_cnt_for_item_categories",y="subcategory",orient="h",data=sum_cnt_subcategory)

In [None]:
train=train.join(items_df.set_index(["item_id"]),on=["item_id"],how="left")
train=train.join(sum_cnt_for_item_category[["item_category_id","category","subcategory"]].set_index(["item_category_id"]),on=["item_category_id"],how="left")
train.fillna(0,inplace=True)
train["category"]=train["category"].astype(str)
train["subcategory"]=train["subcategory"].astype(str)
train.tail().T

In [None]:
categorical=categorical+["category","subcategory"]

In [None]:
train=train.join(total_cnt_for_item_categories,on=["date_block_num","item_category_id"],how="left")
train.fillna(0,inplace=True)
train["total_cnt_for_item_categories"]=train["total_cnt_for_item_categories"].astype(np.float16)
train.head().T

In [None]:
%%time
train,n_col=create_lag_features(train,"total_cnt_for_item_categories")
train=train.fillna(0)
train=train.drop(["total_cnt_for_item_categories"],axis=1)
numeric=numeric+n_col

In [None]:
train.tail().T

In [None]:
train["item_name"]

Create Features For item_price

In [None]:
mean_price=sales_train_df[["date_block_num","shop_id","item_id","item_price"]].groupby(["date_block_num","shop_id","item_id"]).mean()
mean_price.columns=["mean_price"]
mean_price.head()

In [None]:
train=train.join(mean_price,on=["date_block_num","shop_id","item_id"],how="left")
train.fillna(0,inplace=True)
train["mean_price"]=train["mean_price"].astype(np.float16)
train.head().T

In [None]:
%%time
train,n_col=create_lag_features(train,"mean_price")
train=train.fillna(0)
train=train.drop(["mean_price"],axis=1)
numeric=numeric+n_col

In [None]:
train.tail().T

In [None]:
mean_price_for_items=sales_train_df[["date_block_num","item_id","item_price"]].groupby(["date_block_num","item_id"]).mean()
mean_price_for_items.columns=["mean_price_for_items"]
mean_price_for_items.head()

In [None]:
train=train.join(mean_price_for_items,on=["date_block_num","item_id"],how="left")
train.fillna(0,inplace=True)
train["mean_price_for_items"]=train["mean_price_for_items"].astype(np.float16)
train.head().T

In [None]:
%%time
train,n_col=create_lag_features(train,"mean_price_for_items")
train=train.fillna(0)
train=train.drop(["mean_price_for_items"],axis=1)
numeric=numeric+n_col

In [None]:
train.tail().T

In [None]:
mean_price_for_item_categories=sales_train_df[["date_block_num","item_category_id","item_price"]].groupby(["date_block_num","item_category_id"]).mean()
mean_price_for_item_categories.columns=["mean_price_for_item_categories"]
mean_price_for_item_categories.head()

In [None]:
train=train.join(mean_price_for_item_categories,on=["date_block_num","item_category_id"],how="left")
train.fillna(0,inplace=True)
train["mean_price_for_item_categories"]=train["mean_price_for_item_categories"].astype(np.float16)
train.head().T

In [None]:
%%time
train,n_col=create_lag_features(train,"mean_price_for_item_categories")
train=train.fillna(0)
train=train.drop(["mean_price_for_item_categories"],axis=1)
numeric=numeric+n_col

In [None]:
train.tail().T

In [None]:
sales_train_df["revenue"]=sales_train_df["item_price"]*sales_train_df["item_cnt_day"]
total_revenue=sales_train_df[["date_block_num","shop_id","revenue"]].groupby(["date_block_num","shop_id"]).sum()
total_revenue.columns=["total_shop_revenue"]
total_revenue.head()

In [None]:
train=train.join(total_revenue,on=["date_block_num","shop_id"],how="left")
train["total_shop_revenue"]=train["total_shop_revenue"].astype(np.float32)
train.fillna(0,inplace=True)
train.head().T

In [None]:
%%time
train,n_col=create_lag_features(train,"total_shop_revenue")
train=train.fillna(0.0)
train=train.drop(["total_shop_revenue"],axis=1)
numeric=numeric+n_col

In [None]:
train.tail().T

In [None]:
# Split the combined frame back apart: block 34 holds the rows to predict,
# everything before it is training data.
test_df=train[train.date_block_num==34]
train=train[train.date_block_num<34]
# Blocks 0-5 are discarded.
# NOTE(review): presumably because the lag features (lags 1..6) have no complete history there — confirm.
train=train[~train["date_block_num"].isin([0,1,2,3,4,5])].reset_index(drop=True)
train.head().T

In [None]:
test_df.head().T

In [None]:
numeric=numeric+["date_block_num","item_cnt_day"]

In [None]:
'''groups = train.groupby(train.date_block_num).groups
sorted_groups = [value for (key, value) in sorted(groups.items())]
cv=[(np.concatenate(sorted_groups[:8]),np.concatenate(sorted_groups[8:])),
    (np.concatenate(sorted_groups[:16]),np.concatenate(sorted_groups[16:])),
    (np.concatenate(sorted_groups[:24]),np.concatenate(sorted_groups[24:]))]'''

In [None]:
# Take explicit copies: the columns are modified below, and assigning into a
# plain slice of `train` / `test_df` triggers pandas' SettingWithCopyWarning.
X_train_categorical=train[categorical].copy()
X_test_categorical=test_df[categorical].copy()
X_train_categorical["subcategory"]=X_train_categorical["subcategory"].astype(str)
X_test_categorical["subcategory"]=X_test_categorical["subcategory"].astype(str)
X_test_categorical["year"]=X_test_categorical["year"].astype(np.int16)
X_test_categorical["month"]=X_test_categorical["month"].astype(np.int8)
# Remap two labels of the test rows onto other labels.
# NOTE(review): presumably these values do not occur in the training rows and
# would otherwise be unknown to the label encoder — confirm.
X_test_categorical.loc[X_test_categorical.category=="PC ","category"]="Игры PC "
X_test_categorical.loc[X_test_categorical.subcategory==" Гарнитуры/Наушники","subcategory"]=" Аксессуары для игр"

In [None]:
# Integer-encode every categorical column with a fresh encoder per column.
# Each encoder is fitted on the training rows only and then applied to the
# test rows.
# NOTE(review): transform is expected to fail on labels that were absent from
# the training rows — verify that all test labels are covered.
for feature in categorical:
  le=LabelEncoder()
  # Print the column name as a simple progress indicator.
  print(feature)
  X_train_categorical[feature]=le.fit_transform(X_train_categorical[feature])
  X_test_categorical[feature]=le.transform(X_test_categorical[feature])

In [None]:
X_train_categorical.head()

In [None]:
X_test_categorical.head()

In [None]:
X_train_numeric=train[numeric]
X_test_numeric=test_df[numeric]

In [None]:
X_train_numeric.head().T

In [None]:
X_test_numeric.head().T

In [None]:
label_cat_not_num_train=pd.concat([X_train_categorical,X_train_numeric],axis=1)
label_cat_not_num_train.head().T

In [None]:
label_cat_not_num_test=pd.concat([X_test_categorical,X_test_numeric],axis=1)
label_cat_not_num_test.head().T

In [None]:
label_cat_not_num_train.shape

In [None]:
def downcast_type(df,features=None):
  """Shrink integer-coded columns of `df` in place to the smallest signed int dtype.

  Columns whose values fit into int8 end up as int8, as before. Columns with
  larger values get the next wider integer type instead of silently wrapping
  around, which an unconditional cast to int8 would do.

  Args:
    df: frame to modify in place.
    features: column names to downcast; defaults to the notebook-level
      `categorical` list.
  """
  cols=categorical if features is None else features
  for feature in cols:
    df[feature]=pd.to_numeric(df[feature],downcast="integer")

In [None]:
downcast_type(label_cat_not_num_train)
label_cat_not_num_train.info()

In [None]:
X_train=label_cat_not_num_train[label_cat_not_num_train.date_block_num<=19]
X_val=label_cat_not_num_train[label_cat_not_num_train.date_block_num>19]
X_train=X_train.drop(["date_block_num"],axis=1)
X_val=X_val.drop(["date_block_num"],axis=1)

In [None]:
y_train=X_train["item_cnt_day"]
y_val=X_val["item_cnt_day"]
X_val=X_val.drop(["item_cnt_day"],axis=1)
X_train=X_train.drop(["item_cnt_day"],axis=1)

In [None]:
def RMSE(y,predictions):
  """Return the root mean squared error between `y` and `predictions`."""
  return  np.sqrt(mean_squared_error(y,predictions))
# greater_is_better is passed by keyword: make_scorer accepts only the score
# function positionally in current scikit-learn releases. False marks RMSE as
# a loss, i.e. lower values are better.
scorer=make_scorer(RMSE,greater_is_better=False)

In [None]:
# A fixed random_state makes the forest, and therefore the validation score, reproducible between runs.
rf = RandomForestRegressor(n_estimators=50, max_depth=16,max_features=0.4,min_samples_leaf=10,n_jobs=-1,random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Collect the validation predictions of every individual tree in the forest.
predictions = [tree.predict(X_val) for tree in rf.estimators_]

In [None]:
predictions = np.vstack(predictions)

In [None]:
cum_mean = np.cumsum(predictions, axis=0)/np.arange(1, predictions.shape[0] + 1)[:, None]

In [None]:
cum_mean

In [None]:
# Validation RMSE of each row of cum_mean, i.e. of the running average over the first k trees.
scores = [np.sqrt(mean_squared_error(y_val,running_mean)) for running_mean in cum_mean]

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(scores, linewidth=3)
plt.xlabel('num_trees')
plt.ylabel('RMSE');

In [None]:
pred=rf.predict(X_val)

In [None]:
score=np.sqrt(mean_squared_error(y_val,pred))
score

In [None]:
# Build the final design matrices from all available training rows: keep the
# target aside, then remove the target and the block index from both frames.
y_train=label_cat_not_num_train["item_cnt_day"]
label_cat_not_num_train=label_cat_not_num_train.drop(columns=["item_cnt_day","date_block_num"])
label_cat_not_num_test=label_cat_not_num_test.drop(columns=["item_cnt_day","date_block_num"])

In [None]:
# A fixed random_state makes the final model, and therefore the submission, reproducible between runs.
rf = RandomForestRegressor(n_estimators=50, max_depth=16,max_features=0.4,min_samples_leaf=10,n_jobs=-1,random_state=42)
rf.fit(label_cat_not_num_train,y_train)

In [None]:
predictions=rf.predict(label_cat_not_num_test)

In [None]:
sample_submissions["item_cnt_month"]=predictions
sample_submissions.head()

In [None]:
sample_submissions.to_csv("random_forest_lagged_features_6.csv",index=False)