In [1]:
import numpy as np
import pandas as pd
from sklearn import *
import nltk, datetime

In [2]:
train = pd.read_csv('sales_train.csv')

In [3]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
train.shape

(2935849, 6)

In [5]:
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')
items = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops = pd.read_csv('shops.csv')

In [6]:
train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [7]:
test.shape

(214200, 3)

In [8]:
train.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [9]:
# Date time prepare

train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

In [10]:
train = train.drop(['date', 'item_price'], axis=1)

In [11]:
train = train.groupby([c for c in train.columns if c not in ['item_cnt_day']], as_index=False)[['item_cnt_day']].sum()

In [12]:
train = train.rename(columns={'item_cnt_day':'item_cnt_month'})

In [13]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_id,month,year,item_cnt_month
0,0,0,32,1,2013,6.0
1,0,0,33,1,2013,3.0
2,0,0,35,1,2013,1.0
3,0,0,43,1,2013,1.0
4,0,0,51,1,2013,2.0


In [14]:
#Finding monthly mean

shop_item_mean = train[['shop_id', 'item_id', 'item_cnt_month']].groupby(['shop_id', 'item_id'], as_index=False)[['item_cnt_month']].mean()
shop_item_mean = shop_item_mean.rename(columns={'item_cnt_month':'item_cnt_month_mean'})

In [15]:
#Adding mean feature to our train set

train = pd.merge(train, shop_item_mean, how='left', on=['shop_id','item_id'])

In [16]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_id,month,year,item_cnt_month,item_cnt_month_mean
0,0,0,32,1,2013,6.0,8.0
1,0,0,33,1,2013,3.0,3.0
2,0,0,35,1,2013,1.0,7.5
3,0,0,43,1,2013,1.0,1.0
4,0,0,51,1,2013,2.0,2.5


In [17]:
#Adding last month

shop_prev_month = train[train['date_block_num']==33][['shop_id', 'item_id', 'item_cnt_month']]
shop_prev_month = shop_prev_month.rename(columns={'item_cnt_month':'item_cnt_prev_month'})
shop_prev_month.head()

Unnamed: 0,shop_id,item_id,item_cnt_prev_month
1577593,2,31,1.0
1577594,2,486,3.0
1577595,2,787,1.0
1577596,2,794,1.0
1577597,2,968,1.0


In [18]:
#Add previous month features to train dataset

train = pd.merge(train, shop_prev_month, how='left', on=['shop_id', 'item_id']).fillna(0.)

In [19]:
#Add all item features

train = pd.merge(train, items, how='left', on='item_id')

In [20]:
#Adding item features categories features

train = pd.merge(train, item_categories, how='left', on='item_category_id')

In [21]:
#Adding shop features

train = pd.merge(train, shops, how='left', on='shop_id')

train.head()

Unnamed: 0,date_block_num,shop_id,item_id,month,year,item_cnt_month,item_cnt_month_mean,item_cnt_prev_month,item_name,item_category_id,item_category_name,shop_name
0,0,0,32,1,2013,6.0,8.0,0.0,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран"
1,0,0,33,1,2013,3.0,3.0,0.0,1+1 (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран"
2,0,0,35,1,2013,1.0,7.5,0.0,10 ЛЕТ СПУСТЯ,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран"
3,0,0,43,1,2013,1.0,1.0,0.0,100 МИЛЛИОНОВ ЕВРО,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран"
4,0,0,51,1,2013,2.0,2.5,0.0,100 лучших произведений классики (mp3-CD) (Dig...,57,Музыка - MP3,"!Якутск Орджоникидзе, 56 фран"


In [22]:
#Test dataset
#Addinf november 2015

test['month'] = 11
test['year'] = 2015
test['date_block_num'] = 34

In [23]:
#Adding mean features

test = pd.merge(test, shop_item_mean, how='left', on=['shop_id', 'item_id']).fillna(0.)

#Adding previous month features

test = pd.merge(test, shop_prev_month, how='left', on=['shop_id', 'item_id']).fillna(0.)

#Adding all item features

test = pd.merge(test, items, how='left', on='item_id')

#Adding item categories features

test = pd.merge(test, item_categories, how='left', on='item_category_id')

#Adding shop features

test = pd.merge(test, shops, how='left', on='shop_id')

test['item_cnt_month'] = 0.
test.head()

Unnamed: 0,ID,shop_id,item_id,month,year,date_block_num,item_cnt_month_mean,item_cnt_prev_month,item_name,item_category_id,item_category_name,shop_name,item_cnt_month
0,0,5,5037,11,2015,34,1.444444,0.0,"NHL 15 [PS3, русские субтитры]",19,Игры - PS3,"Вологда ТРЦ ""Мармелад""",0.0
1,1,5,5320,11,2015,34,0.0,0.0,ONE DIRECTION Made In The A.M.,55,Музыка - CD локального производства,"Вологда ТРЦ ""Мармелад""",0.0
2,2,5,5233,11,2015,34,2.0,1.0,"Need for Speed Rivals (Essentials) [PS3, русск...",19,Игры - PS3,"Вологда ТРЦ ""Мармелад""",0.0
3,3,5,5232,11,2015,34,1.0,0.0,"Need for Speed Rivals (Classics) [Xbox 360, ру...",23,Игры - XBOX 360,"Вологда ТРЦ ""Мармелад""",0.0
4,4,5,5268,11,2015,34,0.0,0.0,"Need for Speed [PS4, русская версия]",20,Игры - PS4,"Вологда ТРЦ ""Мармелад""",0.0


In [24]:
#Label encoding

for c in['shop_name', 'item_name', 'item_category_name']:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[c].unique())+list(test[c].unique()))
    train[c] = lbl.transform(train[c].astype(str))
    test[c] = lbl.transform(test[c].astype(str))
    print(c)

shop_name
item_name
item_category_name


In [27]:
#Train and predict usin Random Forest

col = [c for c in train.columns if c not in ['item_cnt_month']]
x1 = train[train['date_block_num']<33]
y1 = np.log1p(x1['item_cnt_month'].clip(0.,20.))
x1 = x1[col]
x2 = train[train['date_block_num']==33]
y2 = np.log1p(x2['item_cnt_month'].clip(0.,20.))
x2 = x2[col] 

In [32]:
reg = ensemble.ExtraTreesRegressor(n_estimators=40, n_jobs=-1, max_depth=15, random_state=18)
reg.fit(x1,y1)
print('RMSE value is :', np.sqrt(metrics.mean_squared_error(y2.clip(0.,20.), reg.predict(x2).clip(0.,20.))))

RMSE value is : 0.26779059402763405


In [33]:
reg.fit(train[col], train['item_cnt_month'].clip(0.,20.))
test['item_cnt_month'] = reg.predict(test[col]).clip(0.,20.)
test[['ID', 'item_cnt_month']].to_csv('Complete_csv', index=False)

In [34]:
test['item_cnt_month'] = np.expm1(test['item_cnt_month'])
test[['ID', 'item_cnt_month']].to_csv('final.csv', index=False)