# Aggregations

Compute monthly sales by suming daily sales, monthly price as average price during the month

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
from sklearn import preprocessing
from matplotlib import pyplot
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
sales_train = pd.read_csv('data/sales_train_v2.csv')
items = pd.read_csv('data/items.csv')
shops = pd.read_csv('data/shops.csv')
item_categories = pd.read_csv('data/item_categories.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

Construct a grid of shop x items x months

In [3]:
grid = []
for block_num in sales_train['date_block_num'].unique():
    cur_shops = sales_train[sales_train['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales_train[sales_train['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)
grid.head()

Unnamed: 0,shop_id,item_id,date_block_num
0,59,22154,0
1,59,2552,0
2,59,2554,0
3,59,2555,0
4,59,2564,0


Construct training set by computing aggreagations for each month, and merginig it with the grid

In [4]:
# Aggregations
sales_train['item_cnt_day'] = sales_train['item_cnt_day'].clip(0,20)
groups = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'])
trainset = groups.agg({'item_cnt_day':'sum', 'item_price':'mean'}).reset_index()
trainset = trainset.rename(columns = {'item_cnt_day' : 'item_cnt_month'})
trainset['item_cnt_month'] = trainset['item_cnt_month'].clip(0,20)

trainset = pd.merge(grid,trainset,how='left',on=index_cols)
trainset.item_cnt_month = trainset.item_cnt_month.fillna(0)

# Get category id
trainset = pd.merge(trainset, items[['item_id', 'item_category_id']], on = 'item_id')
trainset.to_csv('trainset_with_grid.csv')

trainset.head(5)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id
0,59,22154,0,1.0,999.0,37
1,25,22154,0,5.0,999.0,37
2,24,22154,0,1.0,999.0,37
3,23,22154,0,0.0,,37
4,19,22154,0,0.0,,37


In [36]:
trainset.to_csv('trainset_with_grid.csv')

# Baseline model

In [10]:
baseline_features = ['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_cnt_month']
train = trainset[baseline_features]

#train = train.set_index('shop_id')
train.item_cnt_month = train.item_cnt_month.astype(int)
train['item_cnt_month'] = train.item_cnt_month.fillna(0).clip(0,20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
train.head()

Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,item_cnt_month
0,59,22154,37,0,1
1,25,22154,37,0,5
2,24,22154,37,0,1
3,23,22154,37,0,0
4,19,22154,37,0,0


In [18]:
trainx = train[['shop_id', 'item_id', 'item_category_id', 'date_block_num']]
trainy = train['item_cnt_month']

test_dataset = pd.read_csv('data/test.csv', dtype=int)
test_df = pd.DataFrame(test_dataset, columns = ['shop_id', 'item_id'])

In [19]:
# Make test_dataset pandas data frame, add category id and date block num, then convert back to numpy array and predict
merged_test = pd.merge(test_df, items, on = ['item_id'])[['shop_id','item_id','item_category_id']]
merged_test['date_block_num'] = 33
#merged_test.set_index('shop_id')
merged_test.head(3)

model = xgb.XGBRegressor(max_depth = 10, min_child_weight=0.5, subsample = 1, eta = 0.3, num_round = 1000, seed = 1)
model.fit(trainx, trainy, eval_metric='rmse')
preds = model.predict(merged_test.values)

df = pd.DataFrame(preds, columns = ['item_cnt_month'])
df['ID'] = df.index
df = df.set_index('ID')
df.head()
#df.to_csv('simple_xgb.csv')

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




ValueError: feature_names mismatch: ['shop_id', 'item_id', 'item_category_id', 'date_block_num'] ['f0', 'f1', 'f2', 'f3']
expected date_block_num, item_id, shop_id, item_category_id in input data
training data did not have the following fields: f0, f1, f3, f2

In [32]:
df.to_csv('simple_xgb.csv')

We can try rounding the restults to integers. For now it did't improve score

In [35]:
df['item_cnt_month'].clip(0, 20)
df['item_cnt_month'] = df['item_cnt_month'].round(decimals=0)
df.to_csv('simple_xgb_round.csv')