In [3]:
from datetime import datetime, timedelta, date
from calendar import monthrange
from pytz import timezone
import pandas as pd
import numpy as np
import feather
import datetime 
import pytz
import csv
import os
import seaborn as sns
import matplotlib.pyplot as plt
import logging

import xgboost as xgb
import time

from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
from sklearn import preprocessing
from xgboost import plot_tree

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer

# Notebook-level flag; it is not read anywhere in the visible cells.
# NOTE(review): presumably toggles whether cells emit output -- confirm against later cells.
kernel_with_output = False


# Wildcard import: every public name of `utils` lands in the notebook namespace
# and may shadow names imported above. `utils` is not a standard-library module;
# presumably a helper file shipped next to this notebook -- confirm.
from utils import *

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Configure the root logger at INFO level with "<time> : <level> : <message>" lines.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [42]:
DATA_FOLDER = './data/'
ADD_RESOURCES = './res/'


def _data_file(filename):
    """Return the path of ``filename`` inside DATA_FOLDER."""
    return os.path.join(DATA_FOLDER, filename)


def _resource_file(filename):
    """Return the path of ``filename`` inside ADD_RESOURCES."""
    return os.path.join(ADD_RESOURCES, filename)


# Main tables, read from DATA_FOLDER. For the ``.gz`` files pandas infers the
# compression from the file extension.
sales_train = pd.read_csv(_data_file('sales_train.csv.gz'))
items = pd.read_csv(_data_file('items.csv'))
item_categories = pd.read_csv(_data_file('item_categories.csv'))
shops = pd.read_csv(_data_file('shops.csv'))
test = pd.read_csv(_data_file('test.csv.gz'))
sample_submission = pd.read_csv(_data_file('sample_submission.csv.gz'))

# Additional resources, read from ADD_RESOURCES.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
date_df = pd.read_pickle(_resource_file("russian_holidays.pkl"))
geography = pd.read_csv(_resource_file('geography.csv'))
holidays = feather.read_dataframe(_resource_file('hol.feather'))

In [35]:
# For every month (date_block_num) build the full shop x item grid from the
# shops and the items that have at least one row in `sales_train` that month.
# The result `grid` is an int32 DataFrame with one row per
# (shop_id, item_id, date_block_num) combination.

index_cols = ['shop_id', 'item_id', 'date_block_num']

monthly_grids = []
for block_num in sales_train['date_block_num'].unique():
    # Select the month's rows once and reuse them for both lookups.
    month_sales = sales_train[sales_train['date_block_num'] == block_num]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()

    # Vectorised cartesian product. Rows are shop-major (items vary fastest),
    # i.e. the same order itertools.product(cur_shops, cur_items) yields,
    # but without materialising a Python list of tuples.
    month_grid = np.empty((len(cur_shops) * len(cur_items), len(index_cols)), dtype='int32')
    month_grid[:, 0] = np.repeat(cur_shops, len(cur_items))
    month_grid[:, 1] = np.tile(cur_items, len(cur_shops))
    month_grid[:, 2] = block_num
    monthly_grids.append(month_grid)

# np.vstack concatenates the per-month arrays row-wise; it raises on an empty
# list, so fall back to an empty (0, 3) array when there are no sales rows.
if monthly_grids:
    stacked_grid = np.vstack(monthly_grids)
else:
    stacked_grid = np.empty((0, len(index_cols)), dtype='int32')

grid = pd.DataFrame(stacked_grid, columns=index_cols, dtype=np.int32)

In [70]:
# Aggregations
# Cap the daily counts to the [0, 20] range. This overwrites the column in
# `sales_train` itself, so negative values become 0 and values above 20 become 20.
sales_train['item_cnt_day'] = sales_train['item_cnt_day'].clip(lower=0, upper=20)

# Monthly totals and mean price per (shop, item, month).
groups = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'])
monthly_sales = (
    groups
    .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
# The monthly total is capped to [0, 20] as well.
monthly_sales['item_cnt_month'] = monthly_sales['item_cnt_month'].clip(lower=0, upper=20)

# Attach the aggregates to the full grid; combinations without any sales row
# get a count of 0 (their item_price stays NaN).
trainset = grid.merge(monthly_sales, how='left', on=index_cols)
trainset['item_cnt_month'] = trainset['item_cnt_month'].fillna(0)

# Get category id (default inner join on item_id)
trainset = trainset.merge(items[['item_id', 'item_category_id']], on='item_id')

# Persist the result; the DataFrame index is written as the first CSV column.
trainset.to_csv('trainset_with_grid.csv')

trainset.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_category_id
0,59,22154,0,1.0,999.0,37
1,25,22154,0,5.0,999.0,37
2,24,22154,0,1.0,999.0,37
3,23,22154,0,0.0,,37
4,19,22154,0,0.0,,37


In [None]:
trainset.group