In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
from sklearn import preprocessing
from matplotlib import pyplot
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

Load the preprocessed train set and all other available files

In [2]:
# Names of engineered feature columns; the feature cells further down append to it.
new_features = []

# Preprocessed training grid plus the raw item and shop lookup tables.
trainset = pd.read_csv('trainset_with_grid.csv')
items = pd.read_csv('data/items.csv')
shops = pd.read_csv('data/shops.csv')

# Inclusive bounds on the month index (date_block_num) kept for training.
start_month = 0
end_month = 33

# Keep only the modelling columns and the rows whose month lies inside the window.
kept_columns = ['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_price', 'item_cnt_month']
inside_window = (trainset['date_block_num'] >= start_month) & (trainset['date_block_num'] <= end_month)
trainset = trainset.loc[inside_window, kept_columns]

Load test set

In [3]:
# Raw test file, with every column parsed as an integer.
test_dataset = pd.read_csv('data/test.csv', dtype=int)
# Frame restricted to the two key columns used for the joins below.
testset = pd.DataFrame(data=test_dataset, columns=['shop_id', 'item_id'])

In [4]:
# Attach each test item's category so test rows carry the same key columns as train rows.
testset = testset.merge(items[['item_id', 'item_category_id']], on = 'item_id', how = 'left')
# Test rows get month index 34, one past the last training month (end_month = 33).
testset['date_block_num'] = 34
# Give testset the same target column as trainset so the two can be concatenated
# row-wise; -1 is only a placeholder value.
testset['item_cnt_month'] = -1

# testset has no item_price column, so the two frames' columns are not aligned.
# Passing sort explicitly silences the pandas FutureWarning about the changing
# default; sort=True keeps the alphabetical column order that the outputs recorded
# in this notebook were produced with, regardless of pandas version.
train_test_set = pd.concat([trainset, testset], axis = 0, sort = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Load the item category information and clean it

In [5]:
item_cat = pd.read_csv('data/item_categories.csv')

# Fix category: collapse runs of raw categories into one coarse name each.
# Every entry is (first row, one-past-last row, coarse name).
# NOTE(review): the row positions assume the row order of item_categories.csv — confirm.
coarse_category_spans = [
    (0, 1, 'PC Headsets / Headphones'),
    (1, 8, 'Access'),
    (8, 9, 'Tickets (figure)'),
    (9, 10, 'Delivery of goods'),
    (10, 18, 'Consoles'),
    (18, 25, 'Consoles Games'),
    (25, 26, 'Accessories for games'),
    (26, 28, 'phone games'),
    (28, 32, 'CD games'),
    (32, 37, 'Card'),
    (37, 43, 'Movie'),
    (43, 55, 'Books'),
    (55, 61, 'Music'),
    (61, 73, 'Gifts'),
    (73, 79, 'Soft'),
    (79, 81, 'Office'),
    (81, 83, 'Clean'),
    (83, 84, 'Elements of a food'),
]
l_cat = list(item_cat.item_category_name)
for first_row, stop_row, coarse_name in coarse_category_spans:
    for ind in range(first_row, stop_row):
        l_cat[ind] = coarse_name

# Encode the coarse names as integer ids and keep both next to the raw columns.
lb = preprocessing.LabelEncoder()
item_cat['item_category_id_fix'] = lb.fit_transform(l_cat)
item_cat['item_category_name_fix'] = l_cat

# Swap the raw category id on every train/test row for the coarse id.
train_test_set = (
    train_test_set
    .merge(item_cat[['item_category_id', 'item_category_id_fix']], on = 'item_category_id', how = 'left')
    .drop(['item_category_id'], axis=1)
    .rename(columns = {'item_category_id_fix': 'item_category_id'})
)

# Reduce the lookup table to one row per coarse category, under the original column names.
item_cat = (
    item_cat
    .drop(['item_category_id', 'item_category_name'], axis=1)
    .rename(columns = {'item_category_id_fix': 'item_category_id',
                       'item_category_name_fix': 'item_category_name'})
    .drop_duplicates()
)
item_cat.index = np.arange(0, len(item_cat))

We will compute several lag features. To do this, we first set the lookback range.

In [6]:
# Offsets (in date_block_num units, i.e. months) at which the lag features below
# are built: 1, 2 and 3 months back, plus 12 months back.
lookback_range = [1, 2, 3, 12]

In [7]:
# Lagged monthly sales per (shop, item): for every offset in lookback_range, add a
# column holding item_cnt_month of the same shop/item `diff` months earlier.
shopitem_keys = ['shop_id', 'item_id', 'date_block_num']
for diff in tqdm(lookback_range):
    feature_name = 'prev_shopitem_sales_' + str(diff)
    # Copy only the join keys and the target instead of the whole (growing) frame.
    trainset2 = train_test_set[shopitem_keys + ['item_cnt_month']].copy()
    # Moving the source rows forward by `diff` months makes them line up with the
    # rows they are the "diff months ago" value for.
    trainset2['date_block_num'] += diff
    trainset2 = trainset2.rename(columns={'item_cnt_month': feature_name})
    train_test_set = train_test_set.merge(trainset2, on = shopitem_keys, how = 'left')
    # Rows with no matching earlier month get 0.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
    new_features.append(feature_name)

100%|██████████| 4/4 [00:35<00:00,  8.77s/it]


In [8]:
# Show the first three rows to inspect the new shop-item lag columns.
train_test_set.head(3)

Unnamed: 0,date_block_num,item_cnt_month,item_id,item_price,shop_id,item_category_id,prev_shopitem_sales_1,prev_shopitem_sales_2,prev_shopitem_sales_3,prev_shopitem_sales_12
0,0,1.0,22154,999.0,59,11,0.0,0.0,0.0,0.0
1,0,5.0,22154,999.0,25,11,0.0,0.0,0.0,0.0
2,0,1.0,22154,999.0,24,11,0.0,0.0,0.0,0.0


In [9]:
# Lagged mean monthly sales per item: for every offset in lookback_range, add the
# mean item_cnt_month over all rows of the same item `diff` months earlier.
groups = train_test_set.groupby(by = ['item_id', 'date_block_num'])
# The per-(item, month) mean does not depend on the lag, so aggregate once
# instead of once per loop iteration.
item_month_sales = groups.agg({'item_cnt_month':'mean'}).reset_index()
for diff in tqdm(lookback_range):
    feature_name = 'prev_item_sales_' + str(diff)
    result = item_month_sales.copy()
    # Shift the aggregated months forward so they join onto the rows `diff` months later.
    result['date_block_num'] += diff
    result = result.rename(columns={'item_cnt_month': feature_name})
    train_test_set = train_test_set.merge(result, on = ['item_id', 'date_block_num'], how = 'left')
    # Rows with no matching earlier month get 0.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
    new_features.append(feature_name)

train_test_set.head(3)

100%|██████████| 4/4 [00:11<00:00,  3.03s/it]


Unnamed: 0,date_block_num,item_cnt_month,item_id,item_price,shop_id,item_category_id,prev_shopitem_sales_1,prev_shopitem_sales_2,prev_shopitem_sales_3,prev_shopitem_sales_12,prev_item_sales_1,prev_item_sales_2,prev_item_sales_3,prev_item_sales_12
0,0,1.0,22154,999.0,59,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,5.0,22154,999.0,25,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1.0,22154,999.0,24,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Lagged mean price per (shop, item): for every offset in lookback_range, add the
# mean item_price of the same shop/item `diff` months earlier.
groups = train_test_set.groupby(by = ['shop_id', 'item_id', 'date_block_num'])
# The per-(shop, item, month) mean does not depend on the lag, so aggregate once
# instead of once per loop iteration.
shopitem_month_price = groups.agg({'item_price':'mean'}).reset_index()
for diff in tqdm(lookback_range):
    feature_name = 'prev_shopitem_price_' + str(diff)
    result = shopitem_month_price.copy()
    # Shift the aggregated months forward so they join onto the rows `diff` months later.
    result['date_block_num'] += diff
    result = result.rename(columns={'item_price': feature_name})
    train_test_set = train_test_set.merge(result, on = ['shop_id', 'item_id', 'date_block_num'], how = 'left')
    # Unlike the sales lags, missing price lags are not filled: they stay NaN.
    new_features.append(feature_name)

train_test_set.head(3)

100%|██████████| 4/4 [00:45<00:00, 12.02s/it]


Unnamed: 0,date_block_num,item_cnt_month,item_id,item_price,shop_id,item_category_id,prev_shopitem_sales_1,prev_shopitem_sales_2,prev_shopitem_sales_3,prev_shopitem_sales_12,prev_item_sales_1,prev_item_sales_2,prev_item_sales_3,prev_item_sales_12,prev_shopitem_price_1,prev_shopitem_price_2,prev_shopitem_price_3,prev_shopitem_price_12
0,0,1.0,22154,999.0,59,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
1,0,5.0,22154,999.0,25,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,0,1.0,22154,999.0,24,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,


In [11]:
# Lagged mean price per item: for every offset in lookback_range, add the mean
# item_price over all rows of the same item `diff` months earlier.
groups = train_test_set.groupby(by = ['item_id', 'date_block_num'])
# The per-(item, month) mean does not depend on the lag, so aggregate once
# instead of once per loop iteration.
item_month_price = groups.agg({'item_price':'mean'}).reset_index()
for diff in tqdm(lookback_range):
    feature_name = 'prev_item_price_' + str(diff)
    result = item_month_price.copy()
    # Shift the aggregated months forward so they join onto the rows `diff` months later.
    result['date_block_num'] += diff
    result = result.rename(columns={'item_price': feature_name})
    train_test_set = train_test_set.merge(result, on = ['item_id', 'date_block_num'], how = 'left')
    # Unlike the sales lags, missing price lags are not filled: they stay NaN.
    new_features.append(feature_name)

train_test_set.head(3)

100%|██████████| 4/4 [00:14<00:00,  4.01s/it]


Unnamed: 0,date_block_num,item_cnt_month,item_id,item_price,shop_id,item_category_id,prev_shopitem_sales_1,prev_shopitem_sales_2,prev_shopitem_sales_3,prev_shopitem_sales_12,...,prev_item_sales_3,prev_item_sales_12,prev_shopitem_price_1,prev_shopitem_price_2,prev_shopitem_price_3,prev_shopitem_price_12,prev_item_price_1,prev_item_price_2,prev_item_price_3,prev_item_price_12
0,0,1.0,22154,999.0,59,11,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
1,0,5.0,22154,999.0,25,11,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
2,0,1.0,22154,999.0,24,11,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,


In [15]:
# List every column now present in the combined train/test frame.
train_test_set.columns

Index(['date_block_num', 'item_cnt_month', 'item_id', 'item_price', 'shop_id',
       'item_category_id', 'prev_shopitem_sales_1', 'prev_shopitem_sales_2',
       'prev_shopitem_sales_3', 'prev_shopitem_sales_12', 'prev_item_sales_1',
       'prev_item_sales_2', 'prev_item_sales_3', 'prev_item_sales_12',
       'prev_shopitem_price_1', 'prev_shopitem_price_2',
       'prev_shopitem_price_3', 'prev_shopitem_price_12', 'prev_item_price_1',
       'prev_item_price_2', 'prev_item_price_3', 'prev_item_price_12'],
      dtype='object')

In [16]:
# The target and its shop-item lags (copies of item_cnt_month with gaps filled by 0)
# are stored as 16-bit integers.
count_columns = [
    'item_cnt_month',
    'prev_shopitem_sales_1',
    'prev_shopitem_sales_2',
    'prev_shopitem_sales_3',
    'prev_shopitem_sales_12',
]
for column in count_columns:
    train_test_set[column] = train_test_set[column].astype(np.int16)

# The item-level lags are group *means*, so they are generally fractional.
# Casting them to int16 would truncate every value toward zero (e.g. 0.7 -> 0);
# a compact float dtype keeps the information.
mean_columns = [
    'prev_item_sales_1',
    'prev_item_sales_2',
    'prev_item_sales_3',
    'prev_item_sales_12',
]
for column in mean_columns:
    train_test_set[column] = train_test_set[column].astype(np.float32)

In [21]:
# Persist the combined frame with the basic lag features.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default — confirm downstream readers expect that.
# NOTE(review): this cell's execution count (21) is higher than that of the cells
# after it (18, 19), so the notebook was run out of order — confirm whether the file
# is meant to be written before or after the downcast.
train_test_set.to_csv('data/train_test_set_basic_lags2.csv')

In [18]:
from preprocessing import downcast_dtypes

In [19]:
# Rebind the frame to whatever downcast_dtypes returns.
# NOTE(review): downcast_dtypes is imported from a separate `preprocessing` module
# that is not shown here; presumably it shrinks numeric dtypes to save memory — confirm.
train_test_set = downcast_dtypes(train_test_set)

In [None]:
def create_last_sale_shop_item(row, max_lookback=33):
    """Return the smallest lag at which the row's shop-item pair had non-zero sales.

    Looks at the columns ``_prev_shopitem_sales_<lag>`` for lag = 1..max_lookback
    in increasing order and returns the first lag whose value is not 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        One record holding some or all of the ``_prev_shopitem_sales_<lag>`` values.
    max_lookback : int, default 33
        Largest lag to inspect.

    Returns
    -------
    int or float
        The first lag with a non-zero value, or ``np.nan`` when none is found.
    """
    for diff in range(1, max_lookback + 1):
        feature_name = '_prev_shopitem_sales_' + str(diff)
        # Only some lags may have been generated; skip absent ones instead of
        # raising KeyError on the first missing column.
        if feature_name not in row:
            continue
        if row[feature_name] != 0.0:
            return diff
    return np.nan

# Build helper lag columns `_prev_shopitem_sales_<lag>` for lags 1..3, then derive
# `last_sale_shop_item`: the smallest of those lags with non-zero sales, NaN if none.
lookback_range = list(range(1, 4))
shopitem_keys = ['shop_id', 'item_id', 'date_block_num']
for diff in tqdm(lookback_range):
    feature_name = '_prev_shopitem_sales_' + str(diff)
    # Copy only the join keys and the target instead of the whole (growing) frame.
    trainset2 = train_test_set[shopitem_keys + ['item_cnt_month']].copy()
    trainset2['date_block_num'] += diff
    trainset2 = trainset2.rename(columns={'item_cnt_month': feature_name})
    train_test_set = train_test_set.merge(trainset2, on = shopitem_keys, how = 'left')
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
    #new_features.append(feature_name)

# Vectorised form of applying create_last_sale_shop_item to every row, restricted to
# the lags generated above. The row-wise version relied on DataFrame.progress_apply,
# which only exists after tqdm.pandas() has been called (no such call is visible in
# this notebook), and it looked up lag columns 4..33 that this cell never creates.
lag_columns = ['_prev_shopitem_sales_' + str(diff) for diff in lookback_range]
had_sale = train_test_set[lag_columns].values != 0
# argmax returns the position of the first True in each row (0 when there is none,
# which the any() mask below turns into NaN).
first_hit = had_sale.argmax(axis=1)
train_test_set['last_sale_shop_item'] = np.where(
    had_sale.any(axis=1),
    np.asarray(lookback_range, dtype=float)[first_hit],
    np.nan,
)
new_features.append('last_sale_shop_item')

 39%|███▉      | 13/33 [02:07<03:33, 10.67s/it]

In [None]:
def create_last_sale_item(row, max_lookback=33):
    """Return the smallest lag at which the row's item had non-zero mean sales.

    Looks at the columns ``_prev_item_sales_<lag>`` for lag = 1..max_lookback in
    increasing order and returns the first lag whose value is not 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        One record holding some or all of the ``_prev_item_sales_<lag>`` values.
    max_lookback : int, default 33
        Largest lag to inspect.

    Returns
    -------
    int or float
        The first lag with a non-zero value, or ``np.nan`` when none is found.
    """
    for diff in range(1, max_lookback + 1):
        feature_name = '_prev_item_sales_' + str(diff)
        # The lag columns are only generated conditionally; skip absent ones
        # instead of raising KeyError on the first missing column.
        if feature_name not in row:
            continue
        if row[feature_name] != 0.0:
            return diff
    return np.nan
# Build the `last_sale_item` feature from item-level lag columns.
# NOTE(review): `kernel_with_output` and `enable_feature_idea` are not defined in the
# visible part of this notebook — confirm where they come from.
if kernel_with_output:
    # Lags 1..33 are considered here.
    lookback_range = list(range(1, 33 + 1))
    if enable_feature_idea[1]:
        groups = train_test_set.groupby(by = ['item_id', 'date_block_num'])
        for diff in tqdm(lookback_range):
            feature_name = '_prev_item_sales_' + str(diff)
            # Mean item_cnt_month per (item, month), shifted forward by `diff` months
            # so it joins onto the rows `diff` months later.
            result = groups.agg({'item_cnt_month':'mean'})
            result = result.reset_index()
            result.loc[:, 'date_block_num'] += diff
            result.rename(columns={'item_cnt_month': feature_name}, inplace=True)
            train_test_set = train_test_set.merge(result, on = ['item_id', 'date_block_num'], how = 'left')
            # Rows with no matching earlier month get 0.
            train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
            # Here the helper lag columns themselves are registered as features.
            new_features.append(feature_name)        
    # This line runs even when enable_feature_idea[1] is false, in which case the
    # `_prev_item_sales_<lag>` columns read by create_last_sale_item were not created above.
    # NOTE(review): progress_apply is presumably tqdm's pandas integration, which needs
    # tqdm.pandas() to be called first; no such call is visible here — confirm.
    train_test_set.loc[:, 'last_sale_item'] = train_test_set.progress_apply (lambda row: create_last_sale_item(row),axis=1)