In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd

In [229]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [165]:
items['item_id'] = pd.to_numeric(items['item_id'],downcast='unsigned')
items['item_category_id'] = pd.to_numeric(items['item_category_id'],downcast='unsigned')

In [166]:
sales_train['date'] = sales_train['date'].astype('category')
sales_train['date_block_num'] = pd.to_numeric(sales_train['date_block_num'],downcast='unsigned')
sales_train['shop_id'] = pd.to_numeric(sales_train['shop_id'],downcast='unsigned')
sales_train['item_price'] = sales_train['item_price'].astype('int')
sales_train['item_price'] = pd.to_numeric(sales_train['item_price'],downcast='unsigned')
sales_train['item_cnt_day'] = pd.to_numeric(sales_train['item_cnt_day'],downcast='signed')


In [167]:
transactions = sales_train
transactions[['day','month', 'year']] = transactions['date'].str.split('.', expand=True).astype(int)
transactions['day'] = pd.to_numeric(transactions['day'],downcast='unsigned')
transactions['month'] = pd.to_numeric(transactions['month'],downcast='unsigned')
transactions['year'] = pd.to_numeric(transactions['year'],downcast='unsigned')


In [168]:
transactions = transactions.set_index('item_id').join(items.set_index('item_id'))
transactions.reset_index(inplace=True)
transactions = transactions[transactions['year'] != 2013]
transactions['y'] = pd.to_numeric(transactions.groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day'].transform('sum')\
                .clip(0,20), downcast='unsigned')

In [169]:
len(transactions)

1668287

In [170]:
transactions['turnover'] = transactions['item_price'] * transactions['item_cnt_day']
transactions['turnover'] = pd.to_numeric(transactions['turnover'], downcast='unsigned')

In [171]:
transactions['item_first_block'] = pd.to_numeric(transactions.groupby('item_id')['date_block_num'].transform(np.min), downcast='unsigned')
transactions['item_last_block'] = pd.to_numeric(transactions.groupby('item_id')['date_block_num'].transform(np.max), downcast='unsigned')

transactions['is_first_two_blocks'] = \
                    transactions['date_block_num'].isin([transactions['item_first_block']+1,transactions['item_first_block']+2])


transactions['is_last_two_blocks'] = \
                transactions['date_block_num'].isin([transactions['item_last_block']-1,transactions['item_last_block']])
                                

In [172]:
number_of_items = transactions['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = transactions['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = transactions['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = transactions['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = transactions['item_cnt_day'].sum()
print("total_sales:", total_sales)
total_turnover = transactions['turnover'].sum()
print("total_turnover:", total_turnover)
average_price = transactions['item_price'].mean()
print("average_price:", average_price)

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473
total_turnover: 2181307117
average_price: 1015.4701882829513


#ITEM

-UNITS
item_units
item_block_units
item_mean_units_block
item_day_units
item_mean_units_day
item_max_units_block
item_min_units_block
item_max_units_day
item_min_units_day

-TURNOVER
item_turnover
item_block_turnover
item_mean_turnover_block
item_day_turnover
item_mean_turnover_day
item_max_turnover_block
item_min_turnover_block
item_max_turnover_day
item_min_turnover_day


-TIME
item_days_of_activity
item_blocks_of_activity
item_mean_day_between_activity
item_longest_stretch_days_without_activity
item_longest_stretch_blocks_without_activity
item_longest_stretch_block_with_activity
item_number_of_consecutive_days_with_activity
item_days_between_start_and_first_activity
item_blocks_between_start_and_first_activity
item_first_block
item_last_block
item_first_day
item_last_day
item_activity_on_all_blocks


-PRICE
item_mean_price
item_mean_price_block
item_min_price
item_max_price
item_number_different_prices
item_price_amplitude (%age min/max)
item_deviation_mean_category_price


-TREND
is_first_two_full_blocks (actually second/third to make sure we have a "full" block if this was a new release !!!!
is_last_two_blocks
item_first_two_blocks_units
item_last_two_blocks_units
item_fluctuation_units_first_last_blocks
item_first_two_blocks_mean_price
item_last_two_blocks_mean_price
item_fluctuation_price_first_last_blocks

-ENCODINGS
item_share_of_total_units
item_share_of_total_gross
item_share_of_category_units
item_share_of_category_turnover

In [173]:
gc.collect()
transactions_items = transactions.copy()
transactions_items_blocks = transactions.copy()

In [174]:
transactions_items_blocks['item_block_units'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned')    
transactions_items_blocks['item_block_turnover'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned')    
transactions_items_blocks['item_mean_price_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id', 'date_block_num'])['item_price'].transform(np.mean), downcast='float')    

In [175]:
transactions_items['item_units'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_items['item_mean_units_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform(np.mean), downcast='float') 
transactions_items['item_day_units'] = pd.to_numeric(transactions_items.groupby(['item_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_items['item_mean_units_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_units'].transform(np.mean), downcast='float') 
transactions_items['item_max_units_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform(np.max), downcast='unsigned') 
transactions_items['item_min_units_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform(np.min), downcast='unsigned') 
transactions_items['item_max_units_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_units'].transform(np.max), downcast='unsigned') 
transactions_items['item_min_units_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_units'].transform(np.min), downcast='unsigned') 

In [176]:
transactions_items['item_turnover'] = pd.to_numeric(transactions_items.groupby(['item_id'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_items['item_mean_turnover_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform(np.mean), downcast='float') 
transactions_items['item_day_turnover'] = pd.to_numeric(transactions_items.groupby(['item_id','date'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_items['item_mean_turnover_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['turnover'].transform(np.mean), downcast='float') 
transactions_items['item_max_turnover_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform(np.max), downcast='unsigned') 
transactions_items['item_min_turnover_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform(np.min), downcast='unsigned') 
transactions_items['item_max_turnover_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_turnover'].transform(np.max), downcast='unsigned') 
transactions_items['item_min_turnover_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_turnover'].transform(np.min), downcast='unsigned') 

In [177]:
transactions_items['item_days_of_activity'] = pd.to_numeric(transactions_items.groupby(['item_id'])['date'].transform("nunique"), downcast='unsigned') 
transactions_items['item_blocks_of_activity'] = pd.to_numeric(transactions_items.groupby(['item_id'])['date_block_num'].transform("nunique"), downcast='unsigned') 

def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

transactions_items['item_days_since_start'] = pd.to_numeric(transactions_items.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_average_days_between_sales(x))

transactions_items['item_mean_day_between_activity'] = pd.to_numeric(transactions_items['item_id'].map(average_days_between_sales), downcast='unsigned') 


def get_max_stretch_without_sales_days(days):
    days = np.unique(days)
    max_stretch = 0
    len_days = len(days)
    for index,day in enumerate(sorted(days)):
        if index == len_days - 1:
            return max_stretch
        next_day = days[index+1]
        stretch = next_day - day
        if stretch > max_stretch:
            max_stretch = stretch
            

        
max_stretch_without_sales_day = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_max_stretch_without_sales_days(x))

transactions_items['item_longest_stretch_days_without_activity'] = pd.to_numeric(transactions_items['item_id'].map(max_stretch_without_sales_day), downcast='unsigned') 

In [178]:
gc.collect()

def get_max_stretch_without_sales_block(blocks):
    blocks = np.unique(blocks)
    max_stretch = 0
    len_blocks = len(blocks)
    for index,block in enumerate(sorted(blocks)):
        if index == len_blocks - 1:
            return max_stretch
        next_block = blocks[index+1]
        stretch = next_block - block
        if stretch > max_stretch:
            max_stretch = stretch
            

        
item_longest_stretch_blocks_without_activity = transactions_items.groupby(['item_id'])['date_block_num']\
                                    .apply(list).apply(lambda x: get_max_stretch_without_sales_block(x))

transactions_items['item_longest_stretch_blocks_without_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_longest_stretch_blocks_without_activity), downcast='unsigned') 



def get_longest_stretch(following_pairs, n=1,new_n=1):
    #print("following_pairs", following_pairs, " n: ", n, " new_n: ", new_n)
    len_pairs = len(following_pairs)
    if len_pairs == 0:
        return 0
    if len_pairs == 1:
        if new_n > n:
            return new_n
        return n
    if following_pairs[1][0] == following_pairs[0][1]:
        new_n+=1
    else:
        if new_n > n:
            n=new_n
        new_n=1
    return get_longest_stretch(following_pairs[1:], n,new_n)


assert(get_longest_stretch([]) == 0)
assert(get_longest_stretch([[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[-1, 0],[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[20, 21], [25,26]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[14, 15], [15,16]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[14, 15], [15,16],[18,19] ,[22,23], [23,24],[24, 25]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5], [7, 8]]) == 1)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [15, 16]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [15, 16], [16, 17]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [14, 15], [15, 16], [16, 17]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [20, 21], [21, 22], [22,23],[23,24]]) == 4)


def get_following_pairs(pairs):
    pairs = np.unique(pairs)
    len_pairs = len(pairs)
    following = []
    for index,pair in enumerate(sorted(pairs)):
        if index == len_pairs - 1:
            return following
        next_pair = pairs[index+1]
        if next_pair == pair + 1:
            following.append([pair, next_pair])
        
assert(get_following_pairs([1,2,5,6,7,8,9,11,12,15]) == [[1, 2], [5, 6], [6, 7], [7, 8], [8, 9], [11, 12]])
assert(get_following_pairs([1,2,5,6,7,10]) == [[1, 2], [5, 6], [6, 7]])
assert(get_following_pairs([1,2,4,5,7,9,10]) == [[1, 2], [4, 5], [9,10]])
assert(get_following_pairs([1,2,4,5,7,9,10,11,12,15]) == [[1, 2], [4, 5], [9,10],[10,11],[11,12]])


item_longest_stretch_block_with_activity = transactions_items.groupby(['item_id'])['date_block_num']\
                                    .apply(list).apply(lambda x: get_longest_stretch(get_following_pairs(x)))

transactions_items['item_longest_stretch_block_with_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_longest_stretch_block_with_activity), downcast='unsigned') 


item_number_of_consecutive_days_with_activity = transactions_items.groupby(['item_id'])['item_days_since_start']\
                                    .apply(list).apply(lambda x: len(get_following_pairs(x)))
    
transactions_items['item_number_of_consecutive_days_with_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_number_of_consecutive_days_with_activity), downcast='unsigned') 

In [179]:
def get_units_between_first_and_last(units):
    return np.max(units) - np.min(units)

item_days_between_start_and_first_activity = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_units_between_first_and_last(x))
transactions_items['item_days_between_start_and_first_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_days_between_start_and_first_activity), downcast='unsigned') 

item_blocks_between_start_and_first_activity = transactions_items.groupby(['item_id'])['date_block_num'].apply(list).apply(lambda x: get_units_between_first_and_last(x))
transactions_items['item_blocks_between_start_and_first_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_blocks_between_start_and_first_activity), downcast='unsigned') 

In [180]:

transactions_items['item_first_day'] = pd.to_numeric(transactions_items.groupby('item_id')['item_days_since_start'].transform(np.min), downcast='unsigned') 
transactions_items['item_last_day'] = pd.to_numeric(transactions_items.groupby('item_id')['item_days_since_start'].transform(np.max), downcast='unsigned') 

item_activity_on_all_blocks = transactions_items.groupby('item_id')['date_block_num'].nunique().apply(lambda x: x==number_of_blocks)
transactions_items['item_activity_on_all_blocks'] = transactions_items['item_id'].map(item_activity_on_all_blocks)

In [181]:
transactions_items['item_mean_price'] = pd.to_numeric(transactions_items.groupby('item_id')['item_price'].transform(np.mean), downcast='float') 
transactions_items['item_min_price'] = pd.to_numeric(transactions_items.groupby('item_id')['item_price'].transform(np.min), downcast='unsigned') 
transactions_items['item_max_price'] = pd.to_numeric(transactions_items.groupby('item_id')['item_price'].transform(np.max), downcast='unsigned') 
transactions_items['item_number_different_prices'] = pd.to_numeric(transactions_items.groupby('item_id')['item_price'].transform('nunique'), downcast='unsigned') 
transactions_items['item_price_amplitude'] = pd.to_numeric(((transactions_items['item_max_price'] - transactions_items['item_min_price'] ) / transactions_items['item_min_price']) * 100, downcast='float') 
transactions_items['category_mean_price'] = pd.to_numeric(transactions_items.groupby('item_category_id')['item_price'].transform(np.mean), downcast='float') 
transactions_items['item_deviation_mean_category_price'] =  pd.to_numeric(((transactions_items['item_mean_price'] - transactions_items['category_mean_price'] ) / transactions_items['category_mean_price']) * 100, downcast='float') 

In [182]:
transactions_items['item_share_of_total_units'] = pd.to_numeric(transactions_items['item_units'] * 100 / total_sales , downcast='float') 

transactions_items['item_share_of_total_turnover'] = pd.to_numeric(transactions_items['item_turnover'] * 100 / total_turnover, downcast='float') 

transactions_items['category_units'] = pd.to_numeric(transactions_items.groupby('item_category_id')['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_items['item_share_of_category_units'] = pd.to_numeric(transactions_items['item_units'] * 100 / transactions_items['category_units'], downcast='float') 

transactions_items['category_turnover'] = pd.to_numeric(transactions_items.groupby('item_category_id')['turnover'].transform(np.sum), downcast='unsigned') 
transactions_items['item_share_of_category_turnover'] = pd.to_numeric(transactions_items['item_turnover'] * 100 / transactions_items['category_turnover'], downcast='float') 


In [183]:
item_first_two_blocks_units = transactions_items.groupby(['item_id','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
item_first_two_blocks_units = item_first_two_blocks_units[item_first_two_blocks_units['is_first_two_blocks'] == True].set_index('item_id').iloc[:,1]
transactions_items['item_first_two_blocks_units'] = pd.to_numeric(transactions_items['item_id'].map(item_first_two_blocks_units), downcast='unsigned') 

item_last_two_blocks_units = transactions_items.groupby(['item_id','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
item_last_two_blocks_units = item_last_two_blocks_units[item_last_two_blocks_units['is_last_two_blocks'] == True].set_index('item_id').iloc[:,1]
transactions_items['item_last_two_blocks_units'] = pd.to_numeric(transactions_items['item_id'].map(item_last_two_blocks_units), downcast='unsigned') 

transactions_items['item_fluctuation_units_first_last_blocks'] =  pd.to_numeric(\
transactions_items['item_first_two_blocks_units'] - transactions_items['item_last_two_blocks_units']\
                                                , downcast='signed') 


In [184]:
transactions_items.to_pickle("pickled/transactions_items")
transactions_items_blocks.to_pickle("pickled/transactions_items_blocks")

del transactions_items
del transactions_items_blocks
gc.collect()

252

#CATEGORY

-UNITS
category_units
category_block_units
category_mean_units_block
category_day_units
category_mean_units_day
category_max_units_block
category_min_units_block
category_max_units_day
category_min_units_day

-TURNOVER
category_turnover
category_block_turnover
category_mean_turnover_block
category_day_turnover
category_mean_turnover_day
category_max_turnover_block
category_min_turnover_block
category_max_turnover_day
category_min_turnover_day


-PRICE
category_mean_price
category_mean_price_block
category_min_price
category_max_price


-TREND
category_first_two_blocks_units
category_last_two_blocks_units
category_fluctuation_units_first_last_blocks
category_first_two_blocks_mean_price
category_last_two_blocks_mean_price
category_fluctuation_price_first_last_blocks

-SUBCATEGORY
subcategory
subcategory 1hot

-UNITS
subcategory_units
subcategory_block_units
subcategory_mean_units_block
subcategory_day_units
subcategory_mean_units_day
subcategory_max_units_block
subcategory_min_units_block
subcategory_max_units_day
subcategory_min_units_day

-TURNOVER
subcategory_turnover
subcategory_block_turnover
subcategory_mean_turnover_block
subcategory_day_turnover
subcategory_mean_turnover_day
subcategory_max_turnover_block
subcategory_min_turnover_block
subcategory_max_turnover_day
subcategory_min_turnover_day

-ENCODINGS
category_share_of_total_units
category_share_of_total_gross
subcategory_share_of_total_units
subcategory_share_of_total_gross

-TREND
subcategory_first_two_blocks_units
subcategory_last_two_blocks_units
subcategory_fluctuation_units_first_last_blocks
subcategory_first_two_blocks_mean_price
subcategory_last_two_blocks_mean_price
subcategory_fluctuation_price_first_last_blocks

In [185]:
gc.collect()
transactions_categories = transactions.copy()
transactions_categories_blocks = transactions.copy()

In [186]:
transactions_categories_blocks['category_block_units'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_categories_blocks['category_block_turnover'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_categories_blocks['category_mean_price_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id', 'date_block_num'])['item_price'].transform(np.mean), downcast='float') 


In [187]:
transactions_categories['category_units'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_categories['category_mean_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform(np.mean), downcast='float') 
transactions_categories['category_day_units'] = pd.to_numeric(transactions_categories.groupby(['item_category_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_categories['category_mean_units_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_units'].transform(np.mean), downcast='float') 
transactions_categories['category_max_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform(np.max), downcast='unsigned') 
transactions_categories['category_min_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform(np.min), downcast='unsigned') 
transactions_categories['category_max_units_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_units'].transform(np.max), downcast='unsigned') 
transactions_categories['category_min_units_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_units'].transform(np.min), downcast='unsigned') 

In [188]:
transactions_categories['category_turnover'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_categories['category_mean_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform(np.mean), downcast='float') 
transactions_categories['category_day_turnover'] = pd.to_numeric(transactions_categories.groupby(['item_category_id','date'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_categories['category_mean_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform(np.mean), downcast='float') 
transactions_categories['category_max_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform(np.max), downcast='unsigned') 
transactions_categories['category_min_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform(np.min), downcast='unsigned') 
transactions_categories['category_max_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform(np.max), downcast='unsigned') 
transactions_categories['category_min_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform(np.min), downcast='unsigned') 

In [189]:
transactions_categories['category_mean_price'] = pd.to_numeric(transactions_categories.groupby('item_category_id')['item_price'].transform(np.mean), downcast='float') 
transactions_categories['category_min_price'] = pd.to_numeric(transactions_categories.groupby('item_category_id')['item_price'].transform(np.min), downcast='unsigned')
transactions_categories['category_max_price'] = pd.to_numeric(transactions_categories.groupby('item_category_id')['item_price'].transform(np.max), downcast='unsigned')

In [190]:
transactions_categories['category_share_of_total_units'] = pd.to_numeric(transactions_categories['category_units'] * 100 / total_sales , downcast='float') 
transactions_categories['category_share_of_total_turnover'] = pd.to_numeric(transactions_categories['category_turnover']* 100 / total_turnover, downcast='float') 



In [191]:

transactions_categories.to_pickle("pickled/transactions_categories")
transactions_categories_blocks.to_pickle("pickled/transactions_categories_blocks")

del transactions_categories
del transactions_categories_blocks
gc.collect()

70

#SHOP

-UNITS
shop_units
shop_block_units
shop_mean_units_block
shop_day_units
shop_mean_units_day
shop_max_units_block
shop_min_units_block
shop_max_units_day
shop_min_units_day

-TURNOVER
shop_turnover
shop_block_turnover
shop_mean_turnover_block
shop_day_turnover
shop_mean_turnover_day
shop_max_turnover_block
shop_min_turnover_block
shop_max_turnover_day
shop_min_turnover_day

-PRICE
shop_mean_price
shop_mean_price_block


-TREND
shop_first_two_blocks_units
shop_last_two_blocks_units
shop_fluctuation_units_first_last_blocks
shop_first_two_blocks_mean_price
shop_last_two_blocks_mean_price
shop_fluctuation_price_first_last_blocks

-ENCODINGS
shop_share_of_total_units
shop_share_of_total_gross

-MISC
shop_ids_TC
shop_ids_TRK
shop_ids_SEC
shop_ids_shopping_center
shop_ids_moscow

-CATEGORY
shop_top_category_units
shop_top_category_turnover
shop_top_subcategory_units
shop_top_subcategory_turnover

In [206]:
gc.collect()
transactions_shops = transactions.copy()
transactions_shops_blocks = transactions.copy()

In [207]:
transactions_shops_blocks['shop_block_units'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_blocks['shop_block_turnover'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_blocks['shop_mean_price_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id', 'date_block_num'])['item_price'].transform(np.mean), downcast='float') 

In [208]:
transactions_shops['shop_units'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops['shop_mean_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_units'].transform(np.mean), downcast='float') 
transactions_shops['shop_day_units'] = pd.to_numeric(transactions_shops.groupby(['shop_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops['shop_mean_units_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_units'].transform(np.mean), downcast='float') 
transactions_shops['shop_max_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_units'].transform(np.max), downcast='unsigned')
transactions_shops['shop_min_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_units'].transform(np.min), downcast='unsigned')
transactions_shops['shop_max_units_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_units'].transform(np.max), downcast='unsigned')
transactions_shops['shop_min_units_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_units'].transform(np.min), downcast='unsigned')

In [209]:
transactions_shops['shop_turnover'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops['shop_mean_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_turnover'].transform(np.mean), downcast='float')
transactions_shops['shop_day_turnover'] = pd.to_numeric(transactions_shops.groupby(['shop_id','date'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops['shop_mean_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_turnover'].transform(np.mean), downcast='float')
transactions_shops['shop_max_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_turnover'].transform(np.max), downcast='unsigned')
transactions_shops['shop_min_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_turnover'].transform(np.min), downcast='unsigned')
transactions_shops['shop_max_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_turnover'].transform(np.max), downcast='unsigned')
transactions_shops['shop_min_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_turnover'].transform(np.min), downcast='unsigned')

In [210]:
transactions_shops['shop_mean_price'] = pd.to_numeric(transactions_shops.groupby('shop_id')['item_price'].transform(np.mean), downcast='float') 


In [211]:
transactions_shops['shop_share_of_units'] = pd.to_numeric(transactions_shops['shop_units'] * 100 / total_sales, downcast='float') 
transactions_shops['shop_share_of_turnover'] = pd.to_numeric(transactions_shops['shop_turnover'] * 100 / total_turnover, downcast='float') 

In [212]:
a = transactions_shops.groupby(['shop_id', 'item_category_id'],as_index=False)['item_cnt_day'].sum()

b = pd.DataFrame(transactions_shops.groupby(['shop_id', 'item_category_id'],as_index=False)['item_cnt_day'].sum()\
                  .groupby(['shop_id'])['item_cnt_day'].max()).reset_index()

c = b.merge(a, on=['shop_id', 'item_cnt_day'],how='left')[['shop_id', 'item_category_id']].rename(columns={'item_category_id':'max_category_units'})

transactions_shops = transactions_shops.merge(c, on=['shop_id'], how='left')

transactions_shops['max_category_units'] = pd.to_numeric(transactions_shops['max_category_units'], downcast='unsigned')

In [213]:
a = transactions_shops.groupby(['shop_id', 'item_category_id'],as_index=False)['turnover'].sum()

b = pd.DataFrame(transactions_shops.groupby(['shop_id', 'item_category_id'],as_index=False)['turnover'].sum()\
                  .groupby(['shop_id'])['turnover'].max()).reset_index()

c = b.merge(a, on=['shop_id', 'turnover'],how='left')[['shop_id', 'item_category_id']].rename(columns={'item_category_id':'max_category_turnover'})

transactions_shops = transactions_shops.merge(c, on=['shop_id'], how='left')
transactions_shops['max_category_turnover'] = pd.to_numeric(transactions_shops['max_category_turnover'], downcast='unsigned')

In [214]:
shop_first_two_blocks_units = transactions_shops.groupby(['shop_id','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
shop_first_two_blocks_units = shop_first_two_blocks_units[shop_first_two_blocks_units['is_first_two_blocks'] == True].set_index('shop_id').iloc[:,1]
transactions_shops['shop_first_two_blocks_units'] = pd.to_numeric(transactions_shops['shop_id'].map(shop_first_two_blocks_units), downcast='unsigned')

shop_last_two_blocks_units = transactions_shops.groupby(['shop_id','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
shop_last_two_blocks_units = shop_last_two_blocks_units[shop_last_two_blocks_units['is_last_two_blocks'] == True].set_index('shop_id').iloc[:,1]
transactions_shops['shop_last_two_blocks_units'] = pd.to_numeric(transactions_shops['shop_id'].map(shop_last_two_blocks_units), downcast='unsigned')

transactions_shops['shop_fluctuation_units_first_last_blocks'] =  pd.to_numeric(\
                transactions_shops['shop_first_two_blocks_units'] - transactions_shops['shop_last_two_blocks_units']\
                        , downcast='float') 

In [215]:

transactions_shops.to_pickle("pickled/transactions_shops")
transactions_shops_blocks.to_pickle("pickled/transactions_shops_blocks")


del transactions_shops
del transactions_shops_blocks
gc.collect()

266

shop_category


-UNITS
shop_category_units
shop_category_block_units
shop_category_mean_units_block
shop_category_day_units
shop_category_mean_units_day
shop_category_max_units_block
shop_category_min_units_block
shop_category_max_units_day
shop_category_min_units_day

-TURNOVER
shop_category_turnover
shop_category_block_turnover
shop_category_mean_turnover_block
shop_category_day_turnover
shop_category_mean_turnover_day
shop_category_max_turnover_block
shop_category_min_turnover_block
shop_category_max_turnover_day
shop_category_min_turnover_day

-PRICE
shop_category_mean_price
shop_category_mean_price_block


-TREND
shop_category_first_two_blocks_units
shop_category_last_two_blocks_units
shop_category_fluctuation_units_first_last_blocks
shop_category_first_two_blocks_mean_price
shop_category_last_two_blocks_mean_price
shop_category_fluctuation_price_first_last_blocks

-ENCODINGS
shop_category_share_of_total_units
shop_category_share_of_total_gross

In [216]:
gc.collect()
transactions_shops_categories = transactions.copy()
transactions_shops_categories_blocks = transactions.copy()

In [217]:
transactions_shops_categories_blocks['shop_category_block_units'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_categories_blocks['shop_category_block_turnover'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_categories_blocks['shop_category_mean_price_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id', 'date_block_num'])['item_price'].transform(np.mean), downcast='float') 

In [218]:
transactions_shops_categories['shop_category_units'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_categories['shop_category_mean_units_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_units'].transform(np.mean), downcast='float') 
transactions_shops_categories['shop_category_day_units'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_categories['shop_category_mean_units_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_units'].transform(np.mean), downcast='float') 
transactions_shops_categories['shop_category_max_units_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_units'].transform(np.max), downcast='unsigned')
transactions_shops_categories['shop_category_min_units_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_units'].transform(np.min), downcast='unsigned')
transactions_shops_categories['shop_category_max_units_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_units'].transform(np.max), downcast='unsigned')
transactions_shops_categories['shop_category_min_units_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_units'].transform(np.min), downcast='unsigned')


In [219]:
transactions_shops_categories['shop_category_turnover'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_categories['shop_category_mean_turnover_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_turnover'].transform(np.mean), downcast='float') 
transactions_shops_categories['shop_category_day_turnover'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id','date'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_categories['shop_category_mean_turnover_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_turnover'].transform(np.mean), downcast='float') 
transactions_shops_categories['shop_category_max_turnover_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_turnover'].transform(np.max), downcast='unsigned')
transactions_shops_categories['shop_category_min_turnover_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_turnover'].transform(np.min), downcast='unsigned')
transactions_shops_categories['shop_category_max_turnover_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_turnover'].transform(np.max), downcast='unsigned')
transactions_shops_categories['shop_category_min_turnover_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_turnover'].transform(np.min), downcast='unsigned')

In [220]:
transactions_shops_categories['shop_category_mean_price'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['item_price'].transform(np.mean), downcast='float') 


In [221]:

transactions_shops_categories.to_pickle("pickled/transactions_shops_categories")
transactions_shops_categories_blocks.to_pickle("pickled/transactions_shops_categories_blocks")


del transactions_shops_categories
del transactions_shops_categories_blocks
gc.collect()

91

shop_item


UNITS
shop_item_units
shop_item_day_units
shop_item_mean_units_day
shop_item_max_units_block
shop_item_min_units_block
shop_item_max_units_day
shop_item_min_units_day

-TURNOVER
shop_item_turnover
shop_item_day_turnover
shop_item_mean_turnover_day
shop_item_max_turnover_block
shop_item_min_turnover_block
shop_item_max_turnover_day
shop_item_min_turnover_day

-PRICE
shop_item_mean_price


-ENCODINGS
shop_item_share_of_total_units
shop_item_share_of_total_turnover
shop_item_share_of_shop_units
shop_item_share_of_shop_turnover


In [222]:
gc.collect()
transactions_shops_items = transactions.copy()
transactions_shops_items_blocks = transactions.copy()

In [223]:
transactions_shops_items['shop_item_units'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_items['shop_item_day_units'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_items['shop_item_mean_units_day'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['shop_item_day_units'].transform(np.mean), downcast='float') 
transactions_shops_items['shop_item_max_units_day'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['shop_item_day_units'].transform(np.max), downcast='unsigned')
transactions_shops_items['shop_item_min_units_day'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['shop_item_day_units'].transform(np.min), downcast='unsigned')



In [224]:
transactions_shops_items['shop_item_turnover'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_items['shop_item_day_turnover'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id','date'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_items['shop_item_mean_turnover_day'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['shop_item_day_turnover'].transform(np.mean), downcast='float') 
transactions_shops_items['shop_item_max_turnover_day'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['shop_item_day_turnover'].transform(np.max), downcast='unsigned') 
transactions_shops_items['shop_item_min_turnover_day'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['shop_item_day_turnover'].transform(np.min), downcast='unsigned')                                                                                        

In [225]:
transactions_shops_items['shop_item_mean_price'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id','item_id'])['item_price'].transform(np.mean), downcast='float') 


In [226]:
transactions_shops_items['shop_item_share_of_total_units'] = pd.to_numeric(transactions_shops_items['shop_item_units'] * 100 / total_sales, downcast='float') 
transactions_shops_items['shop_item_share_of_total_turnover'] = pd.to_numeric(transactions_shops_items['shop_item_turnover'] * 100 / total_turnover, downcast='float') 

transactions_shops_items['shop_units'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_items['shop_turnover'] = pd.to_numeric(transactions_shops_items.groupby(['shop_id'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_items['shop_item_share_of_shop_units'] = pd.to_numeric(transactions_shops_items['shop_item_units'] * 100 / transactions_shops_items['shop_units'], downcast='float') 
transactions_shops_items['shop_item_share_of_shop_turnover'] = pd.to_numeric(transactions_shops_items['shop_item_turnover'] * 100 / transactions_shops_items['shop_turnover'], downcast='float') 
transactions_shops_items.drop(columns=['shop_units', 'shop_turnover'],inplace=True)

In [227]:

transactions_shops_items.to_pickle("pickled/transactions_shops_items")


del transactions_shops_items
gc.collect()

112

In [63]:
###
#DEBUG
###


pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
#transactions.sample(10).sort_values(by=['item_units'], ascending=False)
#transactions[transactions['item_category_id'] == 58].sample(10).sort_values(by=['total_sales_units'], ascending=False)

In [54]:
train_item_ids = transactions['item_id'].unique()
train_shop_ids = transactions['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = transactions['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [55]:
len(test_item_ids)

5100

In [56]:
combinations = []
for shop in all_shop_ids:
    #get all article ids ever associated to this shop
    train_ids = transactions[transactions['shop_id'] == shop]['item_id'].unique()
    test_ids = test[test['shop_id'] == shop]['item_id'].unique()
    all_shop = np.unique(np.append(train_ids, test_ids))
    all_shop_combo = [[item, shop, block] for item in all_shop for block in train_blocks]
    for combo in all_shop_combo:
        combinations.append(combo)

In [57]:
len(combinations)

8333930

In [58]:
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['item_id', 'shop_id', 'date_block_num'])

In [59]:
del combinations
gc.collect()

0

In [60]:
all_combos.head()

Unnamed: 0,item_id,shop_id,date_block_num
0,0.0,54.0,12.0
1,0.0,54.0,13.0
2,0.0,54.0,14.0
3,0.0,54.0,15.0
4,0.0,54.0,16.0


In [61]:
all_combos['item_id'] = pd.to_numeric(all_combos['item_id'], downcast='unsigned')
all_combos['shop_id'] = pd.to_numeric(all_combos['shop_id'], downcast='unsigned')
all_combos['date_block_num'] = pd.to_numeric(all_combos['date_block_num'], downcast='unsigned')

In [62]:
len(all_combos)

8333930

In [63]:
all_combos = pd.merge(all_combos, items[['item_id', 'item_category_id']], on='item_id', how='left')

In [64]:
dates = transactions[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
dates_dict

{20: {'month': 9, 'year': 2014},
 15: {'month': 4, 'year': 2014},
 18: {'month': 7, 'year': 2014},
 19: {'month': 8, 'year': 2014},
 21: {'month': 10, 'year': 2014},
 22: {'month': 11, 'year': 2014},
 23: {'month': 12, 'year': 2014},
 24: {'month': 1, 'year': 2015},
 27: {'month': 4, 'year': 2015},
 25: {'month': 2, 'year': 2015},
 12: {'month': 1, 'year': 2014},
 14: {'month': 3, 'year': 2014},
 16: {'month': 5, 'year': 2014},
 17: {'month': 6, 'year': 2014},
 13: {'month': 2, 'year': 2014},
 26: {'month': 3, 'year': 2015},
 28: {'month': 5, 'year': 2015},
 29: {'month': 6, 'year': 2015},
 30: {'month': 7, 'year': 2015},
 31: {'month': 8, 'year': 2015},
 32: {'month': 9, 'year': 2015},
 33: {'month': 10, 'year': 2015}}

In [65]:
all_combos['month'] = pd.to_numeric(all_combos['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')
all_combos['year'] = pd.to_numeric(all_combos['date_block_num'].apply(lambda block: dates_dict[block]['year']), downcast='unsigned')

In [66]:
def downcast(df, columns, dtypes):
    for column in columns:
        dtype = dtypes[column].kind
        if dtype in ['O', 'b']:
            continue  
        if dtype == 'u':
            df[column] = pd.to_numeric(df[column].astype(int), downcast='unsigned')
        elif dtype == 'i':
            df[column] = pd.to_numeric(df[column].astype(int), downcast='signed')
        else:
            df[column] = pd.to_numeric(df[column], downcast='float')

In [67]:
def fillnas(df, columns, dtypes):
    for column in columns:
        dtype = dtypes[column].kind
        if dtype in ['O', 'b']:
            continue  
        df[column].fillna(0, inplace=True)

In [68]:
training = all_combos
del all_combos
gc.collect()

35

In [69]:
transactions_items = pd.read_pickle("pickled/transactions_items")
transactions_items_columns = [c for c in transactions_items.columns.values\
                              if "item_" in c and c not in ['item_price', 'item_category_id', 'item_cnt_day']]
transactions_items_dtypes = transactions_items.dtypes
transactions_items.drop_duplicates('item_id', inplace=True)
training = pd.merge(training, transactions_items[transactions_items_columns], on=['item_id'], how='left', copy=False)

del transactions_items
fillnas(training, transactions_items_columns, transactions_items_dtypes)
downcast(training, transactions_items_columns, transactions_items_dtypes)
gc.collect()

35

In [70]:
transactions_items_blocks_columns =  [ 'date_block_num', 'item_id', 'item_block_units',
 'item_block_turnover',
 'item_mean_price_block']

transactions_items_blocks = pd.read_pickle("pickled/transactions_items_blocks")
transactions_items_blocks_dtypes = transactions_items_blocks.dtypes

transactions_items_blocks.drop_duplicates(['item_id', 'date_block_num'], inplace=True)

training = pd.merge(training, transactions_items_blocks[transactions_items_blocks_columns]\
                    , on=['item_id','date_block_num'], how='left', copy=False)

del transactions_items_blocks
fillnas(training, transactions_items_blocks_columns, transactions_items_blocks_dtypes)
downcast(training, transactions_items_blocks_columns, transactions_items_blocks_dtypes)
gc.collect()

14

In [71]:
transactions_categories = pd.read_pickle("pickled/transactions_categories")
transactions_categories_columns = [c for c in transactions_categories.columns.values\
                              if "category_" in c ]

transactions_categories_dtypes = transactions_categories.dtypes
transactions_categories.drop_duplicates(['item_category_id'], inplace=True)

training = pd.merge(training, transactions_categories[transactions_categories_columns]\
                    , on=['item_category_id'], how='left', copy=False)

del transactions_categories
fillnas(training, transactions_categories_columns, transactions_categories_dtypes)
downcast(training, transactions_categories_columns, transactions_categories_dtypes)
gc.collect()

14

In [73]:
transactions_categories_blocks_columns = ['item_category_id', 'date_block_num', 'category_block_units',
 'category_block_turnover',
 'category_mean_price_block']

transactions_categories_blocks = pd.read_pickle("pickled/transactions_categories_blocks")
transactions_categories_blocks_dtypes = transactions_categories_blocks.dtypes
transactions_categories_blocks.drop_duplicates(['item_category_id', 'date_block_num'], inplace=True)

training = pd.merge(training, transactions_categories_blocks[transactions_categories_blocks_columns]\
                    , on=['item_category_id', 'date_block_num'], how='left', copy=False)

del transactions_categories_blocks
fillnas(training, transactions_categories_blocks_columns, transactions_categories_blocks_dtypes)
downcast(training, transactions_categories_blocks_columns, transactions_categories_blocks_dtypes)
gc.collect()

72

In [74]:
transactions_shops = pd.read_pickle("pickled/transactions_shops")
transactions_shops_columns = [c for c in transactions_shops.columns.values\
                              if "shop_" in c ]

transactions_shops_dtypes = transactions_shops.dtypes
transactions_shops.drop_duplicates(['shop_id'], inplace=True)

training = pd.merge(training, transactions_shops[transactions_shops_columns]\
                    , on=['shop_id'], how='left', copy=False)

del transactions_shops
fillnas(training, transactions_shops_columns, transactions_shops_dtypes)
downcast(training, transactions_shops_columns, transactions_shops_dtypes)
gc.collect()

14

In [75]:
transactions_shops_blocks_columns = ['shop_id', 'date_block_num',  'shop_block_units',
 'shop_block_turnover',
 'shop_mean_price_block', ]

transactions_shops_blocks = pd.read_pickle("pickled/transactions_shops_blocks")
transactions_shops_blocks_dtypes = transactions_shops_blocks.dtypes
transactions_shops_blocks.drop_duplicates(['shop_id', 'date_block_num'], inplace=True)

training = pd.merge(training, transactions_shops_blocks[transactions_shops_blocks_columns]\
                    , on=['shop_id', 'date_block_num'], how='left', copy=False)

del transactions_shops_blocks
fillnas(training, transactions_shops_blocks_columns, transactions_shops_blocks_dtypes)
downcast(training, transactions_shops_blocks_columns, transactions_shops_blocks_dtypes)
gc.collect()

14

In [76]:
transactions_shops_categories = pd.read_pickle("pickled/transactions_shops_categories")
transactions_shops_categories_columns = [c for c in transactions_shops_categories.columns.values\
                              if "shop_category" in c or c in ['shop_id', 'item_category_id']]
transactions_shops_categories_dtypes = transactions_shops_categories.dtypes
transactions_shops_categories.drop_duplicates(['shop_id', 'item_category_id'], inplace=True)

training = pd.merge(training, transactions_shops_categories[transactions_shops_categories_columns]\
                    , on=['shop_id','item_category_id'], how='left', copy=False)

del transactions_shops_categories
fillnas(training, transactions_shops_categories_columns, transactions_shops_categories_dtypes)
downcast(training, transactions_shops_categories_columns, transactions_shops_categories_dtypes)
gc.collect()

14

In [77]:
transactions_shops_categories_blocks_columns = ['shop_id', 'item_category_id', 'date_block_num',   'shop_category_block_units',
 'shop_category_block_turnover',
 'shop_category_mean_price_block']

transactions_shops_categories_blocks = pd.read_pickle("pickled/transactions_shops_categories_blocks")
transactions_shops_categories_blocks_dtypes = transactions_shops_categories_blocks.dtypes
transactions_shops_categories_blocks.drop_duplicates(['shop_id', 'item_category_id', 'date_block_num'], inplace=True)

training = pd.merge(training, transactions_shops_categories_blocks[transactions_shops_categories_blocks_columns]\
                    ,\
                    on=['shop_id','item_category_id','date_block_num'], how='left', copy=False)

del transactions_shops_categories_blocks
fillnas(training, transactions_shops_categories_blocks_columns, transactions_shops_categories_blocks_dtypes)
downcast(training, transactions_shops_categories_blocks_columns, transactions_shops_categories_blocks_dtypes)
gc.collect()

14

In [78]:
transactions_shops_items = pd.read_pickle("pickled/transactions_shops_items")
transactions_shops_items_columns = [c for c in transactions_shops_items.columns.values\
                              if "shop_item" in c or c in ['shop_id', 'item_id']]
transactions_shops_items_dtypes = transactions_shops_items.dtypes
transactions_shops_items.drop_duplicates(['shop_id', 'item_id'], inplace=True)

training = pd.merge(training, transactions_shops_items[transactions_shops_items_columns]\
                    , on=['shop_id','item_id'], how='left', copy=False)

del transactions_shops_items
fillnas(training, transactions_shops_items_columns, transactions_shops_items_dtypes)
downcast(training, transactions_shops_items_columns, transactions_shops_items_dtypes)
gc.collect()

14

In [79]:
len(training)

8333930

In [80]:
#del training
gc.collect()
#training.to_pickle("pickled/training_pre_lags")
#training = pd.read_pickle("pickled/training_pre_lags")

0

In [82]:
lag_columns = [
 'item_block_units',
 'item_block_turnover',
 'item_mean_price_block',
 'category_block_units',
 'category_block_turnover',
 'category_mean_price_block',
 'shop_block_units',
 'shop_block_turnover',
 'shop_mean_price_block',
 'shop_category_block_units',
 'shop_category_turnover',
 'shop_category_mean_price_block'
 ]

In [83]:
def downcast_lags(df, lagged_names):
    for lagged_name in lagged_names:
        df[lagged_name].fillna(0,inplace=True)    
    for column in lagged_names:
        if "mean" in column:
            df[column] = pd.to_numeric(df[column], downcast='float')
        else:
            df[column] = pd.to_numeric(df[column].astype(int), downcast='unsigned')
    return df

In [84]:
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)


In [85]:
lags = [1,2,3]
#lags = [1]


def add_lag_features(df, lag_columns, idx_columns):

    gc.collect()

    merge_columns = ['lagged_block'] + idx_columns

    for lag in lags:
        print(lag)
        lagged = df[['date_block_num'] + idx_columns + lag_columns].copy()
        lagged.rename(columns={'date_block_num':'lagged_block'},inplace=True)
        df['lagged_block'] = df['date_block_num'] - lag
        lagged_names = [lagged_name(c,lag) for c in lag_columns]
        lag_mapping = dict(zip(lag_columns, lagged_names))
        lagged.rename(columns=lag_mapping,inplace=True)
        
        df.set_index(merge_columns, inplace=True)
        #lagged.drop(columns=lag_columns, inplace=True)
        lagged.drop_duplicates(lagged_names+merge_columns, inplace=True)
        lagged.set_index(merge_columns, inplace=True)
        
        df = pd.merge(df, lagged,on=merge_columns,how='left',copy=False)
        gc.collect()
        df.reset_index(inplace=True)
    
        df = downcast_lags(df, lagged_names)
        del lagged
        gc.collect()
        
    return df

In [86]:
gc.collect()


lag_columns = [
 'item_block_units',
 'item_block_turnover',
 'item_mean_price_block',
 ]

idx_columns = ['item_id']

training = add_lag_features(training,lag_columns,idx_columns)

1
2
3


In [87]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training[(training['item_id'] == 30) & (training['shop_id'] == 30)]\
                .drop_duplicates(['item_id', 'date_block_num'])[['item_id','shop_id','date_block_num','item_block_units','item_block_turnover',\
                'item_block_units_lag_1',
 'item_block_turnover_lag_1', 'item_mean_price_block_lag_1',
 'item_block_units_lag_2', 'item_block_turnover_lag_2',
 'item_mean_price_block_lag_2', 'item_block_units_lag_3',
 'item_block_turnover_lag_3' ,'item_mean_price_block_lag_3'
                                                                ]]

Unnamed: 0,item_id,shop_id,date_block_num,item_block_units,item_block_turnover,item_block_units_lag_1,item_block_turnover_lag_1,item_mean_price_block_lag_1,item_block_units_lag_2,item_block_turnover_lag_2,item_mean_price_block_lag_2,item_block_units_lag_3,item_block_turnover_lag_3,item_mean_price_block_lag_3
1210,30,30,12,58,9802,0,0,0.0,0,0,0.0,0,0,0.0
1211,30,30,13,24,3986,58,9802,169.0,0,0,0.0,0,0,0.0
1212,30,30,14,31,5239,24,3986,166.083328,58,9802,169.0,0,0,0.0
1213,30,30,15,21,3479,31,5239,169.0,24,3986,166.083328,58,9802,169.0
1214,30,30,16,16,2634,21,3479,165.666672,31,5239,169.0,24,3986,166.083328
1215,30,30,17,13,2197,16,2634,164.625,21,3479,165.666672,31,5239,169.0
1216,30,30,18,13,2127,13,2197,169.0,16,2634,164.625,21,3479,165.666672
1217,30,30,19,12,2028,13,2127,163.615387,13,2197,169.0,16,2634,164.625
1218,30,30,20,11,1859,12,2028,169.0,13,2127,163.615387,13,2197,169.0
1219,30,30,21,13,2197,11,1859,169.0,12,2028,169.0,13,2127,163.615387


In [88]:
gc.collect()


lag_columns = [
 'category_block_units',
 'category_block_turnover',
 'category_mean_price_block',
 ]

idx_columns = ['item_category_id']

training = add_lag_features(training,lag_columns,idx_columns)

1
2
3


In [89]:
gc.collect()


lag_columns = [
 'shop_block_units',
 'shop_block_turnover',
 'shop_mean_price_block',
 ]

idx_columns = ['shop_id']


training = add_lag_features(training,lag_columns,idx_columns)

1
2
3


In [90]:
#gc.collect()
#training.to_pickle("pickled/training_mid_lags")
#training = pd.read_pickle("pickled/training_mid_lags")

In [91]:
lag_columns = [
  'shop_category_block_units',
 'shop_category_turnover',
 'shop_category_mean_price_block'
 ]

idx_columns = ['shop_id','item_category_id']


training = add_lag_features(training,lag_columns,idx_columns)

1
2
3


In [92]:
training.drop(columns=['lagged_block'],inplace=True)

In [93]:
training.columns.values

array(['shop_id', 'item_category_id', 'item_id', 'date_block_num',
       'month', 'year', 'item_first_block', 'item_last_block',
       'item_units', 'item_mean_units_block', 'item_day_units',
       'item_mean_units_day', 'item_max_units_block',
       'item_min_units_block', 'item_max_units_day', 'item_min_units_day',
       'item_turnover', 'item_mean_turnover_block', 'item_day_turnover',
       'item_mean_turnover_day', 'item_max_turnover_block',
       'item_min_turnover_block', 'item_max_turnover_day',
       'item_min_turnover_day', 'item_days_of_activity',
       'item_blocks_of_activity', 'item_days_since_start',
       'item_mean_day_between_activity',
       'item_longest_stretch_days_without_activity',
       'item_longest_stretch_blocks_without_activity',
       'item_longest_stretch_block_with_activity',
       'item_number_of_consecutive_days_with_activity',
       'item_days_between_start_and_first_activity',
       'item_blocks_between_start_and_first_activity', 'item

In [95]:
gc.collect()
training.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8333930 entries, 0 to 8333929
Columns: 162 entries, shop_id to shop_category_mean_price_block_lag_3
dtypes: float32(55), int16(22), int32(23), int64(19), int8(2), object(1), uint16(22), uint32(8), uint64(2), uint8(8)
memory usage: 5.0 GB


In [96]:
cols = ['shop_id','item_id', 'date_block_num']
training.set_index(cols, inplace=True)
transactions.drop_duplicates(cols, inplace=True)
transactions.set_index(cols, inplace=True)

training = pd.merge(training, transactions['y'], on=cols, how='left', copy=False)

training.reset_index(inplace=True)
training['y'] = training['y'].fillna(0)

In [97]:
for m in range(1,13):
    training[str(m)] = training['month'] == m

In [98]:
gc.collect()
training.to_pickle("pickled/training_post_lags")
#training = pd.read_pickle("pickled/training_post_lags")

In [118]:
gc.collect()
val = training[training['date_block_num'] == 33]
print("val length", len(val))

unique_pairs_val = list(set(list(zip(val.shop_id, val.item_id))))
print("number of unique shop/item pairs in val", len(unique_pairs_val))
unique_pairs_val_ignore = unique_pairs_val[0:int(len(unique_pairs_val)/2)]


def tuple2key(t):
    return "%d_%d" % (t[0], t[1])

val_pairs_ignore_dict = {}
for t in unique_pairs_val_ignore:
    val_pairs_ignore_dict[tuple2key(t)] = 1
 
    
training['val_ignore'] = (training['shop_id'].astype(str) + '_' +  training['item_id'].astype(str))\
                                    .apply(lambda x: x in val_pairs_ignore_dict)

val length 378815
number of unique shop/item pairs in val 378815
lol


In [120]:
len(training[training['val_ignore'] == True])

4166954

In [125]:
x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
y_train = x_train['y']

x_val = training[training['date_block_num'] == 33]
y_val = x_val['y']


In [126]:
del training
gc.collect()

1971

In [12]:
#x_train.to_pickle("pickled/x_train")
x_train = pd.read_pickle("pickled/x_train")
#y_train.to_pickle("pickled/y_train")
y_train = pd.read_pickle("pickled/y_train")
#x_val.to_pickle("pickled/x_val")
x_val = pd.read_pickle("pickled/x_val")
#y_val.to_pickle("pickled/y_val")
y_val = pd.read_pickle("pickled/y_val")

In [130]:
len(x_train)

3977568

In [128]:
x_train.columns.values

array(['shop_id', 'item_id', 'date_block_num', 'item_category_id',
       'month', 'year', 'item_first_block', 'item_last_block',
       'item_units', 'item_mean_units_block', 'item_day_units',
       'item_mean_units_day', 'item_max_units_block',
       'item_min_units_block', 'item_max_units_day', 'item_min_units_day',
       'item_turnover', 'item_mean_turnover_block', 'item_day_turnover',
       'item_mean_turnover_day', 'item_max_turnover_block',
       'item_min_turnover_block', 'item_max_turnover_day',
       'item_min_turnover_day', 'item_days_of_activity',
       'item_blocks_of_activity', 'item_days_since_start',
       'item_mean_day_between_activity',
       'item_longest_stretch_days_without_activity',
       'item_longest_stretch_blocks_without_activity',
       'item_longest_stretch_block_with_activity',
       'item_number_of_consecutive_days_with_activity',
       'item_days_between_start_and_first_activity',
       'item_blocks_between_start_and_first_activity', 'item

In [153]:
cb_features = [
       'month',  'item_first_block', 'item_last_block',
       'item_units', 'item_mean_units_block', 'item_day_units',
       'item_mean_units_day', 'item_max_units_block',
       'item_min_units_block', 'item_max_units_day', 'item_min_units_day',
       'item_turnover', 'item_mean_turnover_block', 'item_day_turnover',
       'item_mean_turnover_day', 'item_max_turnover_block',
       'item_min_turnover_block', 'item_max_turnover_day',
       'item_min_turnover_day', 'item_days_of_activity',
       'item_blocks_of_activity', 'item_days_since_start',
       'item_mean_day_between_activity',
       'item_longest_stretch_days_without_activity',
       'item_longest_stretch_blocks_without_activity',
       'item_longest_stretch_block_with_activity',
       'item_number_of_consecutive_days_with_activity',
       'item_days_between_start_and_first_activity',
       'item_blocks_between_start_and_first_activity', 'item_first_day',
       'item_last_day', 'item_activity_on_all_blocks', 'item_mean_price',
       'item_min_price', 'item_max_price', 'item_number_different_prices',
       'item_price_amplitude', 'item_deviation_mean_category_price',
       'item_share_of_total_units', 'item_share_of_total_turnover',
       'item_share_of_category_units', 'item_share_of_category_turnover',
       'item_first_two_blocks_units', 'item_last_two_blocks_units',
       'item_fluctuation_units_first_last_blocks', 
        'category_units',
       'category_mean_units_block', 'category_day_units',
       'category_mean_units_day', 'category_max_units_block',
       'category_min_units_block', 'category_max_units_day',
       'category_min_units_day', 'category_turnover',
       'category_mean_turnover_block', 'category_day_turnover',
       'category_mean_turnover_day', 'category_max_turnover_block',
       'category_min_turnover_block', 'category_max_turnover_day',
       'category_min_turnover_day', 'category_mean_price',
       'category_min_price', 'category_max_price',
       'category_share_of_total_units',
       'category_share_of_total_turnover',
       'shop_units', 'shop_mean_units_block', 'shop_day_units',
       'shop_mean_units_day', 'shop_max_units_block',
       'shop_min_units_block', 'shop_max_units_day', 'shop_min_units_day',
       'shop_first_two_blocks_units', 'shop_last_two_blocks_units',
       'shop_fluctuation_units_first_last_blocks', 
       'shop_category_units', 'shop_category_mean_units_block',
       'shop_category_day_units', 'shop_category_mean_units_day',
       'shop_category_max_units_block', 'shop_category_min_units_block',
       'shop_category_max_units_day', 'shop_category_min_units_day',
       'shop_category_turnover', 'shop_category_mean_turnover_block',
       'shop_category_day_turnover', 'shop_category_mean_turnover_day',
       'shop_category_max_turnover_block',
       'shop_category_min_turnover_block',
       'shop_category_max_turnover_day', 'shop_category_min_turnover_day',
       'shop_category_mean_price', 
       'shop_item_units', 'shop_item_day_units',
       'shop_item_mean_units_day', 'shop_item_max_units_day',
       'shop_item_min_units_day', 'shop_item_turnover',
       'shop_item_day_turnover', 'shop_item_mean_turnover_day',
       'shop_item_max_turnover_day', 'shop_item_min_turnover_day',
       'shop_item_mean_price', 'shop_item_share_of_total_units',
       'shop_item_share_of_total_turnover',
       'shop_item_share_of_shop_units',
       'shop_item_share_of_shop_turnover', 'item_block_units_lag_1',
       'item_block_turnover_lag_1', 'item_mean_price_block_lag_1',
       'item_block_units_lag_2', 'item_block_turnover_lag_2',
       'item_mean_price_block_lag_2', 'item_block_units_lag_3',
       'item_block_turnover_lag_3', 'item_mean_price_block_lag_3',
       'category_block_units_lag_1', 'category_block_turnover_lag_1',
       'category_mean_price_block_lag_1', 'category_block_units_lag_2',
       'category_block_turnover_lag_2', 'category_mean_price_block_lag_2',
       'category_block_units_lag_3', 'category_block_turnover_lag_3',
       'category_mean_price_block_lag_3', 'shop_block_units_lag_1',
       'shop_block_turnover_lag_1', 'shop_mean_price_block_lag_1',
       'shop_block_units_lag_2', 'shop_block_turnover_lag_2',
       'shop_mean_price_block_lag_2', 'shop_block_units_lag_3',
       'shop_block_turnover_lag_3', 'shop_mean_price_block_lag_3',
       'shop_category_block_units_lag_1', 'shop_category_turnover_lag_1',
       'shop_category_mean_price_block_lag_1',
       'shop_category_block_units_lag_2', 'shop_category_turnover_lag_2',
       'shop_category_mean_price_block_lag_2',
       'shop_category_block_units_lag_3', 'shop_category_turnover_lag_3',
       'shop_category_mean_price_block_lag_3', '1', '2', '3', '4',
       '5', '6', '7', '8', '9', '10', '11', '12']


In [268]:
cb_model = CatBoostRegressor(iterations=70000,
                             learning_rate=0.001,
                             eval_metric='RMSE',
                             #thread_count=16,
                             task_type = "GPU",
                             use_best_model=True,
                             #l2_leaf_reg = 1000,
                             od_type = "Iter",
                             od_wait = 30,
                             #random_strength = 10,
                             #bagging_temperature = 1,
                             #one_hot_max_size = 2,
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[cb_features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[cb_features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

0:	learn: 1.0706011	test: 0.9236358	best: 0.9236358 (0)	total: 25.9ms	remaining: 30m 13s
1:	learn: 1.0701656	test: 0.9233525	best: 0.9233525 (1)	total: 52.8ms	remaining: 30m 47s
2:	learn: 1.0697323	test: 0.9230716	best: 0.9230716 (2)	total: 78.3ms	remaining: 30m 27s
3:	learn: 1.0693000	test: 0.9227891	best: 0.9227891 (3)	total: 103ms	remaining: 30m 10s
4:	learn: 1.0688696	test: 0.9225083	best: 0.9225083 (4)	total: 128ms	remaining: 29m 55s
5:	learn: 1.0684390	test: 0.9222269	best: 0.9222269 (5)	total: 154ms	remaining: 29m 58s
6:	learn: 1.0680086	test: 0.9219474	best: 0.9219474 (6)	total: 179ms	remaining: 29m 46s
7:	learn: 1.0675786	test: 0.9216666	best: 0.9216666 (7)	total: 203ms	remaining: 29m 39s
8:	learn: 1.0671510	test: 0.9213874	best: 0.9213874 (8)	total: 228ms	remaining: 29m 34s
9:	learn: 1.0667227	test: 0.9211097	best: 0.9211097 (9)	total: 258ms	remaining: 30m 5s
10:	learn: 1.0662951	test: 0.9208323	best: 0.9208323 (10)	total: 283ms	remaining: 30m
11:	learn: 1.0658691	test: 0.920

97:	learn: 1.0315724	test: 0.8986866	best: 0.8986866 (97)	total: 2.42s	remaining: 28m 45s
98:	learn: 1.0311997	test: 0.8984558	best: 0.8984558 (98)	total: 2.44s	remaining: 28m 46s
99:	learn: 1.0308270	test: 0.8982230	best: 0.8982230 (99)	total: 2.47s	remaining: 28m 47s
100:	learn: 1.0304546	test: 0.8979908	best: 0.8979908 (100)	total: 2.49s	remaining: 28m 46s
101:	learn: 1.0300841	test: 0.8977586	best: 0.8977586 (101)	total: 2.52s	remaining: 28m 45s
102:	learn: 1.0297141	test: 0.8975280	best: 0.8975280 (102)	total: 2.54s	remaining: 28m 46s
103:	learn: 1.0293448	test: 0.8972968	best: 0.8972968 (103)	total: 2.57s	remaining: 28m 45s
104:	learn: 1.0289743	test: 0.8970663	best: 0.8970663 (104)	total: 2.59s	remaining: 28m 45s
105:	learn: 1.0286056	test: 0.8968361	best: 0.8968361 (105)	total: 2.62s	remaining: 28m 45s
106:	learn: 1.0282391	test: 0.8966095	best: 0.8966095 (106)	total: 2.64s	remaining: 28m 45s
107:	learn: 1.0278725	test: 0.8963823	best: 0.8963823 (107)	total: 2.67s	remaining: 28

187:	learn: 1.0002610	test: 0.8795283	best: 0.8795283 (187)	total: 4.61s	remaining: 28m 30s
188:	learn: 0.9999379	test: 0.8793334	best: 0.8793334 (188)	total: 4.63s	remaining: 28m 31s
189:	learn: 0.9996146	test: 0.8791390	best: 0.8791390 (189)	total: 4.66s	remaining: 28m 30s
190:	learn: 0.9992908	test: 0.8789451	best: 0.8789451 (190)	total: 4.68s	remaining: 28m 30s
191:	learn: 0.9989697	test: 0.8787525	best: 0.8787525 (191)	total: 4.7s	remaining: 28m 30s
192:	learn: 0.9986503	test: 0.8785556	best: 0.8785556 (192)	total: 4.73s	remaining: 28m 30s
193:	learn: 0.9983288	test: 0.8783635	best: 0.8783635 (193)	total: 4.75s	remaining: 28m 30s
194:	learn: 0.9980072	test: 0.8781718	best: 0.8781718 (194)	total: 4.78s	remaining: 28m 29s
195:	learn: 0.9976883	test: 0.8779825	best: 0.8779825 (195)	total: 4.8s	remaining: 28m 29s
196:	learn: 0.9973696	test: 0.8777932	best: 0.8777932 (196)	total: 4.83s	remaining: 28m 29s
197:	learn: 0.9970500	test: 0.8776024	best: 0.8776024 (197)	total: 4.85s	remaining

277:	learn: 0.9730276	test: 0.8635332	best: 0.8635332 (277)	total: 6.78s	remaining: 28m 19s
278:	learn: 0.9727453	test: 0.8633737	best: 0.8633737 (278)	total: 6.8s	remaining: 28m 19s
279:	learn: 0.9724654	test: 0.8632127	best: 0.8632127 (279)	total: 6.82s	remaining: 28m 19s
280:	learn: 0.9721836	test: 0.8630501	best: 0.8630501 (280)	total: 6.85s	remaining: 28m 19s
281:	learn: 0.9719022	test: 0.8628887	best: 0.8628887 (281)	total: 6.87s	remaining: 28m 18s
282:	learn: 0.9716229	test: 0.8627295	best: 0.8627295 (282)	total: 6.89s	remaining: 28m 18s
283:	learn: 0.9713448	test: 0.8625707	best: 0.8625707 (283)	total: 6.92s	remaining: 28m 18s
284:	learn: 0.9710657	test: 0.8624140	best: 0.8624140 (284)	total: 6.94s	remaining: 28m 18s
285:	learn: 0.9707906	test: 0.8622517	best: 0.8622517 (285)	total: 6.97s	remaining: 28m 18s
286:	learn: 0.9705113	test: 0.8620920	best: 0.8620920 (286)	total: 6.99s	remaining: 28m 18s
287:	learn: 0.9702320	test: 0.8619326	best: 0.8619326 (287)	total: 7.02s	remainin

367:	learn: 0.9493208	test: 0.8502554	best: 0.8502554 (367)	total: 8.93s	remaining: 28m 9s
368:	learn: 0.9490778	test: 0.8501220	best: 0.8501220 (368)	total: 8.95s	remaining: 28m 9s
369:	learn: 0.9488361	test: 0.8499857	best: 0.8499857 (369)	total: 8.98s	remaining: 28m 9s
370:	learn: 0.9485924	test: 0.8498510	best: 0.8498510 (370)	total: 9s	remaining: 28m 9s
371:	learn: 0.9483484	test: 0.8497166	best: 0.8497166 (371)	total: 9.03s	remaining: 28m 9s
372:	learn: 0.9481063	test: 0.8495837	best: 0.8495837 (372)	total: 9.05s	remaining: 28m 9s
373:	learn: 0.9478657	test: 0.8494512	best: 0.8494512 (373)	total: 9.07s	remaining: 28m 9s
374:	learn: 0.9476231	test: 0.8493185	best: 0.8493185 (374)	total: 9.1s	remaining: 28m 8s
375:	learn: 0.9473798	test: 0.8491856	best: 0.8491856 (375)	total: 9.12s	remaining: 28m 9s
376:	learn: 0.9471370	test: 0.8490551	best: 0.8490551 (376)	total: 9.14s	remaining: 28m 8s
377:	learn: 0.9468957	test: 0.8489223	best: 0.8489223 (377)	total: 9.17s	remaining: 28m 8s
378

466:	learn: 0.9268683	test: 0.8383154	best: 0.8383154 (466)	total: 11.3s	remaining: 28m 1s
467:	learn: 0.9266580	test: 0.8382046	best: 0.8382046 (467)	total: 11.3s	remaining: 28m 1s
468:	learn: 0.9264507	test: 0.8380993	best: 0.8380993 (468)	total: 11.3s	remaining: 28m 1s
469:	learn: 0.9262420	test: 0.8379908	best: 0.8379908 (469)	total: 11.4s	remaining: 28m 1s
470:	learn: 0.9260353	test: 0.8378830	best: 0.8378830 (470)	total: 11.4s	remaining: 28m 1s
471:	learn: 0.9258252	test: 0.8377772	best: 0.8377772 (471)	total: 11.4s	remaining: 28m 1s
472:	learn: 0.9256151	test: 0.8376728	best: 0.8376728 (472)	total: 11.4s	remaining: 28m 1s
473:	learn: 0.9254080	test: 0.8375709	best: 0.8375709 (473)	total: 11.5s	remaining: 28m
474:	learn: 0.9252018	test: 0.8374647	best: 0.8374647 (474)	total: 11.5s	remaining: 28m
475:	learn: 0.9249957	test: 0.8373576	best: 0.8373576 (475)	total: 11.5s	remaining: 28m
476:	learn: 0.9247914	test: 0.8372523	best: 0.8372523 (476)	total: 11.5s	remaining: 28m
477:	learn:

565:	learn: 0.9077148	test: 0.8287100	best: 0.8287100 (565)	total: 13.7s	remaining: 27m 57s
566:	learn: 0.9075342	test: 0.8286255	best: 0.8286255 (566)	total: 13.7s	remaining: 27m 57s
567:	learn: 0.9073577	test: 0.8285395	best: 0.8285395 (567)	total: 13.7s	remaining: 27m 57s
568:	learn: 0.9071806	test: 0.8284526	best: 0.8284526 (568)	total: 13.7s	remaining: 27m 57s
569:	learn: 0.9070051	test: 0.8283665	best: 0.8283665 (569)	total: 13.8s	remaining: 27m 57s
570:	learn: 0.9068282	test: 0.8282841	best: 0.8282841 (570)	total: 13.8s	remaining: 27m 57s
571:	learn: 0.9066509	test: 0.8282015	best: 0.8282015 (571)	total: 13.8s	remaining: 27m 57s
572:	learn: 0.9064741	test: 0.8281140	best: 0.8281140 (572)	total: 13.8s	remaining: 27m 57s
573:	learn: 0.9062977	test: 0.8280310	best: 0.8280310 (573)	total: 13.9s	remaining: 27m 57s
574:	learn: 0.9061200	test: 0.8279463	best: 0.8279463 (574)	total: 13.9s	remaining: 27m 57s
575:	learn: 0.9059420	test: 0.8278617	best: 0.8278617 (575)	total: 13.9s	remaini

655:	learn: 0.8927615	test: 0.8217293	best: 0.8217293 (655)	total: 15.8s	remaining: 27m 52s
656:	learn: 0.8926113	test: 0.8216636	best: 0.8216636 (656)	total: 15.8s	remaining: 27m 52s
657:	learn: 0.8924610	test: 0.8215943	best: 0.8215943 (657)	total: 15.9s	remaining: 27m 52s
658:	learn: 0.8923051	test: 0.8215244	best: 0.8215244 (658)	total: 15.9s	remaining: 27m 52s
659:	learn: 0.8921511	test: 0.8214541	best: 0.8214541 (659)	total: 15.9s	remaining: 27m 52s
660:	learn: 0.8919982	test: 0.8213827	best: 0.8213827 (660)	total: 15.9s	remaining: 27m 52s
661:	learn: 0.8918462	test: 0.8213128	best: 0.8213128 (661)	total: 16s	remaining: 27m 52s
662:	learn: 0.8916913	test: 0.8212437	best: 0.8212437 (662)	total: 16s	remaining: 27m 52s
663:	learn: 0.8915380	test: 0.8211724	best: 0.8211724 (663)	total: 16s	remaining: 27m 52s
664:	learn: 0.8913850	test: 0.8211019	best: 0.8211019 (664)	total: 16s	remaining: 27m 52s
665:	learn: 0.8912332	test: 0.8210337	best: 0.8210337 (665)	total: 16.1s	remaining: 27m 

745:	learn: 0.8798815	test: 0.8160960	best: 0.8160960 (745)	total: 18s	remaining: 27m 48s
746:	learn: 0.8797469	test: 0.8160365	best: 0.8160365 (746)	total: 18s	remaining: 27m 49s
747:	learn: 0.8796114	test: 0.8159809	best: 0.8159809 (747)	total: 18s	remaining: 27m 48s
748:	learn: 0.8794777	test: 0.8159270	best: 0.8159270 (748)	total: 18s	remaining: 27m 48s
749:	learn: 0.8793459	test: 0.8158747	best: 0.8158747 (749)	total: 18.1s	remaining: 27m 48s
750:	learn: 0.8792133	test: 0.8158186	best: 0.8158186 (750)	total: 18.1s	remaining: 27m 48s
751:	learn: 0.8790824	test: 0.8157599	best: 0.8157599 (751)	total: 18.1s	remaining: 27m 48s
752:	learn: 0.8789484	test: 0.8157021	best: 0.8157021 (752)	total: 18.1s	remaining: 27m 48s
753:	learn: 0.8788159	test: 0.8156469	best: 0.8156469 (753)	total: 18.2s	remaining: 27m 48s
754:	learn: 0.8786834	test: 0.8155951	best: 0.8155951 (754)	total: 18.2s	remaining: 27m 48s
755:	learn: 0.8785560	test: 0.8155408	best: 0.8155408 (755)	total: 18.2s	remaining: 27m 

835:	learn: 0.8687691	test: 0.8115942	best: 0.8115942 (835)	total: 20.1s	remaining: 27m 45s
836:	learn: 0.8686555	test: 0.8115497	best: 0.8115497 (836)	total: 20.2s	remaining: 27m 45s
837:	learn: 0.8685420	test: 0.8115024	best: 0.8115024 (837)	total: 20.2s	remaining: 27m 45s
838:	learn: 0.8684272	test: 0.8114580	best: 0.8114580 (838)	total: 20.2s	remaining: 27m 45s
839:	learn: 0.8683115	test: 0.8114110	best: 0.8114110 (839)	total: 20.2s	remaining: 27m 45s
840:	learn: 0.8681983	test: 0.8113683	best: 0.8113683 (840)	total: 20.3s	remaining: 27m 45s
841:	learn: 0.8680839	test: 0.8113244	best: 0.8113244 (841)	total: 20.3s	remaining: 27m 45s
842:	learn: 0.8679708	test: 0.8112828	best: 0.8112828 (842)	total: 20.3s	remaining: 27m 45s
843:	learn: 0.8678581	test: 0.8112385	best: 0.8112385 (843)	total: 20.3s	remaining: 27m 45s
844:	learn: 0.8677439	test: 0.8111936	best: 0.8111936 (844)	total: 20.3s	remaining: 27m 45s
845:	learn: 0.8676336	test: 0.8111529	best: 0.8111529 (845)	total: 20.4s	remaini

925:	learn: 0.8591793	test: 0.8080140	best: 0.8080140 (925)	total: 22.3s	remaining: 27m 42s
926:	learn: 0.8590786	test: 0.8079787	best: 0.8079787 (926)	total: 22.3s	remaining: 27m 42s
927:	learn: 0.8589822	test: 0.8079459	best: 0.8079459 (927)	total: 22.3s	remaining: 27m 42s
928:	learn: 0.8588818	test: 0.8079082	best: 0.8079082 (928)	total: 22.4s	remaining: 27m 42s
929:	learn: 0.8587842	test: 0.8078725	best: 0.8078725 (929)	total: 22.4s	remaining: 27m 42s
930:	learn: 0.8586863	test: 0.8078372	best: 0.8078372 (930)	total: 22.4s	remaining: 27m 42s
931:	learn: 0.8585896	test: 0.8078020	best: 0.8078020 (931)	total: 22.4s	remaining: 27m 42s
932:	learn: 0.8584937	test: 0.8077708	best: 0.8077708 (932)	total: 22.5s	remaining: 27m 42s
933:	learn: 0.8583950	test: 0.8077341	best: 0.8077341 (933)	total: 22.5s	remaining: 27m 42s
934:	learn: 0.8582986	test: 0.8077030	best: 0.8077030 (934)	total: 22.5s	remaining: 27m 42s
935:	learn: 0.8582004	test: 0.8076653	best: 0.8076653 (935)	total: 22.5s	remaini

1015:	learn: 0.8509238	test: 0.8051832	best: 0.8051832 (1015)	total: 24.4s	remaining: 27m 39s
1016:	learn: 0.8508379	test: 0.8051537	best: 0.8051537 (1016)	total: 24.5s	remaining: 27m 39s
1017:	learn: 0.8507512	test: 0.8051236	best: 0.8051236 (1017)	total: 24.5s	remaining: 27m 39s
1018:	learn: 0.8506655	test: 0.8050944	best: 0.8050944 (1018)	total: 24.5s	remaining: 27m 39s
1019:	learn: 0.8505815	test: 0.8050667	best: 0.8050667 (1019)	total: 24.5s	remaining: 27m 39s
1020:	learn: 0.8504965	test: 0.8050409	best: 0.8050409 (1020)	total: 24.6s	remaining: 27m 39s
1021:	learn: 0.8504132	test: 0.8050133	best: 0.8050133 (1021)	total: 24.6s	remaining: 27m 39s
1022:	learn: 0.8503313	test: 0.8049890	best: 0.8049890 (1022)	total: 24.6s	remaining: 27m 39s
1023:	learn: 0.8502492	test: 0.8049614	best: 0.8049614 (1023)	total: 24.6s	remaining: 27m 39s
1024:	learn: 0.8501629	test: 0.8049342	best: 0.8049342 (1024)	total: 24.7s	remaining: 27m 39s
1025:	learn: 0.8500784	test: 0.8049067	best: 0.8049067 (1025

1105:	learn: 0.8437975	test: 0.8029289	best: 0.8029289 (1105)	total: 26.6s	remaining: 27m 37s
1106:	learn: 0.8437231	test: 0.8029065	best: 0.8029065 (1106)	total: 26.6s	remaining: 27m 37s
1107:	learn: 0.8436482	test: 0.8028837	best: 0.8028837 (1107)	total: 26.7s	remaining: 27m 37s
1108:	learn: 0.8435733	test: 0.8028623	best: 0.8028623 (1108)	total: 26.7s	remaining: 27m 37s
1109:	learn: 0.8435016	test: 0.8028430	best: 0.8028430 (1109)	total: 26.7s	remaining: 27m 37s
1110:	learn: 0.8434296	test: 0.8028237	best: 0.8028237 (1110)	total: 26.7s	remaining: 27m 37s
1111:	learn: 0.8433563	test: 0.8028015	best: 0.8028015 (1111)	total: 26.8s	remaining: 27m 37s
1112:	learn: 0.8432848	test: 0.8027794	best: 0.8027794 (1112)	total: 26.8s	remaining: 27m 37s
1113:	learn: 0.8432114	test: 0.8027567	best: 0.8027567 (1113)	total: 26.8s	remaining: 27m 37s
1114:	learn: 0.8431384	test: 0.8027381	best: 0.8027381 (1114)	total: 26.8s	remaining: 27m 37s
1115:	learn: 0.8430676	test: 0.8027198	best: 0.8027198 (1115

1195:	learn: 0.8376550	test: 0.8011803	best: 0.8011803 (1195)	total: 28.8s	remaining: 27m 34s
1196:	learn: 0.8375921	test: 0.8011640	best: 0.8011640 (1196)	total: 28.8s	remaining: 27m 34s
1197:	learn: 0.8375285	test: 0.8011457	best: 0.8011457 (1197)	total: 28.8s	remaining: 27m 34s
1198:	learn: 0.8374666	test: 0.8011292	best: 0.8011292 (1198)	total: 28.8s	remaining: 27m 34s
1199:	learn: 0.8374052	test: 0.8011135	best: 0.8011135 (1199)	total: 28.9s	remaining: 27m 34s
1200:	learn: 0.8373430	test: 0.8010997	best: 0.8010997 (1200)	total: 28.9s	remaining: 27m 34s
1201:	learn: 0.8372791	test: 0.8010812	best: 0.8010812 (1201)	total: 28.9s	remaining: 27m 34s
1202:	learn: 0.8372146	test: 0.8010647	best: 0.8010647 (1202)	total: 28.9s	remaining: 27m 34s
1203:	learn: 0.8371515	test: 0.8010479	best: 0.8010479 (1203)	total: 29s	remaining: 27m 34s
1204:	learn: 0.8370892	test: 0.8010281	best: 0.8010281 (1204)	total: 29s	remaining: 27m 34s
1205:	learn: 0.8370273	test: 0.8010089	best: 0.8010089 (1205)	to

1285:	learn: 0.8323566	test: 0.7998108	best: 0.7998108 (1285)	total: 30.9s	remaining: 27m 32s
1286:	learn: 0.8323004	test: 0.7997963	best: 0.7997963 (1286)	total: 30.9s	remaining: 27m 32s
1287:	learn: 0.8322464	test: 0.7997818	best: 0.7997818 (1287)	total: 31s	remaining: 27m 32s
1288:	learn: 0.8321903	test: 0.7997675	best: 0.7997675 (1288)	total: 31s	remaining: 27m 32s
1289:	learn: 0.8321369	test: 0.7997564	best: 0.7997564 (1289)	total: 31s	remaining: 27m 32s
1290:	learn: 0.8320847	test: 0.7997424	best: 0.7997424 (1290)	total: 31s	remaining: 27m 32s
1291:	learn: 0.8320297	test: 0.7997273	best: 0.7997273 (1291)	total: 31.1s	remaining: 27m 32s
1292:	learn: 0.8319769	test: 0.7997154	best: 0.7997154 (1292)	total: 31.1s	remaining: 27m 32s
1293:	learn: 0.8319226	test: 0.7997025	best: 0.7997025 (1293)	total: 31.1s	remaining: 27m 32s
1294:	learn: 0.8318685	test: 0.7996901	best: 0.7996901 (1294)	total: 31.1s	remaining: 27m 32s
1295:	learn: 0.8318163	test: 0.7996784	best: 0.7996784 (1295)	total:

1375:	learn: 0.8277885	test: 0.7987777	best: 0.7987777 (1375)	total: 33.1s	remaining: 27m 29s
1376:	learn: 0.8277396	test: 0.7987669	best: 0.7987669 (1376)	total: 33.1s	remaining: 27m 29s
1377:	learn: 0.8276916	test: 0.7987537	best: 0.7987537 (1377)	total: 33.1s	remaining: 27m 29s
1378:	learn: 0.8276463	test: 0.7987463	best: 0.7987463 (1378)	total: 33.1s	remaining: 27m 29s
1379:	learn: 0.8275984	test: 0.7987374	best: 0.7987374 (1379)	total: 33.2s	remaining: 27m 29s
1380:	learn: 0.8275508	test: 0.7987257	best: 0.7987257 (1380)	total: 33.2s	remaining: 27m 29s
1381:	learn: 0.8275030	test: 0.7987148	best: 0.7987148 (1381)	total: 33.2s	remaining: 27m 29s
1382:	learn: 0.8274567	test: 0.7987060	best: 0.7987060 (1382)	total: 33.2s	remaining: 27m 29s
1383:	learn: 0.8274094	test: 0.7986956	best: 0.7986956 (1383)	total: 33.3s	remaining: 27m 29s
1384:	learn: 0.8273633	test: 0.7986868	best: 0.7986868 (1384)	total: 33.3s	remaining: 27m 29s
1385:	learn: 0.8273170	test: 0.7986755	best: 0.7986755 (1385

1465:	learn: 0.8238312	test: 0.7980018	best: 0.7980018 (1465)	total: 35.2s	remaining: 27m 27s
1466:	learn: 0.8237890	test: 0.7979923	best: 0.7979923 (1466)	total: 35.3s	remaining: 27m 27s
1467:	learn: 0.8237478	test: 0.7979854	best: 0.7979854 (1467)	total: 35.3s	remaining: 27m 27s
1468:	learn: 0.8237085	test: 0.7979787	best: 0.7979787 (1468)	total: 35.3s	remaining: 27m 27s
1469:	learn: 0.8236687	test: 0.7979718	best: 0.7979718 (1469)	total: 35.3s	remaining: 27m 27s
1470:	learn: 0.8236297	test: 0.7979651	best: 0.7979651 (1470)	total: 35.4s	remaining: 27m 27s
1471:	learn: 0.8235893	test: 0.7979575	best: 0.7979575 (1471)	total: 35.4s	remaining: 27m 27s
1472:	learn: 0.8235501	test: 0.7979532	best: 0.7979532 (1472)	total: 35.4s	remaining: 27m 27s
1473:	learn: 0.8235099	test: 0.7979476	best: 0.7979476 (1473)	total: 35.4s	remaining: 27m 27s
1474:	learn: 0.8234693	test: 0.7979375	best: 0.7979375 (1474)	total: 35.5s	remaining: 27m 27s
1475:	learn: 0.8234288	test: 0.7979300	best: 0.7979300 (1475

1555:	learn: 0.8204091	test: 0.7974344	best: 0.7974344 (1555)	total: 37.4s	remaining: 27m 25s
1556:	learn: 0.8203724	test: 0.7974299	best: 0.7974299 (1556)	total: 37.4s	remaining: 27m 25s
1557:	learn: 0.8203367	test: 0.7974261	best: 0.7974261 (1557)	total: 37.4s	remaining: 27m 25s
1558:	learn: 0.8203023	test: 0.7974215	best: 0.7974215 (1558)	total: 37.5s	remaining: 27m 25s
1559:	learn: 0.8202668	test: 0.7974157	best: 0.7974157 (1559)	total: 37.5s	remaining: 27m 25s
1560:	learn: 0.8202324	test: 0.7974106	best: 0.7974106 (1560)	total: 37.5s	remaining: 27m 25s
1561:	learn: 0.8201970	test: 0.7974039	best: 0.7974039 (1561)	total: 37.5s	remaining: 27m 25s
1562:	learn: 0.8201633	test: 0.7974001	best: 0.7974001 (1562)	total: 37.6s	remaining: 27m 24s
1563:	learn: 0.8201272	test: 0.7973927	best: 0.7973927 (1563)	total: 37.6s	remaining: 27m 24s
1564:	learn: 0.8200911	test: 0.7973886	best: 0.7973886 (1564)	total: 37.6s	remaining: 27m 24s
1565:	learn: 0.8200568	test: 0.7973837	best: 0.7973837 (1565

1645:	learn: 0.8174506	test: 0.7970271	best: 0.7970271 (1645)	total: 39.6s	remaining: 27m 22s
1646:	learn: 0.8174194	test: 0.7970251	best: 0.7970251 (1646)	total: 39.6s	remaining: 27m 22s
1647:	learn: 0.8173887	test: 0.7970216	best: 0.7970216 (1647)	total: 39.6s	remaining: 27m 22s
1648:	learn: 0.8173592	test: 0.7970196	best: 0.7970196 (1648)	total: 39.6s	remaining: 27m 22s
1649:	learn: 0.8173279	test: 0.7970163	best: 0.7970163 (1649)	total: 39.6s	remaining: 27m 22s
1650:	learn: 0.8172982	test: 0.7970124	best: 0.7970124 (1650)	total: 39.7s	remaining: 27m 22s
1651:	learn: 0.8172681	test: 0.7970096	best: 0.7970096 (1651)	total: 39.7s	remaining: 27m 22s
1652:	learn: 0.8172388	test: 0.7970063	best: 0.7970063 (1652)	total: 39.7s	remaining: 27m 22s
1653:	learn: 0.8172094	test: 0.7970028	best: 0.7970028 (1653)	total: 39.7s	remaining: 27m 22s
1654:	learn: 0.8171791	test: 0.7969981	best: 0.7969981 (1654)	total: 39.8s	remaining: 27m 22s
1655:	learn: 0.8171479	test: 0.7969938	best: 0.7969938 (1655

1735:	learn: 0.8148741	test: 0.7967470	best: 0.7967470 (1735)	total: 41.7s	remaining: 27m 20s
1736:	learn: 0.8148494	test: 0.7967498	best: 0.7967470 (1735)	total: 41.7s	remaining: 27m 20s
1737:	learn: 0.8148232	test: 0.7967490	best: 0.7967470 (1735)	total: 41.8s	remaining: 27m 20s
1738:	learn: 0.8147954	test: 0.7967474	best: 0.7967470 (1735)	total: 41.8s	remaining: 27m 20s
1739:	learn: 0.8147697	test: 0.7967451	best: 0.7967451 (1739)	total: 41.8s	remaining: 27m 20s
1740:	learn: 0.8147430	test: 0.7967419	best: 0.7967419 (1740)	total: 41.8s	remaining: 27m 20s
1741:	learn: 0.8147188	test: 0.7967421	best: 0.7967419 (1740)	total: 41.9s	remaining: 27m 20s
1742:	learn: 0.8146913	test: 0.7967424	best: 0.7967419 (1740)	total: 41.9s	remaining: 27m 20s
1743:	learn: 0.8146655	test: 0.7967410	best: 0.7967410 (1743)	total: 41.9s	remaining: 27m 20s
1744:	learn: 0.8146386	test: 0.7967370	best: 0.7967370 (1744)	total: 41.9s	remaining: 27m 19s
1745:	learn: 0.8146139	test: 0.7967371	best: 0.7967370 (1744

1825:	learn: 0.8126507	test: 0.7966039	best: 0.7966039 (1825)	total: 43.9s	remaining: 27m 17s
1826:	learn: 0.8126266	test: 0.7966020	best: 0.7966020 (1826)	total: 43.9s	remaining: 27m 17s
1827:	learn: 0.8126027	test: 0.7966007	best: 0.7966007 (1827)	total: 43.9s	remaining: 27m 17s
1828:	learn: 0.8125797	test: 0.7965996	best: 0.7965996 (1828)	total: 43.9s	remaining: 27m 17s
1829:	learn: 0.8125567	test: 0.7965978	best: 0.7965978 (1829)	total: 44s	remaining: 27m 17s
1830:	learn: 0.8125328	test: 0.7965965	best: 0.7965965 (1830)	total: 44s	remaining: 27m 17s
1831:	learn: 0.8125106	test: 0.7965942	best: 0.7965942 (1831)	total: 44s	remaining: 27m 17s
1832:	learn: 0.8124871	test: 0.7965925	best: 0.7965925 (1832)	total: 44s	remaining: 27m 17s
1833:	learn: 0.8124638	test: 0.7965890	best: 0.7965890 (1833)	total: 44.1s	remaining: 27m 17s
1834:	learn: 0.8124410	test: 0.7965863	best: 0.7965863 (1834)	total: 44.1s	remaining: 27m 17s
1835:	learn: 0.8124182	test: 0.7965863	best: 0.7965863 (1835)	total:

1915:	learn: 0.8106957	test: 0.7965213	best: 0.7965213 (1915)	total: 46s	remaining: 27m 15s
1916:	learn: 0.8106744	test: 0.7965210	best: 0.7965210 (1916)	total: 46s	remaining: 27m 15s
1917:	learn: 0.8106547	test: 0.7965214	best: 0.7965210 (1916)	total: 46.1s	remaining: 27m 15s
1918:	learn: 0.8106342	test: 0.7965216	best: 0.7965210 (1916)	total: 46.1s	remaining: 27m 15s
1919:	learn: 0.8106148	test: 0.7965193	best: 0.7965193 (1919)	total: 46.1s	remaining: 27m 15s
1920:	learn: 0.8105954	test: 0.7965155	best: 0.7965155 (1920)	total: 46.1s	remaining: 27m 15s
1921:	learn: 0.8105748	test: 0.7965156	best: 0.7965155 (1920)	total: 46.2s	remaining: 27m 15s
1922:	learn: 0.8105555	test: 0.7965143	best: 0.7965143 (1922)	total: 46.2s	remaining: 27m 14s
1923:	learn: 0.8105357	test: 0.7965141	best: 0.7965141 (1923)	total: 46.2s	remaining: 27m 14s
1924:	learn: 0.8105152	test: 0.7965150	best: 0.7965141 (1923)	total: 46.2s	remaining: 27m 14s
1925:	learn: 0.8104964	test: 0.7965130	best: 0.7965130 (1925)	to

2005:	learn: 0.8089953	test: 0.7965023	best: 0.7964947 (1983)	total: 48.2s	remaining: 27m 12s
2006:	learn: 0.8089781	test: 0.7965026	best: 0.7964947 (1983)	total: 48.2s	remaining: 27m 12s
2007:	learn: 0.8089594	test: 0.7965027	best: 0.7964947 (1983)	total: 48.2s	remaining: 27m 12s
2008:	learn: 0.8089418	test: 0.7965035	best: 0.7964947 (1983)	total: 48.2s	remaining: 27m 12s
2009:	learn: 0.8089249	test: 0.7965038	best: 0.7964947 (1983)	total: 48.3s	remaining: 27m 12s
2010:	learn: 0.8089078	test: 0.7965038	best: 0.7964947 (1983)	total: 48.3s	remaining: 27m 12s
2011:	learn: 0.8088895	test: 0.7965049	best: 0.7964947 (1983)	total: 48.3s	remaining: 27m 12s
2012:	learn: 0.8088715	test: 0.7965055	best: 0.7964947 (1983)	total: 48.3s	remaining: 27m 12s
2013:	learn: 0.8088540	test: 0.7965049	best: 0.7964947 (1983)	total: 48.4s	remaining: 27m 12s
bestTest = 0.7964947413
bestIteration = 1983
Shrink model to first 1984 iterations.


<catboost.core.CatBoostRegressor at 0x7fcd577790b8>

In [148]:
#training.to_pickle("pickled/training")
#training = pd.read_pickle("pickled/training")

pickle.dump(cb_model, open( "pickled/cb_model", "wb"), protocol=4)

#cb_model = pickle.load( open( "pickled/cb_model", "rb" ) )

In [265]:
scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[cb_features[i]] = score

In [267]:
cb_features = [item[0] for item in scores.items() if item[1] > 4]

In [266]:
sorted(scores.items(), key=lambda x: x[1])[::-1]

[('shop_item_units', 35.464789160537016),
 ('item_block_units_lag_1', 26.120678482795334),
 ('shop_item_share_of_total_units', 19.74595296711794),
 ('item_mean_price_block_lag_2', 13.5054245493137),
 ('item_first_day', 5.163154840236002)]

In [151]:
xg_features = [
       'month',  'item_first_block', 'item_last_block',
       'item_units', 'item_mean_units_block', 'item_day_units',
       'item_mean_units_day', 'item_max_units_block',
       'item_min_units_block', 'item_max_units_day', 'item_min_units_day',
       'item_turnover', 'item_mean_turnover_block', 'item_day_turnover',
       'item_mean_turnover_day', 'item_max_turnover_block',
       'item_min_turnover_block', 'item_max_turnover_day',
       'item_min_turnover_day', 'item_days_of_activity',
       'item_blocks_of_activity', 'item_days_since_start',
       'item_mean_day_between_activity',
       'item_longest_stretch_days_without_activity',
       'item_longest_stretch_blocks_without_activity',
       'item_longest_stretch_block_with_activity',
       'item_number_of_consecutive_days_with_activity',
       'item_days_between_start_and_first_activity',
       'item_blocks_between_start_and_first_activity', 'item_first_day',
       'item_last_day', 'item_mean_price',
       'item_min_price', 'item_max_price', 'item_number_different_prices',
       'item_price_amplitude', 'item_deviation_mean_category_price',
       'item_share_of_total_units', 'item_share_of_total_turnover',
       'item_share_of_category_units', 'item_share_of_category_turnover',
       'item_first_two_blocks_units', 'item_last_two_blocks_units',
       'item_fluctuation_units_first_last_blocks', 
       'category_units',
       'category_mean_units_block', 'category_day_units',
       'category_mean_units_day', 'category_max_units_block',
       'category_min_units_block', 'category_max_units_day',
       'category_min_units_day', 'category_turnover',
       'category_mean_turnover_block', 'category_day_turnover',
       'category_mean_turnover_day', 'category_max_turnover_block',
       'category_min_turnover_block', 'category_max_turnover_day',
       'category_min_turnover_day', 'category_mean_price',
       'category_min_price', 'category_max_price',
       'category_share_of_total_units',
       'category_share_of_total_turnover',
       'shop_units', 'shop_mean_units_block', 'shop_day_units',
       'shop_mean_units_day', 'shop_max_units_block',
       'shop_min_units_block', 'shop_max_units_day', 'shop_min_units_day',
       'shop_first_two_blocks_units', 'shop_last_two_blocks_units',
       'shop_fluctuation_units_first_last_blocks', 
       'shop_category_units', 'shop_category_mean_units_block',
       'shop_category_day_units', 'shop_category_mean_units_day',
       'shop_category_max_units_block', 'shop_category_min_units_block',
       'shop_category_max_units_day', 'shop_category_min_units_day',
       'shop_category_turnover', 'shop_category_mean_turnover_block',
       'shop_category_day_turnover', 'shop_category_mean_turnover_day',
       'shop_category_max_turnover_block',
       'shop_category_min_turnover_block',
       'shop_category_max_turnover_day', 'shop_category_min_turnover_day',
       'shop_category_mean_price', 
       'shop_item_units', 'shop_item_day_units',
       'shop_item_mean_units_day', 'shop_item_max_units_day',
       'shop_item_min_units_day', 'shop_item_turnover',
       'shop_item_day_turnover', 'shop_item_mean_turnover_day',
       'shop_item_max_turnover_day', 'shop_item_min_turnover_day',
       'shop_item_mean_price', 'shop_item_share_of_total_units',
       'shop_item_share_of_total_turnover',
       'shop_item_share_of_shop_units',
       'shop_item_share_of_shop_turnover', 'item_block_units_lag_1',
       'item_block_turnover_lag_1', 'item_mean_price_block_lag_1',
       'item_block_units_lag_2', 'item_block_turnover_lag_2',
       'item_mean_price_block_lag_2', 'item_block_units_lag_3',
       'item_block_turnover_lag_3', 'item_mean_price_block_lag_3',
       'category_block_units_lag_1', 'category_block_turnover_lag_1',
       'category_mean_price_block_lag_1', 'category_block_units_lag_2',
       'category_block_turnover_lag_2', 'category_mean_price_block_lag_2',
       'category_block_units_lag_3', 'category_block_turnover_lag_3',
       'category_mean_price_block_lag_3', 'shop_block_units_lag_1',
       'shop_block_turnover_lag_1', 'shop_mean_price_block_lag_1',
       'shop_block_units_lag_2', 'shop_block_turnover_lag_2',
       'shop_mean_price_block_lag_2', 'shop_block_units_lag_3',
       'shop_block_turnover_lag_3', 'shop_mean_price_block_lag_3',
       'shop_category_block_units_lag_1', 'shop_category_turnover_lag_1',
       'shop_category_mean_price_block_lag_1',
       'shop_category_block_units_lag_2', 'shop_category_turnover_lag_2',
       'shop_category_mean_price_block_lag_2',
       'shop_category_block_units_lag_3', 'shop_category_turnover_lag_3',
       'shop_category_mean_price_block_lag_3', '1', '2', '3', '4',
       '5', '6', '7', '8', '9', '10', '11', '12']


In [137]:

gc.collect()
params =   {
    'objective' : 'gpu:reg:linear',
    'tree_method':'gpu_hist',
    #'gpu_id': 0,
    'learning_rate': 0.001, 
    #'gamma' : 0.3, 
    #'min_child_weight' : 3,
    #'nthread' : 16,
    #'max_depth' : 30,
    #'subsample' : 0.9, 
    #'colsample_bytree' : 0.5, 
    'seed':42, 
    'eval_metric' : "rmse",
    'num_boost_round' : 70000,
    #'n_estimators':999,
    #'max_leaves': 300
}


tr_data = xgb.DMatrix(x_train[xg_features], y_train)
va_data = xgb.DMatrix(x_val[xg_features], y_val)


watchlist = [(tr_data, 'train'), (va_data, 'valid')]

xg_model = xgb.train(params, tr_data, 70000, watchlist, maximize=False, early_stopping_rounds = 30, verbose_eval=True)

[15:58:39] /workspace/src/objective/regression_obj.cu:153: gpu:reg:linear is now deprecated, use reg:linear instead.
[0]	train-rmse:1.08288	valid-rmse:0.969527
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 30 rounds.
[1]	train-rmse:1.08213	valid-rmse:0.968971
[2]	train-rmse:1.08138	valid-rmse:0.968417
[3]	train-rmse:1.08063	valid-rmse:0.96786
[4]	train-rmse:1.07988	valid-rmse:0.967304
[5]	train-rmse:1.07913	valid-rmse:0.966751
[6]	train-rmse:1.07838	valid-rmse:0.966198
[7]	train-rmse:1.07764	valid-rmse:0.965643
[8]	train-rmse:1.07689	valid-rmse:0.965094
[9]	train-rmse:1.07615	valid-rmse:0.964541
[10]	train-rmse:1.07541	valid-rmse:0.963991
[11]	train-rmse:1.07466	valid-rmse:0.963439
[12]	train-rmse:1.07392	valid-rmse:0.962891
[13]	train-rmse:1.07318	valid-rmse:0.962342
[14]	train-rmse:1.07244	valid-rmse:0.961794
[15]	train-rmse:1.07171	valid-rmse:0.96125
[16]	train-rmse:1.07097	valid-rmse:0.960704
[1

[179]	train-rmse:0.962266	valid-rmse:0.879244
[180]	train-rmse:0.961668	valid-rmse:0.878801
[181]	train-rmse:0.961066	valid-rmse:0.87833
[182]	train-rmse:0.96047	valid-rmse:0.877883
[183]	train-rmse:0.959869	valid-rmse:0.877426
[184]	train-rmse:0.959272	valid-rmse:0.876981
[185]	train-rmse:0.95868	valid-rmse:0.876528
[186]	train-rmse:0.958082	valid-rmse:0.876064
[187]	train-rmse:0.957486	valid-rmse:0.875634
[188]	train-rmse:0.956894	valid-rmse:0.875189
[189]	train-rmse:0.956297	valid-rmse:0.874727
[190]	train-rmse:0.955704	valid-rmse:0.8743
[191]	train-rmse:0.955109	valid-rmse:0.873839
[192]	train-rmse:0.954517	valid-rmse:0.873412
[193]	train-rmse:0.953926	valid-rmse:0.872987
[194]	train-rmse:0.953342	valid-rmse:0.872534
[195]	train-rmse:0.952749	valid-rmse:0.872074
[196]	train-rmse:0.95216	valid-rmse:0.871645
[197]	train-rmse:0.951569	valid-rmse:0.871183
[198]	train-rmse:0.950983	valid-rmse:0.870756
[199]	train-rmse:0.950393	valid-rmse:0.870301
[200]	train-rmse:0.949807	valid-rmse:0.8

[358]	train-rmse:0.865726	valid-rmse:0.807271
[359]	train-rmse:0.865243	valid-rmse:0.806918
[360]	train-rmse:0.864759	valid-rmse:0.806548
[361]	train-rmse:0.864278	valid-rmse:0.806195
[362]	train-rmse:0.8638	valid-rmse:0.805831
[363]	train-rmse:0.863319	valid-rmse:0.805482
[364]	train-rmse:0.862836	valid-rmse:0.80512
[365]	train-rmse:0.862359	valid-rmse:0.804753
[366]	train-rmse:0.861878	valid-rmse:0.804386
[367]	train-rmse:0.8614	valid-rmse:0.804016
[368]	train-rmse:0.860926	valid-rmse:0.803661
[369]	train-rmse:0.860449	valid-rmse:0.803306
[370]	train-rmse:0.859975	valid-rmse:0.802936
[371]	train-rmse:0.859504	valid-rmse:0.802599
[372]	train-rmse:0.859032	valid-rmse:0.802227
[373]	train-rmse:0.858559	valid-rmse:0.801857
[374]	train-rmse:0.858079	valid-rmse:0.801507
[375]	train-rmse:0.857606	valid-rmse:0.801162
[376]	train-rmse:0.857131	valid-rmse:0.80082
[377]	train-rmse:0.856658	valid-rmse:0.80048
[378]	train-rmse:0.85619	valid-rmse:0.800121
[379]	train-rmse:0.855718	valid-rmse:0.799

[538]	train-rmse:0.788185	valid-rmse:0.749968
[539]	train-rmse:0.787804	valid-rmse:0.749703
[540]	train-rmse:0.787422	valid-rmse:0.749379
[541]	train-rmse:0.787043	valid-rmse:0.749086
[542]	train-rmse:0.786664	valid-rmse:0.74882
[543]	train-rmse:0.78628	valid-rmse:0.748527
[544]	train-rmse:0.785901	valid-rmse:0.748251
[545]	train-rmse:0.78552	valid-rmse:0.74798
[546]	train-rmse:0.785144	valid-rmse:0.747687
[547]	train-rmse:0.784766	valid-rmse:0.747414
[548]	train-rmse:0.784387	valid-rmse:0.747134
[549]	train-rmse:0.78401	valid-rmse:0.746861
[550]	train-rmse:0.783635	valid-rmse:0.746571
[551]	train-rmse:0.78326	valid-rmse:0.746301
[552]	train-rmse:0.782883	valid-rmse:0.746033
[553]	train-rmse:0.782503	valid-rmse:0.745743
[554]	train-rmse:0.782129	valid-rmse:0.745472
[555]	train-rmse:0.781755	valid-rmse:0.74519
[556]	train-rmse:0.781383	valid-rmse:0.744933
[557]	train-rmse:0.78101	valid-rmse:0.744651
[558]	train-rmse:0.780633	valid-rmse:0.744378
[559]	train-rmse:0.780262	valid-rmse:0.744

[718]	train-rmse:0.726163	valid-rmse:0.703236
[719]	train-rmse:0.725847	valid-rmse:0.702989
[720]	train-rmse:0.72554	valid-rmse:0.702763
[721]	train-rmse:0.72524	valid-rmse:0.702543
[722]	train-rmse:0.724931	valid-rmse:0.7023
[723]	train-rmse:0.72463	valid-rmse:0.702065
[724]	train-rmse:0.724314	valid-rmse:0.701828
[725]	train-rmse:0.724009	valid-rmse:0.70155
[726]	train-rmse:0.723694	valid-rmse:0.701314
[727]	train-rmse:0.723396	valid-rmse:0.701096
[728]	train-rmse:0.723095	valid-rmse:0.70086
[729]	train-rmse:0.72278	valid-rmse:0.700625
[730]	train-rmse:0.722483	valid-rmse:0.700413
[731]	train-rmse:0.722171	valid-rmse:0.700173
[732]	train-rmse:0.721868	valid-rmse:0.699896
[733]	train-rmse:0.72157	valid-rmse:0.699669
[734]	train-rmse:0.721259	valid-rmse:0.699426
[735]	train-rmse:0.720959	valid-rmse:0.69917
[736]	train-rmse:0.720653	valid-rmse:0.698938
[737]	train-rmse:0.720343	valid-rmse:0.698703
[738]	train-rmse:0.720044	valid-rmse:0.698452
[739]	train-rmse:0.719735	valid-rmse:0.69820

[897]	train-rmse:0.676858	valid-rmse:0.664298
[898]	train-rmse:0.676613	valid-rmse:0.664108
[899]	train-rmse:0.676371	valid-rmse:0.663925
[900]	train-rmse:0.676118	valid-rmse:0.663718
[901]	train-rmse:0.675881	valid-rmse:0.663535
[902]	train-rmse:0.675644	valid-rmse:0.663356
[903]	train-rmse:0.675396	valid-rmse:0.663135
[904]	train-rmse:0.675146	valid-rmse:0.662941
[905]	train-rmse:0.674905	valid-rmse:0.662747
[906]	train-rmse:0.674657	valid-rmse:0.662556
[907]	train-rmse:0.674408	valid-rmse:0.662359
[908]	train-rmse:0.674172	valid-rmse:0.66218
[909]	train-rmse:0.673928	valid-rmse:0.661993
[910]	train-rmse:0.67368	valid-rmse:0.661769
[911]	train-rmse:0.673441	valid-rmse:0.661586
[912]	train-rmse:0.673209	valid-rmse:0.661408
[913]	train-rmse:0.672968	valid-rmse:0.661232
[914]	train-rmse:0.672718	valid-rmse:0.661032
[915]	train-rmse:0.672483	valid-rmse:0.66084
[916]	train-rmse:0.672235	valid-rmse:0.660603
[917]	train-rmse:0.671987	valid-rmse:0.660363
[918]	train-rmse:0.671754	valid-rmse:

[1075]	train-rmse:0.637697	valid-rmse:0.632318
[1076]	train-rmse:0.637501	valid-rmse:0.632161
[1077]	train-rmse:0.637304	valid-rmse:0.632001
[1078]	train-rmse:0.637101	valid-rmse:0.6318
[1079]	train-rmse:0.636914	valid-rmse:0.631655
[1080]	train-rmse:0.636723	valid-rmse:0.631477
[1081]	train-rmse:0.636533	valid-rmse:0.631329
[1082]	train-rmse:0.636337	valid-rmse:0.631178
[1083]	train-rmse:0.636151	valid-rmse:0.631034
[1084]	train-rmse:0.63595	valid-rmse:0.630829
[1085]	train-rmse:0.635748	valid-rmse:0.630635
[1086]	train-rmse:0.635558	valid-rmse:0.630459
[1087]	train-rmse:0.635364	valid-rmse:0.63027
[1088]	train-rmse:0.635173	valid-rmse:0.630114
[1089]	train-rmse:0.634975	valid-rmse:0.629942
[1090]	train-rmse:0.634779	valid-rmse:0.629792
[1091]	train-rmse:0.63459	valid-rmse:0.629617
[1092]	train-rmse:0.634397	valid-rmse:0.629457
[1093]	train-rmse:0.634215	valid-rmse:0.629304
[1094]	train-rmse:0.634021	valid-rmse:0.62916
[1095]	train-rmse:0.63383	valid-rmse:0.629011
[1096]	train-rmse:0.

[1251]	train-rmse:0.606872	valid-rmse:0.60521
[1252]	train-rmse:0.606718	valid-rmse:0.605047
[1253]	train-rmse:0.606561	valid-rmse:0.604912
[1254]	train-rmse:0.606402	valid-rmse:0.604792
[1255]	train-rmse:0.606242	valid-rmse:0.604623
[1256]	train-rmse:0.60609	valid-rmse:0.604485
[1257]	train-rmse:0.605931	valid-rmse:0.604362
[1258]	train-rmse:0.60578	valid-rmse:0.604222
[1259]	train-rmse:0.605622	valid-rmse:0.6041
[1260]	train-rmse:0.60547	valid-rmse:0.603939
[1261]	train-rmse:0.605319	valid-rmse:0.603827
[1262]	train-rmse:0.605158	valid-rmse:0.603657
[1263]	train-rmse:0.604998	valid-rmse:0.603514
[1264]	train-rmse:0.604839	valid-rmse:0.603366
[1265]	train-rmse:0.604688	valid-rmse:0.603205
[1266]	train-rmse:0.604534	valid-rmse:0.603059
[1267]	train-rmse:0.604379	valid-rmse:0.602927
[1268]	train-rmse:0.60423	valid-rmse:0.60279
[1269]	train-rmse:0.604072	valid-rmse:0.602648
[1270]	train-rmse:0.603923	valid-rmse:0.602528
[1271]	train-rmse:0.603766	valid-rmse:0.60238
[1272]	train-rmse:0.60

[1427]	train-rmse:0.581978	valid-rmse:0.583249
[1428]	train-rmse:0.581849	valid-rmse:0.583147
[1429]	train-rmse:0.581721	valid-rmse:0.583016
[1430]	train-rmse:0.581596	valid-rmse:0.582907
[1431]	train-rmse:0.581471	valid-rmse:0.582801
[1432]	train-rmse:0.581341	valid-rmse:0.582683
[1433]	train-rmse:0.581215	valid-rmse:0.582558
[1434]	train-rmse:0.58109	valid-rmse:0.582439
[1435]	train-rmse:0.580959	valid-rmse:0.582325
[1436]	train-rmse:0.580832	valid-rmse:0.582204
[1437]	train-rmse:0.580706	valid-rmse:0.5821
[1438]	train-rmse:0.58058	valid-rmse:0.58197
[1439]	train-rmse:0.580458	valid-rmse:0.581874
[1440]	train-rmse:0.580328	valid-rmse:0.581753
[1441]	train-rmse:0.580202	valid-rmse:0.581636
[1442]	train-rmse:0.580079	valid-rmse:0.581537
[1443]	train-rmse:0.579953	valid-rmse:0.58142
[1444]	train-rmse:0.579831	valid-rmse:0.581313
[1445]	train-rmse:0.579705	valid-rmse:0.581186
[1446]	train-rmse:0.57958	valid-rmse:0.581069
[1447]	train-rmse:0.579451	valid-rmse:0.580948
[1448]	train-rmse:0.

[1603]	train-rmse:0.561588	valid-rmse:0.564015
[1604]	train-rmse:0.561487	valid-rmse:0.563914
[1605]	train-rmse:0.561378	valid-rmse:0.563805
[1606]	train-rmse:0.561273	valid-rmse:0.563715
[1607]	train-rmse:0.561173	valid-rmse:0.563627
[1608]	train-rmse:0.561065	valid-rmse:0.563518
[1609]	train-rmse:0.560966	valid-rmse:0.563417
[1610]	train-rmse:0.560859	valid-rmse:0.563327
[1611]	train-rmse:0.560755	valid-rmse:0.563237
[1612]	train-rmse:0.560655	valid-rmse:0.56314
[1613]	train-rmse:0.560556	valid-rmse:0.563051
[1614]	train-rmse:0.560447	valid-rmse:0.562944
[1615]	train-rmse:0.560344	valid-rmse:0.562854
[1616]	train-rmse:0.560246	valid-rmse:0.562771
[1617]	train-rmse:0.560142	valid-rmse:0.562671
[1618]	train-rmse:0.560035	valid-rmse:0.562574
[1619]	train-rmse:0.559933	valid-rmse:0.562483
[1620]	train-rmse:0.559835	valid-rmse:0.562396
[1621]	train-rmse:0.559728	valid-rmse:0.56229
[1622]	train-rmse:0.55963	valid-rmse:0.562196
[1623]	train-rmse:0.559528	valid-rmse:0.562072
[1624]	train-rms

[1778]	train-rmse:0.54511	valid-rmse:0.547725
[1779]	train-rmse:0.545025	valid-rmse:0.547642
[1780]	train-rmse:0.544943	valid-rmse:0.547549
[1781]	train-rmse:0.544855	valid-rmse:0.547462
[1782]	train-rmse:0.54477	valid-rmse:0.54737
[1783]	train-rmse:0.544683	valid-rmse:0.547265
[1784]	train-rmse:0.544602	valid-rmse:0.547184
[1785]	train-rmse:0.544517	valid-rmse:0.547095
[1786]	train-rmse:0.544432	valid-rmse:0.547002
[1787]	train-rmse:0.544346	valid-rmse:0.546914
[1788]	train-rmse:0.544262	valid-rmse:0.546833
[1789]	train-rmse:0.544185	valid-rmse:0.546765
[1790]	train-rmse:0.544101	valid-rmse:0.546665
[1791]	train-rmse:0.544017	valid-rmse:0.546585
[1792]	train-rmse:0.543933	valid-rmse:0.546492
[1793]	train-rmse:0.543843	valid-rmse:0.546403
[1794]	train-rmse:0.543759	valid-rmse:0.546316
[1795]	train-rmse:0.543675	valid-rmse:0.546226
[1796]	train-rmse:0.543589	valid-rmse:0.546121
[1797]	train-rmse:0.543505	valid-rmse:0.54604
[1798]	train-rmse:0.543422	valid-rmse:0.545957
[1799]	train-rmse

[1954]	train-rmse:0.531595	valid-rmse:0.533408
[1955]	train-rmse:0.531523	valid-rmse:0.533334
[1956]	train-rmse:0.531448	valid-rmse:0.533244
[1957]	train-rmse:0.531376	valid-rmse:0.533151
[1958]	train-rmse:0.531309	valid-rmse:0.533075
[1959]	train-rmse:0.531238	valid-rmse:0.533001
[1960]	train-rmse:0.531169	valid-rmse:0.532921
[1961]	train-rmse:0.531102	valid-rmse:0.532855
[1962]	train-rmse:0.531034	valid-rmse:0.532791
[1963]	train-rmse:0.530963	valid-rmse:0.532694
[1964]	train-rmse:0.530897	valid-rmse:0.532622
[1965]	train-rmse:0.53083	valid-rmse:0.532557
[1966]	train-rmse:0.530761	valid-rmse:0.532479
[1967]	train-rmse:0.530688	valid-rmse:0.532398
[1968]	train-rmse:0.530621	valid-rmse:0.53233
[1969]	train-rmse:0.530549	valid-rmse:0.532257
[1970]	train-rmse:0.530483	valid-rmse:0.532187
[1971]	train-rmse:0.530413	valid-rmse:0.53212
[1972]	train-rmse:0.530342	valid-rmse:0.532035
[1973]	train-rmse:0.530276	valid-rmse:0.531964
[1974]	train-rmse:0.530207	valid-rmse:0.531895
[1975]	train-rms

KeyboardInterrupt: 

In [20]:
pickle.dump(xg_model, open( "pickled/xg_model", "wb"), protocol=4)

#xg_model = pickle.load( open( "pickled/xg_model", "rb" ) )

In [21]:
lg_features = best_cb_features


gc.collect()
lgtrain = lgbm.Dataset(x_train[lg_features], label=y_train)
lgval = lgbm.Dataset(x_val[lg_features], label=y_val)



#[0.00542047893814942, 29, 24, 0.39949465609514856, 1, 0.67943500, 10]
params = {
        "num_threads": 16,
        "device": "gpu",
        "verbosity": -1,
        #"zero_as_missing": "true",
        "boosting":'gbdt',
        "objective" : "regression",
        "metric" : "rmse",
        "seed": 42,
        #"max_bin": 10,#default 255
        #"num_leaves": 10, #default 31
        #"bagging_fraction": 0.7,
        #"bagging_freq": 1,
        #"min_data_in_leaf": 50000,
        #"feature_fraction": 0.7,
        #"lambda_l2": 3,
        #"max_depth": 2,
        #"min_gain_to_split": 10,
        "learning_rate" : 0.001,
        #"histogram_pool_size": 1000,
        #"categorical_column": [0,1,2,3,4]
}

evals_result = {}
model_lgb = lgbm.train(params, lgtrain, 20000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=10, 
                      evals_result=evals_result)



Training until validation scores don't improve for 100 rounds.
[10]	valid_0's rmse: 0.900989
[20]	valid_0's rmse: 0.89913
[30]	valid_0's rmse: 0.897319
[40]	valid_0's rmse: 0.895552
[50]	valid_0's rmse: 0.893811
[60]	valid_0's rmse: 0.892121
[70]	valid_0's rmse: 0.890454
[80]	valid_0's rmse: 0.888839
[90]	valid_0's rmse: 0.887264
[100]	valid_0's rmse: 0.885691
[110]	valid_0's rmse: 0.884163
[120]	valid_0's rmse: 0.882629
[130]	valid_0's rmse: 0.88112
[140]	valid_0's rmse: 0.87962
[150]	valid_0's rmse: 0.87816
[160]	valid_0's rmse: 0.876703
[170]	valid_0's rmse: 0.875295
[180]	valid_0's rmse: 0.873887
[190]	valid_0's rmse: 0.872521
[200]	valid_0's rmse: 0.871154
[210]	valid_0's rmse: 0.869823
[220]	valid_0's rmse: 0.868482
[230]	valid_0's rmse: 0.867166
[240]	valid_0's rmse: 0.865872
[250]	valid_0's rmse: 0.864607
[260]	valid_0's rmse: 0.863334
[270]	valid_0's rmse: 0.862117
[280]	valid_0's rmse: 0.860869
[290]	valid_0's rmse: 0.859629
[300]	valid_0's rmse: 0.858423
[310]	valid_0's rmse

[2600]	valid_0's rmse: 0.777565
[2610]	valid_0's rmse: 0.777435
[2620]	valid_0's rmse: 0.777323
[2630]	valid_0's rmse: 0.777199
[2640]	valid_0's rmse: 0.777106
[2650]	valid_0's rmse: 0.776995
[2660]	valid_0's rmse: 0.776876
[2670]	valid_0's rmse: 0.776831
[2680]	valid_0's rmse: 0.776716
[2690]	valid_0's rmse: 0.776609
[2700]	valid_0's rmse: 0.776497
[2710]	valid_0's rmse: 0.776411
[2720]	valid_0's rmse: 0.776331
[2730]	valid_0's rmse: 0.776217
[2740]	valid_0's rmse: 0.776096
[2750]	valid_0's rmse: 0.775994
[2760]	valid_0's rmse: 0.775902
[2770]	valid_0's rmse: 0.775794
[2780]	valid_0's rmse: 0.77575
[2790]	valid_0's rmse: 0.775642
[2800]	valid_0's rmse: 0.77558
[2810]	valid_0's rmse: 0.775519
[2820]	valid_0's rmse: 0.775566
[2830]	valid_0's rmse: 0.775547
[2840]	valid_0's rmse: 0.775493
[2850]	valid_0's rmse: 0.775443
[2860]	valid_0's rmse: 0.775468
[2870]	valid_0's rmse: 0.775472
[2880]	valid_0's rmse: 0.775527
[2890]	valid_0's rmse: 0.775497
[2900]	valid_0's rmse: 0.775519
[2910]	val

In [22]:
pickle.dump(model_lgb, open( "pickled/model_lgb", "wb"), protocol=4)

#model_lgb = pickle.load( open( "pickled/model_lgb", "rb" ) )

In [47]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
#x_train[cb_features].sample(10)
training.dtypes

item_id                                     int16  
shop_id                                     uint8  
date_block_num                              uint8  
item_category_id                            uint8  
month                                       uint8  
year                                        uint16 
item_first_block                            uint8  
item_last_block                             uint8  
is_first_two_blocks                         object 
is_last_two_blocks                          object 
item_units                                  float32
item_mean_units_block                       float32
item_day_units                              int16  
item_mean_units_day                         float32
item_max_units_block                        int16  
item_min_units_block                        int16  
item_max_units_day                          int16  
item_min_units_day                          int8   
item_turnover                               int32  
item_mean_tu

In [230]:
test = test.set_index('item_id').join(items.set_index('item_id'))
test['date_block_num'] = 34
test['month'] = 11

In [231]:
transactions_items = pd.read_pickle("pickled/transactions_items")
transactions_items_dtypes = transactions_items.dtypes
test = pd.merge(test, transactions_items[transactions_items_columns].drop_duplicates('item_id'), on=['item_id'], how='left', copy=False)

del transactions_items
fillnas(test, transactions_items_columns, transactions_items_dtypes)
downcast(test, transactions_items_columns, transactions_items_dtypes)
gc.collect()

158

In [None]:
test.head()

In [232]:
transactions_items_blocks = pd.read_pickle("pickled/transactions_items_blocks")
transactions_items_blocks_dtypes = transactions_items_blocks.dtypes

test = pd.merge(test, transactions_items_blocks[transactions_items_blocks_columns]\
                    .drop_duplicates(['item_id', 'date_block_num']), on=['item_id','date_block_num'], how='left', copy=False)

del transactions_items_blocks
test.fillna(0, inplace=True)
downcast(test, transactions_items_blocks_columns, transactions_items_blocks_dtypes)
gc.collect()

21

In [233]:
transactions_categories = pd.read_pickle("pickled/transactions_categories")
transactions_categories_dtypes = transactions_categories.dtypes
test = pd.merge(test, transactions_categories[transactions_categories_columns]\
                    .drop_duplicates('item_category_id'), on=['item_category_id'], how='left', copy=False)

del transactions_categories
fillnas(test, transactions_categories_columns, transactions_categories_dtypes)
downcast(test, transactions_categories_columns, transactions_categories_dtypes)
gc.collect()

21

In [234]:
transactions_categories_blocks = pd.read_pickle("pickled/transactions_categories_blocks")
transactions_categories_blocks_dtypes = transactions_categories_blocks.dtypes
test = pd.merge(test, transactions_categories_blocks[transactions_categories_blocks_columns]\
                    .drop_duplicates(['item_category_id', 'date_block_num']), on=['item_category_id', 'date_block_num'], how='left', copy=False)

del transactions_categories_blocks
fillnas(test, transactions_categories_blocks_columns, transactions_categories_blocks_dtypes)
downcast(test, transactions_categories_blocks_columns, transactions_categories_blocks_dtypes)
gc.collect()

21

In [235]:
transactions_shops = pd.read_pickle("pickled/transactions_shops")
transactions_shops_dtypes = transactions_shops.dtypes
test = pd.merge(test, transactions_shops[transactions_shops_columns]\
                    .drop_duplicates('shop_id'), on=['shop_id'], how='left', copy=False)

del transactions_shops
fillnas(test, transactions_shops_columns, transactions_shops_dtypes)
downcast(test, transactions_shops_columns, transactions_shops_dtypes)
gc.collect()

21

In [236]:
transactions_shops_blocks = pd.read_pickle("pickled/transactions_shops_blocks")
transactions_shops_blocks_dtypes = transactions_shops_blocks.dtypes
test = pd.merge(test, transactions_shops_blocks[transactions_shops_blocks_columns]\
                    .drop_duplicates(['shop_id', 'date_block_num']), on=['shop_id', 'date_block_num'], how='left', copy=False)

del transactions_shops_blocks
fillnas(test, transactions_shops_blocks_columns, transactions_shops_blocks_dtypes)
downcast(test, transactions_shops_blocks_columns, transactions_shops_blocks_dtypes)
gc.collect()

21

In [237]:
transactions_shops_categories = pd.read_pickle("pickled/transactions_shops_categories")
transactions_shops_categories_dtypes = transactions_shops_categories.dtypes
test = pd.merge(test, transactions_shops_categories[transactions_shops_categories_columns]\
                    .drop_duplicates(['shop_id','item_category_id']), on=['shop_id','item_category_id'], how='left', copy=False)

del transactions_shops_categories
fillnas(test, transactions_shops_categories_columns, transactions_shops_categories_dtypes)
downcast(test, transactions_shops_categories_columns, transactions_shops_categories_dtypes)
gc.collect()

21

In [238]:
transactions_shops_categories_blocks = pd.read_pickle("pickled/transactions_shops_categories_blocks")
transactions_shops_categories_blocks_dtypes = transactions_shops_categories_blocks.dtypes
test = pd.merge(test, transactions_shops_categories_blocks[transactions_shops_categories_blocks_columns]\
                    .drop_duplicates(['shop_id','item_category_id','date_block_num']),\
                    on=['shop_id','item_category_id','date_block_num'], how='left', copy=False)

del transactions_shops_categories_blocks
fillnas(test, transactions_shops_categories_blocks_columns, transactions_shops_categories_blocks_dtypes)
downcast(test, transactions_shops_categories_blocks_columns, transactions_shops_categories_blocks_dtypes)
gc.collect()

21

In [239]:
transactions_shops_items = pd.read_pickle("pickled/transactions_shops_items")
transactions_shops_items_dtypes = transactions_shops_items.dtypes
test = pd.merge(test, transactions_shops_items[transactions_shops_items_columns]\
                    .drop_duplicates(['shop_id','item_id']), on=['shop_id','item_id'], how='left', copy=False)

del transactions_shops_items
fillnas(test, transactions_shops_items_columns, transactions_shops_items_dtypes)
downcast(test, transactions_shops_items_columns, transactions_shops_items_dtypes)
gc.collect()

21

In [240]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

lag_columns = [
 'item_block_units',
 'item_block_turnover',
 'item_mean_price_block',
 ]

idx_columns = ['item_id']

test = add_lag_features(test,lag_columns,idx_columns)

1
2
3


In [242]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

lag_columns = [
 'category_block_units',
 'category_block_turnover',
 'category_mean_price_block'
 ]

idx_columns = ['item_category_id']

test = add_lag_features(test,lag_columns,idx_columns)

1
2
3


In [243]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

lag_columns = [
 'shop_block_units',
 'shop_block_turnover',
 'shop_mean_price_block',
 ]

idx_columns = ['shop_id']


test = add_lag_features(test,lag_columns,idx_columns)

1
2
3


In [None]:
#gc.collect()
#training.to_pickle("pickled/training_mid_lags")
#training = pd.read_pickle("pickled/training_mid_lags")

In [244]:
lag_columns = [
  'shop_category_block_units',
 'shop_category_turnover',
 'shop_category_mean_price_block'
 ]

idx_columns = ['shop_id','item_category_id']


test = add_lag_features(test,lag_columns,idx_columns)

1
2
3


In [245]:
test.drop(columns=['lagged_block'],inplace=True)

In [246]:
for i in range(1,13):
    test[str(i)] = False

test['11'] = True


In [269]:
cb_preds = cb_model.predict(test[cb_features])
cb_preds.clip(0,20,out=cb_preds)

array([0.10687569, 0.02176194, 0.19470051, ..., 0.13769774, 0.19470051,
       0.05970212])

In [73]:
lg_preds = model_lgb.predict(test[lg_features])
lg_preds.clip(0,20,out=lg_preds)

array([0.33054925, 0.13510253, 0.42999917, ..., 0.40670206, 0.39667443,
       0.38160782])

In [74]:

xg_preds = xg_model.predict(xgb.DMatrix(test[xg_features]))
xg_preds.clip(0,20,out=xg_preds)

array([0.37465316, 0.21515614, 0.47788906, ..., 0.4456131 , 0.40560868,
       0.45044884], dtype=float32)

In [81]:
preds = np.mean(np.array([cb_preds,lg_preds,xg_preds]),axis=0)

In [270]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds.astype(int)
#submission['item_cnt_month'] = ensemble_preds.astype(int)


submission.to_csv('submission.csv', index=False)

In [13]:
transactions.item_id.nunique()

17054

In [17]:
len(np.intersect1d(test.item_id.unique(), transactions.item_id.unique()))

4722

In [13]:
unique_test = list(set(list(zip(test.shop_id, test.item_id))))
unique_train = list(set(list(zip(transactions.shop_id, transactions.item_id))))

def tuple2key(t):
    return "%d_%d" % (t[0], t[1])

unique_test_dict = {}
for t in unique_test:
    unique_test_dict[tuple2key(t)] = 1
    
i = 0
for t in unique_train:
    if tuple2key(t) in unique_test_dict:
        i+=1
        
print("%age of shop/item combinations in test that are in train:", i*100/len(unique_test))

%age of shop/item combinations in test that are in train: 49.54575163398693


In [44]:
len(list(set(list(zip(x_val.shop_id, x_val.item_id))))) / 2

189407.5

In [None]:
x_val_not_seen = x_val.loc[0:189408,:]
x_val_not_seen_tuples = list(set(list(zip(x_val_not_seen.shop_id, x_val_not_seen.item_id))))

x_val_not_seen_dict = {}
for t in x_val_not_seen_tuples:
    x_val_not_seen_dict[tuple2key(t)] = 1
    

def hide(shop_id, item_id):
    return tuple2key((shop_id, item_id)) in x_val_not_seen_dict
 
    
x_train['hidden'] = x_train.apply(lambda row: hide(row['shop_id'], row['item_id']), axis=1)

In [20]:
x_train.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,month,year,item_first_block,item_last_block,is_first_two_blocks,is_last_two_blocks,...,3,4,5,6,7,8,9,10,11,12
0,54,0,12,40,1,2014,20,20,False,True,...,False,False,False,False,False,False,False,False,False,False
1,54,0,13,40,2,2014,20,20,False,True,...,False,False,False,False,False,False,False,False,False,False
2,54,0,14,40,3,2014,20,20,False,True,...,True,False,False,False,False,False,False,False,False,False
3,54,0,15,40,4,2014,20,20,False,True,...,False,True,False,False,False,False,False,False,False,False
4,54,0,16,40,5,2014,20,20,False,True,...,False,False,True,False,False,False,False,False,False,False
