In [117]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [118]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


In [119]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [120]:
items['item_id'] = pd.to_numeric(items['item_id'],downcast='unsigned')
items['item_category_id'] = pd.to_numeric(items['item_category_id'],downcast='unsigned')

In [121]:
sales_train['date'] = sales_train['date'].astype('category')
sales_train['date_block_num'] = pd.to_numeric(sales_train['date_block_num'],downcast='unsigned')
sales_train['shop_id'] = pd.to_numeric(sales_train['shop_id'],downcast='unsigned')
sales_train['item_price'] = sales_train['item_price'].astype('int')
sales_train['item_price'] = pd.to_numeric(sales_train['item_price'],downcast='unsigned')
sales_train['item_cnt_day'] = pd.to_numeric(sales_train['item_cnt_day'],downcast='signed')


In [122]:
transactions = sales_train
transactions[['day','month', 'year']] = transactions['date'].str.split('.', expand=True).astype(int)
transactions['day'] = pd.to_numeric(transactions['day'],downcast='unsigned')
transactions['month'] = pd.to_numeric(transactions['month'],downcast='unsigned')
transactions['year'] = pd.to_numeric(transactions['year'],downcast='unsigned')


In [123]:
transactions = transactions.set_index('item_id').join(items.set_index('item_id'))
transactions.reset_index(inplace=True)
transactions = transactions[transactions['year'] != 2013]
transactions['y'] = pd.to_numeric(transactions.groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day'].transform('sum')\
                .clip(0,20), downcast='unsigned')

In [124]:
transactions['y'].dtype

dtype('uint8')

In [125]:
len(transactions)

1668287

In [126]:
transactions['turnover'] = transactions['item_price'] * transactions['item_cnt_day']
transactions['turnover'] = pd.to_numeric(transactions['turnover'], downcast='unsigned')

In [127]:
transactions['item_first_block'] = pd.to_numeric(transactions.groupby('item_id')['date_block_num'].transform(np.min), downcast='unsigned')
transactions['item_last_block'] = pd.to_numeric(transactions.groupby('item_id')['date_block_num'].transform(np.max), downcast='unsigned')

transactions['is_first_two_blocks'] = pd.to_numeric(\
                    transactions['date_block_num'].isin([transactions['item_first_block']+1,transactions['item_first_block']+2])\
                          , downcast='unsigned')


transactions['is_last_two_blocks'] = pd.to_numeric(\
                transactions['date_block_num'].isin([transactions['item_last_block']-1,transactions['item_last_block']])\
                                 , downcast='unsigned')                    


In [128]:
number_of_items = transactions['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = transactions['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = transactions['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = transactions['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = transactions['item_cnt_day'].sum()
print("total_sales:", total_sales)
total_turnover = transactions['turnover'].sum()
print("total_turnover:", total_turnover)
average_price = transactions['item_price'].mean()
print("average_price:", average_price)

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473
total_turnover: 2181307117
average_price: 1015.4701882829513


#ITEM

-UNITS
item_units
item_block_units
item_mean_units_block
item_day_units
item_mean_units_day
item_max_units_block
item_min_units_block
item_max_units_day
item_min_units_day

-TURNOVER
item_turnover
item_block_turnover
item_mean_turnover_block
item_day_turnover
item_mean_turnover_day
item_max_turnover_block
item_min_turnover_block
item_max_turnover_day
item_min_turnover_day


-TIME
item_days_of_activity
item_blocks_of_activity
item_mean_day_between_activity
item_longest_stretch_days_without_activity
item_longest_stretch_blocks_without_activity
item_longest_stretch_block_with_activity
item_number_of_consecutive_days_with_activity
item_days_between_start_and_first_activity
item_blocks_between_start_and_first_activity
item_first_block
item_last_block
item_first_day
item_last_day
item_activity_on_all_blocks


-PRICE
item_mean_price
item_mean_price_block
item_min_price
item_max_price
item_number_different_prices
item_price_amplitude (%age min/max)
item_deviation_mean_category_price


-TREND
is_first_two_full_blocks (actually second/third to make sure we have a "full" block if this was a new release !!!!
is_last_two_blocks
item_first_two_blocks_units
item_last_two_blocks_units
item_fluctuation_units_first_last_blocks
item_first_two_blocks_mean_price
item_last_two_blocks_mean_price
item_fluctuation_price_first_last_blocks

-ENCODINGS
item_share_of_total_units
item_share_of_total_gross
item_share_of_category_units
item_share_of_category_turnover

In [196]:
gc.collect()
transactions_items = transactions.copy()
transactions_items_blocks = transactions.copy()

In [197]:
transactions_items_blocks['item_block_units'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned')    
transactions_items_blocks['item_block_turnover'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned')    
transactions_items_blocks['item_mean_price_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id', 'date_block_num'])['item_price'].transform(np.mean), downcast='float')    

In [198]:
transactions_items['item_units'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_items['item_mean_units_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform(np.mean), downcast='float') 
transactions_items['item_day_units'] = pd.to_numeric(transactions_items.groupby(['item_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_items['item_mean_units_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_units'].transform(np.mean), downcast='float') 
transactions_items['item_max_units_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform(np.max), downcast='unsigned') 
transactions_items['item_min_units_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform(np.min), downcast='unsigned') 
transactions_items['item_max_units_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_units'].transform(np.max), downcast='unsigned') 
transactions_items['item_min_units_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_units'].transform(np.min), downcast='unsigned') 

In [199]:
transactions_items['item_turnover'] = pd.to_numeric(transactions_items.groupby(['item_id'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_items['item_mean_turnover_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform(np.mean), downcast='float') 
transactions_items['item_day_turnover'] = pd.to_numeric(transactions_items.groupby(['item_id','date'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_items['item_mean_turnover_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['turnover'].transform(np.mean), downcast='float') 
transactions_items['item_max_turnover_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform(np.max), downcast='unsigned') 
transactions_items['item_min_turnover_block'] = pd.to_numeric(transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform(np.min), downcast='unsigned') 
transactions_items['item_max_turnover_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_turnover'].transform(np.max), downcast='unsigned') 
transactions_items['item_min_turnover_day'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_day_turnover'].transform(np.min), downcast='unsigned') 

In [200]:
transactions_items['item_days_of_activity'] = pd.to_numeric(transactions_items.groupby(['item_id'])['date'].transform("nunique"), downcast='unsigned') 
transactions_items['item_blocks_of_activity'] = pd.to_numeric(transactions_items.groupby(['item_id'])['date_block_num'].transform("nunique"), downcast='unsigned') 

def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

transactions_items['item_days_since_start'] = pd.to_numeric(transactions_items.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_average_days_between_sales(x))

transactions_items['item_mean_day_between_activity'] = pd.to_numeric(transactions_items['item_id'].map(average_days_between_sales), downcast='unsigned') 


def get_max_stretch_without_sales_days(days):
    days = np.unique(days)
    max_stretch = 0
    len_days = len(days)
    for index,day in enumerate(sorted(days)):
        if index == len_days - 1:
            return max_stretch
        next_day = days[index+1]
        stretch = next_day - day
        if stretch > max_stretch:
            max_stretch = stretch
            

        
max_stretch_without_sales_day = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_max_stretch_without_sales_days(x))

transactions_items['item_longest_stretch_days_without_activity'] = pd.to_numeric(transactions_items['item_id'].map(max_stretch_without_sales_day), downcast='unsigned') 

In [201]:
gc.collect()

def get_max_stretch_without_sales_block(blocks):
    blocks = np.unique(blocks)
    max_stretch = 0
    len_blocks = len(blocks)
    for index,block in enumerate(sorted(blocks)):
        if index == len_blocks - 1:
            return max_stretch
        next_block = blocks[index+1]
        stretch = next_block - block
        if stretch > max_stretch:
            max_stretch = stretch
            

        
item_longest_stretch_blocks_without_activity = transactions_items.groupby(['item_id'])['date_block_num']\
                                    .apply(list).apply(lambda x: get_max_stretch_without_sales_block(x))

transactions_items['item_longest_stretch_blocks_without_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_longest_stretch_blocks_without_activity), downcast='unsigned') 



def get_longest_stretch(following_pairs, n=1,new_n=1):
    #print("following_pairs", following_pairs, " n: ", n, " new_n: ", new_n)
    len_pairs = len(following_pairs)
    if len_pairs == 0:
        return 0
    if len_pairs == 1:
        if new_n > n:
            return new_n
        return n
    if following_pairs[1][0] == following_pairs[0][1]:
        new_n+=1
    else:
        if new_n > n:
            n=new_n
        new_n=1
    return get_longest_stretch(following_pairs[1:], n,new_n)


assert(get_longest_stretch([]) == 0)
assert(get_longest_stretch([[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[-1, 0],[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[20, 21], [25,26]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[14, 15], [15,16]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[14, 15], [15,16],[18,19] ,[22,23], [23,24],[24, 25]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5], [7, 8]]) == 1)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [15, 16]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [15, 16], [16, 17]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [14, 15], [15, 16], [16, 17]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [20, 21], [21, 22], [22,23],[23,24]]) == 4)


def get_following_pairs(pairs):
    pairs = np.unique(pairs)
    len_pairs = len(pairs)
    following = []
    for index,pair in enumerate(sorted(pairs)):
        if index == len_pairs - 1:
            return following
        next_pair = pairs[index+1]
        if next_pair == pair + 1:
            following.append([pair, next_pair])
        
assert(get_following_pairs([1,2,5,6,7,8,9,11,12,15]) == [[1, 2], [5, 6], [6, 7], [7, 8], [8, 9], [11, 12]])
assert(get_following_pairs([1,2,5,6,7,10]) == [[1, 2], [5, 6], [6, 7]])
assert(get_following_pairs([1,2,4,5,7,9,10]) == [[1, 2], [4, 5], [9,10]])
assert(get_following_pairs([1,2,4,5,7,9,10,11,12,15]) == [[1, 2], [4, 5], [9,10],[10,11],[11,12]])


item_longest_stretch_block_with_activity = transactions_items.groupby(['item_id'])['date_block_num']\
                                    .apply(list).apply(lambda x: get_longest_stretch(get_following_pairs(x)))

transactions_items['item_longest_stretch_block_with_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_longest_stretch_block_with_activity), downcast='unsigned') 


item_number_of_consecutive_days_with_activity = transactions_items.groupby(['item_id'])['item_days_since_start']\
                                    .apply(list).apply(lambda x: len(get_following_pairs(x)))
    
transactions_items['item_number_of_consecutive_days_with_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_number_of_consecutive_days_with_activity), downcast='unsigned') 

In [202]:
def get_units_between_first_and_last(units):
    return np.max(units) - np.min(units)

item_days_between_start_and_first_activity = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_units_between_first_and_last(x))
transactions_items['item_days_between_start_and_first_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_days_between_start_and_first_activity), downcast='unsigned') 

item_blocks_between_start_and_first_activity = transactions_items.groupby(['item_id'])['date_block_num'].apply(list).apply(lambda x: get_units_between_first_and_last(x))
transactions_items['item_blocks_between_start_and_first_activity'] = pd.to_numeric(transactions_items['item_id'].map(item_blocks_between_start_and_first_activity), downcast='unsigned') 

In [203]:

transactions_items['item_first_day'] = pd.to_numeric(transactions_items.groupby('item_id')['item_days_since_start'].transform(np.min), downcast='unsigned') 
transactions_items['item_last_day'] = pd.to_numeric(transactions_items.groupby('item_id')['item_days_since_start'].transform(np.max), downcast='unsigned') 

item_activity_on_all_blocks = transactions_items.groupby('item_id')['date_block_num'].nunique().apply(lambda x: x==number_of_blocks)
transactions_items['item_activity_on_all_blocks'] = transactions_items['item_id'].map(item_activity_on_all_blocks)

In [204]:
transactions_items['item_mean_price'] = pd.to_numeric(transactions_items.groupby('item_id')['item_price'].transform(np.mean), downcast='float') 
transactions_items['item_min_price'] = pd.to_numeric(transactions_items.groupby('item_id')['item_price'].transform(np.min), downcast='unsigned') 
transactions_items['item_max_price'] = pd.to_numeric(transactions_items.groupby('item_id')['item_price'].transform(np.max), downcast='unsigned') 
transactions_items['item_number_different_prices'] = pd.to_numeric(transactions_items.groupby('item_id')['item_price'].transform('nunique'), downcast='unsigned') 
transactions_items['item_price_amplitude'] = pd.to_numeric(((transactions_items['item_max_price'] - transactions_items['item_min_price'] ) / transactions_items['item_min_price']) * 100, downcast='float') 
transactions_items['category_mean_price'] = pd.to_numeric(transactions_items.groupby('item_category_id')['item_price'].transform(np.mean), downcast='float') 
transactions_items['item_deviation_mean_category_price'] =  pd.to_numeric(((transactions_items['item_mean_price'] - transactions_items['category_mean_price'] ) / transactions_items['category_mean_price']) * 100, downcast='float') 

In [205]:
item_first_two_blocks_units = transactions_items.groupby(['item_id','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
item_first_two_blocks_units = item_first_two_blocks_units[item_first_two_blocks_units['is_first_two_blocks'] == True].set_index('item_id').iloc[:,1]
transactions_items['item_first_two_blocks_units'] = pd.to_numeric(transactions_items['item_id'].map(item_first_two_blocks_units), downcast='unsigned') 

item_last_two_blocks_units = transactions_items.groupby(['item_id','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
item_last_two_blocks_units = item_last_two_blocks_units[item_last_two_blocks_units['is_last_two_blocks'] == True].set_index('item_id').iloc[:,1]
transactions_items['item_last_two_blocks_units'] = pd.to_numeric(transactions_items['item_id'].map(item_last_two_blocks_units), downcast='unsigned') 

transactions_items['item_fluctuation_units_first_last_blocks'] =  pd.to_numeric(((transactions_items['item_first_two_blocks_units'] - transactions_items['item_last_two_blocks_units'] ) / \
                                                             transactions_items['item_first_two_blocks_units']) * 100 * -1, downcast='float') 


item_first_two_blocks_mean_price = transactions_items.groupby(['item_id','is_first_two_blocks'], as_index=False)['item_price'].mean()
item_first_two_blocks_mean_price = item_first_two_blocks_mean_price[item_first_two_blocks_mean_price['is_first_two_blocks'] == True].set_index('item_id').iloc[:,1]
transactions_items['item_first_two_blocks_mean_price'] = pd.to_numeric(transactions_items['item_id'].map(item_first_two_blocks_mean_price), downcast='unsigned') 

item_last_two_blocks_mean_price = transactions_items.groupby(['item_id','is_last_two_blocks'], as_index=False)['item_price'].mean()
item_last_two_blocks_mean_price = item_last_two_blocks_mean_price[item_last_two_blocks_mean_price['is_last_two_blocks'] == True].set_index('item_id').iloc[:,1]
transactions_items['item_last_two_blocks_mean_price'] = pd.to_numeric(transactions_items['item_id'].map(item_last_two_blocks_mean_price), downcast='unsigned') 

transactions_items['item_fluctuation_price_first_last_blocks'] =  pd.to_numeric(((transactions_items['item_first_two_blocks_mean_price'] - transactions_items['item_last_two_blocks_mean_price'] ) / \
                                                             transactions_items['item_first_two_blocks_mean_price'])  * 100 * -1, downcast='float') 

In [206]:
transactions_items['item_share_of_total_units'] = pd.to_numeric(transactions_items['item_units'] * 100 / total_sales , downcast='float') 

transactions_items['item_share_of_total_turnover'] = pd.to_numeric(transactions_items['item_turnover'] * 100 / total_turnover, downcast='float') 

transactions_items['category_units'] = pd.to_numeric(transactions_items.groupby('item_category_id')['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_items['item_share_of_category_units'] = pd.to_numeric(transactions_items['item_units'] * 100 / transactions_items['category_units'], downcast='float') 

transactions_items['category_turnover'] = pd.to_numeric(transactions_items.groupby('item_category_id')['turnover'].transform(np.sum), downcast='unsigned') 
transactions_items['item_share_of_category_turnover'] = pd.to_numeric(transactions_items['item_turnover'] * 100 / transactions_items['category_turnover'], downcast='float') 


In [207]:
transactions_items.to_pickle("pickled/transactions_items")
transactions_items_blocks.to_pickle("pickled/transactions_items_blocks")

del transactions_items
del transactions_items_blocks
gc.collect()

346

#CATEGORY

-UNITS
category_units
category_block_units
category_mean_units_block
category_day_units
category_mean_units_day
category_max_units_block
category_min_units_block
category_max_units_day
category_min_units_day

-TURNOVER
category_turnover
category_block_turnover
category_mean_turnover_block
category_day_turnover
category_mean_turnover_day
category_max_turnover_block
category_min_turnover_block
category_max_turnover_day
category_min_turnover_day


-PRICE
category_mean_price
category_mean_price_block
category_min_price
category_max_price


-TREND
category_first_two_blocks_units
category_last_two_blocks_units
category_fluctuation_units_first_last_blocks
category_first_two_blocks_mean_price
category_last_two_blocks_mean_price
category_fluctuation_price_first_last_blocks

-SUBCATEGORY
subcategory
subcategory 1hot

-UNITS
subcategory_units
subcategory_block_units
subcategory_mean_units_block
subcategory_day_units
subcategory_mean_units_day
subcategory_max_units_block
subcategory_min_units_block
subcategory_max_units_day
subcategory_min_units_day

-TURNOVER
subcategory_turnover
subcategory_block_turnover
subcategory_mean_turnover_block
subcategory_day_turnover
subcategory_mean_turnover_day
subcategory_max_turnover_block
subcategory_min_turnover_block
subcategory_max_turnover_day
subcategory_min_turnover_day

-ENCODINGS
category_share_of_total_units
category_share_of_total_gross
subcategory_share_of_total_units
subcategory_share_of_total_gross

-TREND
subcategory_first_two_blocks_units
subcategory_last_two_blocks_units
subcategory_fluctuation_units_first_last_blocks
subcategory_first_two_blocks_mean_price
subcategory_last_two_blocks_mean_price
subcategory_fluctuation_price_first_last_blocks

In [208]:
gc.collect()
transactions_categories = transactions.copy()
transactions_categories_blocks = transactions.copy()

In [209]:
sub_cats = {}
for i in range(1,8):
    sub_cats[i] = "Accessories"
sub_cats[8] = "Tickets"
sub_cats[9] = "Delivery of goods"
for i in range(10,18):
    sub_cats[i] = "Consoles"
for i in range(18,25):
    sub_cats[i] = "Game for Consoles"
sub_cats[25] = "Accessories for Games"
sub_cats[26] = "Android Games"
sub_cats[27] = "MAC Games"
for i in range(28,32):
    sub_cats[i] = "PC Games"
for i in range(32,37):
    sub_cats[i] = "Payment Cards"
for i in range(37,40):
    sub_cats[i] = "Cinema - Blu-ray"
sub_cats[40] = "Cinema - DVD"
sub_cats[41] = "Cinema - Collectible"
for i in range(42,46):
    sub_cats[i] = "Audiobooks"
for i in range(46,55):
    sub_cats[i] = "Books"
for i in range(55,57):
    sub_cats[i] = "Music - CD"
sub_cats[57] = "Music - MP3"
sub_cats[58] = "Music - Vinyl"
sub_cats[59] = "Music - Music Video"
sub_cats[60] = "Music - Gift Edition"
for i in range(61,74):
    sub_cats[i] = "Gifts"
for i in range(73,79):
    sub_cats[i] = "Software"
sub_cats[79] = "Utility"
for i in range(80,84):
    sub_cats[i] = "Misc"
    
    
transactions_categories['subcategory'] = transactions_categories['item_category_id'].apply(lambda x: sub_cats[x]).astype('category')
transactions_categories_blocks['subcategory'] = transactions_categories_blocks['item_category_id'].apply(lambda x: sub_cats[x]).astype('category')

In [210]:
transactions_categories_blocks['category_block_units'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_categories_blocks['category_block_turnover'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_categories_blocks['category_mean_price_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id', 'date_block_num'])['item_price'].transform(np.mean), downcast='float') 
transactions_categories_blocks['subcategory_block_units'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_categories_blocks['subcategory_block_turnover'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_categories_blocks['subcategory_mean_price_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory', 'date_block_num'])['item_price'].transform(np.mean), downcast='float') 


In [211]:
transactions_categories['category_units'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_categories['category_mean_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform(np.mean), downcast='float') 
transactions_categories['category_day_units'] = pd.to_numeric(transactions_categories.groupby(['item_category_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 
transactions_categories['category_mean_units_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_units'].transform(np.mean), downcast='float') 
transactions_categories['category_max_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform(np.max), downcast='unsigned') 
transactions_categories['category_min_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform(np.min), downcast='unsigned') 
transactions_categories['category_max_units_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_units'].transform(np.max), downcast='unsigned') 
transactions_categories['category_min_units_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_units'].transform(np.min), downcast='unsigned') 

In [212]:
transactions_categories['category_turnover'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_categories['category_mean_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform(np.mean), downcast='float') 
transactions_categories['category_day_turnover'] = pd.to_numeric(transactions_categories.groupby(['item_category_id','date'])['turnover'].transform(np.sum), downcast='unsigned') 
transactions_categories['category_mean_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform(np.mean), downcast='float') 
transactions_categories['category_max_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform(np.max), downcast='unsigned') 
transactions_categories['category_min_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform(np.min), downcast='unsigned') 
transactions_categories['category_max_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform(np.max), downcast='unsigned') 
transactions_categories['category_min_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform(np.min), downcast='unsigned') 

In [213]:
transactions_categories['category_mean_price'] = pd.to_numeric(transactions_categories.groupby('item_category_id')['item_price'].transform(np.mean), downcast='float') 
transactions_categories['category_min_price'] = pd.to_numeric(transactions_categories.groupby('item_category_id')['item_price'].transform(np.min), downcast='unsigned')
transactions_categories['category_max_price'] = pd.to_numeric(transactions_categories.groupby('item_category_id')['item_price'].transform(np.max), downcast='unsigned')

In [214]:
category_first_two_blocks_units = transactions_categories.groupby(['item_category_id','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
category_first_two_blocks_units = category_first_two_blocks_units[category_first_two_blocks_units['is_first_two_blocks'] == True].set_index('item_category_id').iloc[:,1]
transactions_categories['category_first_two_blocks_units'] = pd.to_numeric(transactions_categories['item_category_id'].map(category_first_two_blocks_units), downcast='unsigned')

category_last_two_blocks_units = transactions_categories.groupby(['item_category_id','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
category_last_two_blocks_units = category_last_two_blocks_units[category_last_two_blocks_units['is_last_two_blocks'] == True].set_index('item_category_id').iloc[:,1]
transactions_categories['category_last_two_blocks_units'] = pd.to_numeric(transactions_categories['item_category_id'].map(category_last_two_blocks_units), downcast='unsigned')

transactions_categories['category_fluctuation_units_first_last_blocks'] =  pd.to_numeric(((transactions_categories['category_first_two_blocks_units'] - transactions_categories['category_last_two_blocks_units'] ) / \
                                                             transactions_categories['category_first_two_blocks_units']) * 100 * -1, downcast='float') 


category_first_two_blocks_mean_price = transactions_categories.groupby(['item_category_id','is_first_two_blocks'], as_index=False)['item_price'].mean()
category_first_two_blocks_mean_price = category_first_two_blocks_mean_price[category_first_two_blocks_mean_price['is_first_two_blocks'] == True].set_index('item_category_id').iloc[:,1]
transactions_categories['category_first_two_blocks_mean_price'] = pd.to_numeric(transactions_categories['item_category_id'].map(category_first_two_blocks_mean_price), downcast='unsigned')

category_last_two_blocks_mean_price = transactions_categories.groupby(['item_category_id','is_last_two_blocks'], as_index=False)['item_price'].mean()
category_last_two_blocks_mean_price = category_last_two_blocks_mean_price[category_last_two_blocks_mean_price['is_last_two_blocks'] == True].set_index('item_category_id').iloc[:,1]
transactions_categories['category_last_two_blocks_mean_price'] = pd.to_numeric(transactions_categories['item_category_id'].map(category_last_two_blocks_mean_price), downcast='unsigned')

transactions_categories['category_fluctuation_price_first_last_blocks'] =  pd.to_numeric(((transactions_categories['category_first_two_blocks_mean_price'] - transactions_categories['category_last_two_blocks_mean_price'] ) / \
                                                             transactions_categories['category_first_two_blocks_mean_price'])  * 100 * -1, downcast='float') 

In [215]:
transactions_categories['video_game'] = transactions_categories["item_category_id"].isin(list(range(18,32)))
transactions_categories['gaming_old_gen'] = transactions_categories["item_category_id"].isin([10,11,15,18,19,23])
transactions_categories['gaming_new_gen'] = transactions_categories["item_category_id"].isin([12,14,16,20,22,24])
transactions_categories['pc_games'] = transactions_categories["item_category_id"].isin(list(range(27,32)))
transactions_categories['payment_cards'] = transactions_categories["item_category_id"].isin(list(range(32,37)))
transactions_categories['movies'] = transactions_categories["item_category_id"].isin(list(range(37,42)))
transactions_categories['movies_niche'] = transactions_categories["item_category_id"].isin([38,39])
transactions_categories['books'] = transactions_categories["item_category_id"].isin([42,55])
transactions_categories['music'] = transactions_categories["item_category_id"].isin(list(range(55,61)))
transactions_categories['music_CD'] = transactions_categories["item_category_id"].isin([55,56])
transactions_categories['music_vinyl'] = transactions_categories["item_category_id"].isin([58])
transactions_categories['gifts'] = transactions_categories["item_category_id"].isin(list(range(61,72)))
transactions_categories['software'] = transactions_categories["item_category_id"].isin(list(range(73,79)))

In [216]:
transactions_categories['subcategory_units'] = pd.to_numeric(transactions_categories.groupby(['subcategory'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_categories['subcategory_mean_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory'])['subcategory_block_units'].transform(np.mean), downcast='float') 
transactions_categories['subcategory_day_units'] = pd.to_numeric(transactions_categories.groupby(['subcategory','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_categories['subcategory_mean_units_day'] = pd.to_numeric(transactions_categories.groupby(['subcategory'])['subcategory_day_units'].transform(np.mean), downcast='float') 
transactions_categories['subcategory_max_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory'])['subcategory_block_units'].transform(np.max), downcast='unsigned')
transactions_categories['subcategory_min_units_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory'])['subcategory_block_units'].transform(np.min), downcast='unsigned')
transactions_categories['subcategory_max_units_day'] = pd.to_numeric(transactions_categories.groupby(['subcategory'])['subcategory_day_units'].transform(np.max), downcast='unsigned')
transactions_categories['subcategory_min_units_day'] = pd.to_numeric(transactions_categories.groupby(['subcategory'])['subcategory_day_units'].transform(np.min), downcast='unsigned')

In [217]:
transactions_categories['subcategory_turnover'] = pd.to_numeric(transactions_categories.groupby(['subcategory'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_categories['subcategory_mean_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory'])['subcategory_block_turnover'].transform(np.mean), downcast='float') 
transactions_categories['subcategory_day_turnover'] = pd.to_numeric(transactions_categories.groupby(['subcategory','date'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_categories['subcategory_mean_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['subcategory'])['subcategory_day_turnover'].transform(np.mean), downcast='float') 
transactions_categories['subcategory_max_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory'])['subcategory_block_turnover'].transform(np.max), downcast='unsigned')
transactions_categories['subcategory_min_turnover_block'] = pd.to_numeric(transactions_categories_blocks.groupby(['subcategory'])['subcategory_block_turnover'].transform(np.min), downcast='unsigned')
transactions_categories['subcategory_max_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['subcategory'])['subcategory_day_turnover'].transform(np.max), downcast='unsigned')
transactions_categories['subcategory_min_turnover_day'] = pd.to_numeric(transactions_categories.groupby(['subcategory'])['subcategory_day_turnover'].transform(np.min), downcast='unsigned')

In [218]:
transactions_categories['category_share_of_total_units'] = pd.to_numeric(transactions_categories['category_units'] * 100 / total_sales , downcast='float') 
transactions_categories['category_share_of_total_turnover'] = pd.to_numeric(transactions_categories['category_turnover']* 100 / total_turnover, downcast='float') 

transactions_categories['subcategory_units'] = pd.to_numeric(transactions_categories.groupby("subcategory")['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_categories['subcategory_share_of_total_units'] = pd.to_numeric(transactions_categories['subcategory_units'] * 100 / total_sales, downcast='float') 
transactions_categories['subcategory_turnover'] = pd.to_numeric(transactions_categories.groupby("subcategory")['turnover'].transform(np.sum), downcast='unsigned')
transactions_categories['subcategory_share_of_total_turnover'] = pd.to_numeric(transactions_categories['subcategory_turnover']* 100 / total_turnover, downcast='float') 

In [219]:
subcategory_first_two_blocks_units = transactions_categories.groupby(['subcategory','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
subcategory_first_two_blocks_units = subcategory_first_two_blocks_units[subcategory_first_two_blocks_units['is_first_two_blocks'] == True].set_index('subcategory').iloc[:,1]
transactions_categories['subcategory_first_two_blocks_units'] = pd.to_numeric(transactions_categories['subcategory'].map(subcategory_first_two_blocks_units), downcast='unsigned')

subcategory_last_two_blocks_units = transactions_categories.groupby(['subcategory','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
subcategory_last_two_blocks_units = subcategory_last_two_blocks_units[subcategory_last_two_blocks_units['is_last_two_blocks'] == True].set_index('subcategory').iloc[:,1]
transactions_categories['subcategory_last_two_blocks_units'] = pd.to_numeric(transactions_categories['subcategory'].map(subcategory_last_two_blocks_units), downcast='unsigned')

transactions_categories['subcategory_fluctuation_units_first_last_blocks'] =  pd.to_numeric(((transactions_categories['subcategory_first_two_blocks_units'] - transactions_categories['subcategory_last_two_blocks_units'] ) / \
                                                             transactions_categories['subcategory_first_two_blocks_units']) * 100 * -1, downcast='float') 


subcategory_first_two_blocks_mean_price = transactions_categories.groupby(['subcategory','is_first_two_blocks'], as_index=False)['item_price'].mean()
subcategory_first_two_blocks_mean_price = subcategory_first_two_blocks_mean_price[subcategory_first_two_blocks_mean_price['is_first_two_blocks'] == True].set_index('subcategory').iloc[:,1]
transactions_categories['subcategory_first_two_blocks_mean_price'] = pd.to_numeric(transactions_categories['subcategory'].map(subcategory_first_two_blocks_mean_price), downcast='unsigned')

subcategory_last_two_blocks_mean_price = transactions_categories.groupby(['subcategory','is_last_two_blocks'], as_index=False)['item_price'].mean()
subcategory_last_two_blocks_mean_price = subcategory_last_two_blocks_mean_price[subcategory_last_two_blocks_mean_price['is_last_two_blocks'] == True].set_index('subcategory').iloc[:,1]
transactions_categories['subcategory_last_two_blocks_mean_price'] = pd.to_numeric(transactions_categories['subcategory'].map(subcategory_last_two_blocks_mean_price), downcast='unsigned')

transactions_categories['subcategory_fluctuation_price_first_last_blocks'] =  pd.to_numeric(((transactions_categories['subcategory_first_two_blocks_mean_price'] - transactions_categories['subcategory_last_two_blocks_mean_price'] ) / \
                                                             transactions_categories['subcategory_first_two_blocks_mean_price'])  * 100 * -1, downcast='float') 

In [220]:

transactions_categories.to_pickle("pickled/transactions_categories")
transactions_categories_blocks.to_pickle("pickled/transactions_categories_blocks")

del transactions_categories
del transactions_categories_blocks
gc.collect()

198

#SHOP

-UNITS
shop_units
shop_block_units
shop_mean_units_block
shop_day_units
shop_mean_units_day
shop_max_units_block
shop_min_units_block
shop_max_units_day
shop_min_units_day

-TURNOVER
shop_turnover
shop_block_turnover
shop_mean_turnover_block
shop_day_turnover
shop_mean_turnover_day
shop_max_turnover_block
shop_min_turnover_block
shop_max_turnover_day
shop_min_turnover_day

-PRICE
shop_mean_price
shop_mean_price_block


-TREND
shop_first_two_blocks_units
shop_last_two_blocks_units
shop_fluctuation_units_first_last_blocks
shop_first_two_blocks_mean_price
shop_last_two_blocks_mean_price
shop_fluctuation_price_first_last_blocks

-ENCODINGS
shop_share_of_total_units
shop_share_of_total_gross

-MISC
shop_ids_TC
shop_ids_TRK
shop_ids_SEC
shop_ids_shopping_center
shop_ids_moscow

-CATEGORY
shop_top_category_units
shop_top_category_turnover
shop_top_subcategory_units
shop_top_subcategory_turnover

In [154]:
gc.collect()
transactions_shops = transactions.copy()
transactions_shops_blocks = transactions.copy()

In [155]:
shop_areas = {}
for i in range(0,2):
    shop_areas[i] = "Yakutsk"
shop_areas[2] = "Adygea"
shop_areas[3] = "Balashikha"
shop_areas[4] = "Volga"
shop_areas[5] = "Vologda"
for i in range(6,9):
    shop_areas[i] = "Voronezh"
shop_areas[9] = "Outbound Trading"
for i in range(10,12):
    shop_areas[i] = "Zhukovsky"
shop_areas[12] = "Online store emergency"
for i in range(13,15):
    shop_areas[i] = "Kazan"
shop_areas[15] = "Kaluga"
shop_areas[16] = "Kolomna"
for i in range(17,19):
    shop_areas[i] = "Krasnoyarsk"
shop_areas[19] = "Kursk"
for i in range(20,33):
    shop_areas[i] = "Moscow"
shop_areas[33] = "Mytishchi"
for i in range(34,36):
    shop_areas[i] = "N.Novgorod"
for i in range(36,38):
    shop_areas[i] = "Novosibirsk"
shop_areas[38] = "Omsk"
for i in range(39,42):
    shop_areas[i] = "RostovNaDonu"
for i in range(42,44):
    shop_areas[i] = "St. Petersburg"
for i in range(44,46):
    shop_areas[i] = "Samara"
shop_areas[46] = "Sergiev Posad"
shop_areas[47] = "Surgut"
shop_areas[48] = "Tomsk"
for i in range(49,52):
    shop_areas[i] = "Tyumen TC"
for i in range(52,54):
    shop_areas[i] = "Ufa"
shop_areas[54] = "Khimki"
shop_areas[55] = "Digital warehouse"
shop_areas[56] = "Chekhov"
for i in range(57,59):
    shop_areas[i] = "Yakutsk"
shop_areas[59] = "Yaroslavl"

transactions_shops['area'] = transactions_shops['shop_id'].apply(lambda x: shop_areas[x]).astype('category')

transactions_shops_blocks['area'] = transactions_shops['shop_id'].apply(lambda x: shop_areas[x]).astype('category')



In [156]:
transactions_shops_blocks['shop_block_units'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_blocks['shop_block_turnover'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_blocks['shop_mean_price_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id', 'date_block_num'])['item_price'].transform(np.mean), downcast='float') 
transactions_shops_blocks['area_block_units'] = pd.to_numeric(transactions_shops_blocks.groupby(['area','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_blocks['area_block_turnover'] = pd.to_numeric(transactions_shops_blocks.groupby(['area','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_blocks['area_mean_price_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['area', 'date_block_num'])['item_price'].transform(np.mean), downcast='float') 


In [157]:
transactions_shops['shop_units'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops['shop_mean_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_units'].transform(np.mean), downcast='float') 
transactions_shops['shop_day_units'] = pd.to_numeric(transactions_shops.groupby(['shop_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops['shop_mean_units_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_units'].transform(np.mean), downcast='float') 
transactions_shops['shop_max_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_units'].transform(np.max), downcast='unsigned')
transactions_shops['shop_min_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_units'].transform(np.min), downcast='unsigned')
transactions_shops['shop_max_units_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_units'].transform(np.max), downcast='unsigned')
transactions_shops['shop_min_units_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_units'].transform(np.min), downcast='unsigned')

In [158]:
transactions_shops['shop_turnover'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops['shop_mean_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_turnover'].transform(np.mean), downcast='float') 
transactions_shops['shop_day_turnover'] = pd.to_numeric(transactions_shops.groupby(['shop_id','date'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops['shop_mean_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_turnover'].transform(np.mean), downcast='float') 
transactions_shops['shop_max_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_turnover'].transform(np.max), downcast='unsigned')
transactions_shops['shop_min_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['shop_id'])['shop_block_turnover'].transform(np.min), downcast='unsigned')
transactions_shops['shop_max_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_turnover'].transform(np.max), downcast='unsigned')
transactions_shops['shop_min_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['shop_id'])['shop_day_turnover'].transform(np.min), downcast='unsigned')

In [159]:
transactions_shops['shop_mean_price'] = pd.to_numeric(transactions_shops.groupby('shop_id')['item_price'].transform(np.mean), downcast='float') 


In [160]:
shop_first_two_blocks_units = transactions_shops.groupby(['shop_id','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
shop_first_two_blocks_units = shop_first_two_blocks_units[shop_first_two_blocks_units['is_first_two_blocks'] == True].set_index('shop_id').iloc[:,1]
transactions_shops['shop_first_two_blocks_units'] = pd.to_numeric(transactions_shops['shop_id'].map(shop_first_two_blocks_units), downcast='unsigned')

shop_last_two_blocks_units = transactions_shops.groupby(['shop_id','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
shop_last_two_blocks_units = shop_last_two_blocks_units[shop_last_two_blocks_units['is_last_two_blocks'] == True].set_index('shop_id').iloc[:,1]
transactions_shops['shop_last_two_blocks_units'] = pd.to_numeric(transactions_shops['shop_id'].map(shop_last_two_blocks_units), downcast='unsigned')

transactions_shops['shop_fluctuation_units_first_last_blocks'] =  pd.to_numeric(((transactions_shops['shop_first_two_blocks_units'] - transactions_shops['shop_last_two_blocks_units'] ) / \
                                                             transactions_shops['shop_first_two_blocks_units']) * 100 * -1, downcast='float') 


shop_first_two_blocks_mean_price = transactions_shops.groupby(['shop_id','is_first_two_blocks'], as_index=False)['item_price'].mean()
shop_first_two_blocks_mean_price = shop_first_two_blocks_mean_price[shop_first_two_blocks_mean_price['is_first_two_blocks'] == True].set_index('shop_id').iloc[:,1]
transactions_shops['shop_first_two_blocks_mean_price'] = pd.to_numeric(transactions_shops['shop_id'].map(shop_first_two_blocks_mean_price), downcast='unsigned')

shop_last_two_blocks_mean_price = transactions_shops.groupby(['shop_id','is_last_two_blocks'], as_index=False)['item_price'].mean()
shop_last_two_blocks_mean_price = shop_last_two_blocks_mean_price[shop_last_two_blocks_mean_price['is_last_two_blocks'] == True].set_index('shop_id').iloc[:,1]
transactions_shops['shop_last_two_blocks_mean_price'] = pd.to_numeric(transactions_shops['shop_id'].map(shop_last_two_blocks_mean_price), downcast='unsigned')

transactions_shops['shop_fluctuation_price_first_last_blocks'] =  pd.to_numeric(((transactions_shops['shop_first_two_blocks_mean_price'] - transactions_shops['shop_last_two_blocks_mean_price'] ) / \
                                                             transactions_shops['shop_first_two_blocks_mean_price'])  * 100 * -1, downcast='float') 

In [161]:
transactions_shops['shop_share_of_units'] = pd.to_numeric(transactions_shops['shop_units'] * 100 / total_sales, downcast='float') 
transactions_shops['shop_share_of_turnover'] = pd.to_numeric(transactions_shops['shop_turnover'] * 100 / total_turnover, downcast='float') 

In [162]:
shop_ids_TC = [1,2,13,14,16,23,24,26,28,31,37,38,42,43,44,46,50,54,58]
shop_ids_TRK = [3,33,39,40]
shop_ids_SEC = [7,34,36,47,48,49,56]
shop_ids_shopping_center = [4,5,8,15,17,18,19,27,29,30,32,41,45,51,53,59]
shop_ids_moscow = list(range(20,33))


transactions_shops['shop_TC'] = transactions_shops['shop_id'].isin(shop_ids_TC)
transactions_shops['shop_TRK'] = transactions_shops['shop_id'].isin(shop_ids_TRK)
transactions_shops['shop_SEC'] = transactions_shops['shop_id'].isin(shop_ids_SEC)
transactions_shops['shop_shopping_center'] = transactions_shops['shop_id'].isin(shop_ids_shopping_center)
transactions_shops['shop_moscow'] = transactions_shops['shop_id'].isin(shop_ids_moscow)

In [163]:
a = transactions_shops.groupby(['shop_id', 'item_category_id'],as_index=False)['item_cnt_day'].sum()

b = pd.DataFrame(transactions_shops.groupby(['shop_id', 'item_category_id'],as_index=False)['item_cnt_day'].sum()\
                  .groupby(['shop_id'])['item_cnt_day'].max()).reset_index()

c = b.merge(a, on=['shop_id', 'item_cnt_day'],how='left')[['shop_id', 'item_category_id']].rename(columns={'item_category_id':'max_category_units'})

transactions_shops = transactions_shops.merge(c, on=['shop_id'], how='left')

transactions_shops['max_category_units'] = pd.to_numeric(transactions_shops['max_category_units'], downcast='unsigned')

In [164]:
a = transactions_shops.groupby(['shop_id', 'item_category_id'],as_index=False)['turnover'].sum()

b = pd.DataFrame(transactions_shops.groupby(['shop_id', 'item_category_id'],as_index=False)['turnover'].sum()\
                  .groupby(['shop_id'])['turnover'].max()).reset_index()

c = b.merge(a, on=['shop_id', 'turnover'],how='left')[['shop_id', 'item_category_id']].rename(columns={'item_category_id':'max_category_turnover'})

transactions_shops = transactions_shops.merge(c, on=['shop_id'], how='left')
transactions_shops['max_category_turnover'] = pd.to_numeric(transactions_shops['max_category_turnover'], downcast='unsigned')

-AREA
area



-UNITS
area_units
area_block_units
area_mean_units_block
area_day_units
area_mean_units_day
area_max_units_block
area_min_units_block
area_max_units_day
area_min_units_day

-TURNOVER
area_turnover
area_block_turnover
area_mean_turnover_block
area_day_turnover
area_mean_turnover_day
area_max_turnover_block
area_min_turnover_block
area_max_turnover_day
area_min_turnover_day

-PRICE
area_mean_price
area_mean_price_block


-TREND
area_first_two_blocks_units
area_last_two_blocks_units
area_fluctuation_units_first_last_blocks
area_first_two_blocks_mean_price
area_last_two_blocks_mean_price
area_fluctuation_price_first_last_blocks

-ENCODINGS
area_share_of_total_units
area_share_of_total_gross

In [165]:
transactions_shops['area_units'] = pd.to_numeric(transactions_shops.groupby(['area'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops['area_mean_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['area'])['area_block_units'].transform(np.mean), downcast='float') 
transactions_shops['area_day_units'] = pd.to_numeric(transactions_shops.groupby(['area','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops['area_mean_units_day'] = pd.to_numeric(transactions_shops.groupby(['area'])['area_day_units'].transform(np.mean), downcast='float') 
transactions_shops['area_max_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['area'])['area_block_units'].transform(np.max), downcast='unsigned')
transactions_shops['area_min_units_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['area'])['area_block_units'].transform(np.min), downcast='unsigned')
transactions_shops['area_max_units_day'] = pd.to_numeric(transactions_shops.groupby(['area'])['area_day_units'].transform(np.max), downcast='unsigned')
transactions_shops['area_min_units_day'] = pd.to_numeric(transactions_shops.groupby(['area'])['area_day_units'].transform(np.min), downcast='unsigned')

In [166]:
transactions_shops['area_turnover'] = pd.to_numeric(transactions_shops.groupby(['area'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops['area_mean_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['area'])['area_block_turnover'].transform(np.mean), downcast='float') 
transactions_shops['area_day_turnover'] = pd.to_numeric(transactions_shops.groupby(['area','date'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops['area_mean_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['area'])['area_day_turnover'].transform(np.mean), downcast='float') 
transactions_shops['area_max_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['area'])['area_block_turnover'].transform(np.max), downcast='unsigned')
transactions_shops['area_min_turnover_block'] = pd.to_numeric(transactions_shops_blocks.groupby(['area'])['area_block_turnover'].transform(np.min), downcast='unsigned')
transactions_shops['area_max_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['area'])['area_day_turnover'].transform(np.max), downcast='unsigned')
transactions_shops['area_min_turnover_day'] = pd.to_numeric(transactions_shops.groupby(['area'])['area_day_turnover'].transform(np.min), downcast='unsigned')

In [167]:
transactions_shops['area_mean_price'] = pd.to_numeric(transactions_shops.groupby('area')['item_price'].transform(np.mean), downcast='float') 


In [168]:
area_first_two_blocks_units = transactions_shops.groupby(['area','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
area_first_two_blocks_units = area_first_two_blocks_units[area_first_two_blocks_units['is_first_two_blocks'] == True].set_index('area').iloc[:,1]
transactions_shops['area_first_two_blocks_units'] = pd.to_numeric(transactions_shops['area'].map(area_first_two_blocks_units), downcast='unsigned')

area_last_two_blocks_units = transactions_shops.groupby(['area','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
area_last_two_blocks_units = area_last_two_blocks_units[area_last_two_blocks_units['is_last_two_blocks'] == True].set_index('area').iloc[:,1]
transactions_shops['area_last_two_blocks_units'] = pd.to_numeric(transactions_shops['area'].map(area_last_two_blocks_units), downcast='unsigned')

transactions_shops['area_fluctuation_units_first_last_blocks'] =  pd.to_numeric(((transactions_shops['area_first_two_blocks_units'] - transactions_shops['area_last_two_blocks_units'] ) / \
                                                             transactions_shops['area_first_two_blocks_units']) * 100 * -1, downcast='float') 


area_first_two_blocks_mean_price = transactions_shops.groupby(['area','is_first_two_blocks'], as_index=False)['item_price'].mean()
area_first_two_blocks_mean_price = area_first_two_blocks_mean_price[area_first_two_blocks_mean_price['is_first_two_blocks'] == True].set_index('area').iloc[:,1]
transactions_shops['area_first_two_blocks_mean_price'] = pd.to_numeric(transactions_shops['area'].map(area_first_two_blocks_mean_price), downcast='unsigned')

area_last_two_blocks_mean_price = transactions_shops.groupby(['area','is_last_two_blocks'], as_index=False)['item_price'].mean()
area_last_two_blocks_mean_price = area_last_two_blocks_mean_price[area_last_two_blocks_mean_price['is_last_two_blocks'] == True].set_index('area').iloc[:,1]
transactions_shops['area_last_two_blocks_mean_price'] = pd.to_numeric(transactions_shops['area'].map(area_last_two_blocks_mean_price), downcast='unsigned')

transactions_shops['area_fluctuation_price_first_last_blocks'] =  pd.to_numeric(((transactions_shops['area_first_two_blocks_mean_price'] - transactions_shops['area_last_two_blocks_mean_price'] ) / \
                                                             transactions_shops['area_first_two_blocks_mean_price'])  * 100 * -1, downcast='float') 

In [169]:

transactions_shops.to_pickle("pickled/transactions_shops")
transactions_shops_blocks.to_pickle("pickled/transactions_shops_blocks")


del transactions_shops
del transactions_shops_blocks
gc.collect()

282

shop_category


-UNITS
shop_category_units
shop_category_block_units
shop_category_mean_units_block
shop_category_day_units
shop_category_mean_units_day
shop_category_max_units_block
shop_category_min_units_block
shop_category_max_units_day
shop_category_min_units_day

-TURNOVER
shop_category_turnover
shop_category_block_turnover
shop_category_mean_turnover_block
shop_category_day_turnover
shop_category_mean_turnover_day
shop_category_max_turnover_block
shop_category_min_turnover_block
shop_category_max_turnover_day
shop_category_min_turnover_day

-PRICE
shop_category_mean_price
shop_category_mean_price_block


-TREND
shop_category_first_two_blocks_units
shop_category_last_two_blocks_units
shop_category_fluctuation_units_first_last_blocks
shop_category_first_two_blocks_mean_price
shop_category_last_two_blocks_mean_price
shop_category_fluctuation_price_first_last_blocks

-ENCODINGS
shop_category_share_of_total_units
shop_category_share_of_total_gross

In [170]:
gc.collect()
transactions_shops_categories = transactions.copy()
transactions_shops_categories_blocks = transactions.copy()

In [171]:
transactions_shops_categories_blocks['shop_category_block_units'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_categories_blocks['shop_category_block_turnover'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id','date_block_num'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_categories_blocks['shop_category_mean_price_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id', 'date_block_num'])['item_price'].transform(np.mean), downcast='float') 

In [172]:
transactions_shops_categories['shop_category_units'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_categories['shop_category_mean_units_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_units'].transform(np.mean), downcast='float') 
transactions_shops_categories['shop_category_day_units'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id','date'])['item_cnt_day'].transform(np.sum), downcast='unsigned')
transactions_shops_categories['shop_category_mean_units_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_units'].transform(np.mean), downcast='float') 
transactions_shops_categories['shop_category_max_units_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_units'].transform(np.max), downcast='unsigned')
transactions_shops_categories['shop_category_min_units_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_units'].transform(np.min), downcast='unsigned')
transactions_shops_categories['shop_category_max_units_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_units'].transform(np.max), downcast='unsigned')
transactions_shops_categories['shop_category_min_units_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_units'].transform(np.min), downcast='unsigned')


In [173]:
transactions_shops_categories['shop_category_turnover'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_categories['shop_category_mean_turnover_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_turnover'].transform(np.mean), downcast='float') 
transactions_shops_categories['shop_category_day_turnover'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id','date'])['turnover'].transform(np.sum), downcast='unsigned')
transactions_shops_categories['shop_category_mean_turnover_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_turnover'].transform(np.mean), downcast='float') 
transactions_shops_categories['shop_category_max_turnover_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_turnover'].transform(np.max), downcast='unsigned')
transactions_shops_categories['shop_category_min_turnover_block'] = pd.to_numeric(transactions_shops_categories_blocks.groupby(['shop_id','item_category_id'])['shop_category_block_turnover'].transform(np.min), downcast='unsigned')
transactions_shops_categories['shop_category_max_turnover_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_turnover'].transform(np.max), downcast='unsigned')
transactions_shops_categories['shop_category_min_turnover_day'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['shop_category_day_turnover'].transform(np.min), downcast='unsigned')

In [174]:
transactions_shops_categories['shop_category_mean_price'] = pd.to_numeric(transactions_shops_categories.groupby(['shop_id','item_category_id'])['item_price'].transform(np.mean), downcast='float') 


In [175]:
transactions_shops_categories.to_pickle("pickled/transactions_shops_categories")

In [176]:
restored = pd.read_pickle("pickled/transactions_shops_categories")

In [177]:

transactions_shops_categories.to_pickle("pickled/transactions_shops_categories")
transactions_shops_categories_blocks.to_pickle("pickled/transactions_shops_categories_blocks")


del transactions_shops_categories
del transactions_shops_categories_blocks
gc.collect()

245

In [178]:
###
#DEBUG
###


pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
#transactions.sample(10).sort_values(by=['item_units'], ascending=False)
#transactions[transactions['item_category_id'] == 58].sample(10).sort_values(by=['total_sales_units'], ascending=False)

In [180]:
train_item_ids = transactions['item_id'].unique()
train_shop_ids = transactions['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = transactions['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [181]:
combinations = []
for shop in all_shop_ids:
    #get all article ids ever associated to this shop
    train_ids = transactions[transactions['shop_id'] == shop]['item_id'].unique()
    test_ids = test[test['shop_id'] == shop]['item_id'].unique()
    all_shop = np.unique(np.append(train_ids, test_ids))
    all_shop_combo = [[item, shop, block] for item in all_shop for block in train_blocks]
    for combo in all_shop_combo:
        combinations.append(combo)

In [182]:
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['item_id', 'shop_id', 'date_block_num'])

In [183]:
all_combos.head()

Unnamed: 0,item_id,shop_id,date_block_num
0,0.0,54.0,12.0
1,0.0,54.0,13.0
2,0.0,54.0,14.0
3,0.0,54.0,15.0
4,0.0,54.0,16.0


In [185]:
all_combos['item_id'] = pd.to_numeric(all_combos['item_id'], downcast='unsigned')
all_combos['shop_id'] = pd.to_numeric(all_combos['shop_id'], downcast='unsigned')
all_combos['date_block_num'] = pd.to_numeric(all_combos['date_block_num'], downcast='unsigned')

In [186]:
len(all_combos)

8333930

In [187]:
all_combos = pd.merge(all_combos, items[['item_id', 'item_category_id']], on='item_id', how='left')

In [188]:
dates = transactions[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
dates_dict

{20: {'month': 9, 'year': 2014},
 15: {'month': 4, 'year': 2014},
 18: {'month': 7, 'year': 2014},
 19: {'month': 8, 'year': 2014},
 21: {'month': 10, 'year': 2014},
 22: {'month': 11, 'year': 2014},
 23: {'month': 12, 'year': 2014},
 24: {'month': 1, 'year': 2015},
 27: {'month': 4, 'year': 2015},
 25: {'month': 2, 'year': 2015},
 12: {'month': 1, 'year': 2014},
 14: {'month': 3, 'year': 2014},
 16: {'month': 5, 'year': 2014},
 17: {'month': 6, 'year': 2014},
 13: {'month': 2, 'year': 2014},
 26: {'month': 3, 'year': 2015},
 28: {'month': 5, 'year': 2015},
 29: {'month': 6, 'year': 2015},
 30: {'month': 7, 'year': 2015},
 31: {'month': 8, 'year': 2015},
 32: {'month': 9, 'year': 2015},
 33: {'month': 10, 'year': 2015}}

In [189]:
all_combos['month'] = pd.to_numeric(all_combos['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')
all_combos['year'] = pd.to_numeric(all_combos['date_block_num'].apply(lambda block: dates_dict[block]['year']), downcast='unsigned')

In [190]:
transactions_items_columns = ['item_id', 'item_first_block',
       'item_last_block', 'is_first_two_blocks', 'is_last_two_blocks',
       'item_units', 'item_mean_units_block', 'item_day_units',
       'item_mean_units_day', 'item_max_units_block',
       'item_min_units_block', 'item_max_units_day', 'item_min_units_day',
       'item_turnover', 'item_mean_turnover_block', 'item_day_turnover',
       'item_mean_turnover_day', 'item_max_turnover_block',
       'item_min_turnover_block', 'item_max_turnover_day',
       'item_min_turnover_day', 'item_days_of_activity',
       'item_blocks_of_activity', 'item_days_since_start',
       'item_mean_day_between_activity',
       'item_longest_stretch_days_without_activity',
       'item_longest_stretch_blocks_without_activity',
       'item_longest_stretch_block_with_activity',
       'item_number_of_consecutive_days_with_activity',
       'item_days_between_start_and_first_activity',
       'item_blocks_between_start_and_first_activity', 'item_first_day',
       'item_last_day', 'item_activity_on_all_blocks', 'item_mean_price',
       'item_min_price', 'item_max_price', 'item_number_different_prices',
       'item_price_amplitude', 'category_mean_price',
       'item_deviation_mean_category_price',
       'item_first_two_blocks_units', 'item_last_two_blocks_units',
       'item_fluctuation_units_first_last_blocks',
       'item_first_two_blocks_mean_price',
       'item_last_two_blocks_mean_price',
       'item_fluctuation_price_first_last_blocks',
       'item_share_of_total_units', 'item_share_of_total_turnover',
       'category_units', 'item_share_of_category_units',
       'category_turnover', 'item_share_of_category_turnover']

In [195]:
transactions_items

Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,day,month,year,item_category_id,y,turnover,item_first_block,item_last_block,is_first_two_blocks,is_last_two_blocks,subcategory,category_units,category_mean_units_block,category_day_units,category_mean_units_day,category_max_units_block,category_min_units_block,category_max_units_day,category_min_units_day,category_turnover,category_mean_turnover_block,category_day_turnover,category_mean_turnover_day,category_max_turnover_block,category_min_turnover_block,category_max_turnover_day,category_min_turnover_day,category_mean_price,category_min_price,category_max_price,category_first_two_blocks_units,category_last_two_blocks_units,category_fluctuation_units_first_last_blocks,category_first_two_blocks_mean_price,category_last_two_blocks_mean_price,category_fluctuation_price_first_last_blocks,video_game,gaming_old_gen,gaming_new_gen,pc_games,payment_cards,movies,movies_niche,books,music,music_CD,music_vinyl,gifts,software,subcategory_units,subcategory_mean_units_block,subcategory_day_units,subcategory_mean_units_day,subcategory_max_units_block,subcategory_min_units_block,subcategory_max_units_day,subcategory_min_units_day,subcategory_turnover,subcategory_mean_turnover_block,subcategory_day_turnover,subcategory_mean_turnover_day,subcategory_max_turnover_block,subcategory_min_turnover_block,subcategory_max_turnover_day,subcategory_min_turnover_day,category_share_of_total_units,category_share_of_total_turnover,subcategory_share_of_total_units,subcategory_share_of_total_turnover,subcategory_first_two_blocks_units,subcategory_last_two_blocks_units,subcategory_fluctuation_units_first_last_blocks,subcategory_first_two_blocks_mean_price,subcategory_last_two_blocks_mean_price,subcategory_fluctuation_price_first_last_blocks
0,0,01.09.2014,20,54,58,1,1,9,2014,40,1,58,20,20,False,True,Cinema - DVD,303281,15002.385742,348,556.335815,22065,6779,1472,128,82941753,4.084407e+06,94904,152254.968750,5848777,1927394,391497,34996,264.395996,11,1399,96528.0,23415,-75.742790,317.183877,257.053302,-18.957640,False,False,False,False,False,True,False,False,False,False,False,False,False,303281,15002.385742,348,556.335815,22065,6779.0,1472,128,82941753,4.084407e+06,94904,152254.968750,5848777,1927394,391497,34996,14.542552,1.833400,14.542552,1.833400,96528,23415,-75.742790,317.183877,257.053302,-18.957640
1,1,04.04.2014,15,55,4490,1,4,4,2014,76,2,4490,15,21,False,False,Software,4641,232.028351,12,8.861732,407,121,34,1,6574642,3.372164e+05,14388,13099.965820,558905,109064,93176,520,1642.940186,99,27900,685.0,817,19.270073,1667.660377,1998.585516,19.843678,False,False,False,False,False,False,False,False,False,False,False,False,True,38738,1891.442505,73,64.533173,2846,1049.0,146,16,66291241,3.219490e+06,124777,109660.703125,4667692,1899009,228112,22306,0.222539,0.301408,1.857516,1.070073,6214,3935,-36.675251,1642.332946,1942.891737,18.300722
2,1,02.04.2014,15,55,4490,1,2,4,2014,76,2,4490,15,21,False,False,Software,4641,232.028351,4,8.861732,407,121,34,1,6574642,3.372164e+05,7230,13099.965820,558905,109064,93176,520,1642.940186,99,27900,685.0,817,19.270073,1667.660377,1998.585516,19.843678,False,False,False,False,False,False,False,False,False,False,False,False,True,38738,1891.442505,61,64.533173,2846,1049.0,146,16,66291241,3.219490e+06,116552,109660.703125,4667692,1899009,228112,22306,0.222539,0.301408,1.857516,1.070073,6214,3935,-36.675251,1642.332946,1942.891737,18.300722
3,1,06.07.2014,18,55,4490,1,6,7,2014,76,1,4490,15,21,False,False,Software,4641,232.028351,6,8.861732,407,121,34,1,6574642,3.372164e+05,13439,13099.965820,558905,109064,93176,520,1642.940186,99,27900,685.0,817,19.270073,1667.660377,1998.585516,19.843678,False,False,False,False,False,False,False,False,False,False,False,False,True,38738,1891.442505,49,64.533173,2846,1049.0,146,16,66291241,3.219490e+06,80305,109660.703125,4667692,1899009,228112,22306,0.222539,0.301408,1.857516,1.070073,6214,3935,-36.675251,1642.332946,1942.891737,18.300722
4,1,04.08.2014,19,55,4490,1,4,8,2014,76,1,4490,15,21,False,False,Software,4641,232.028351,13,8.861732,407,121,34,1,6574642,3.372164e+05,16497,13099.965820,558905,109064,93176,520,1642.940186,99,27900,685.0,817,19.270073,1667.660377,1998.585516,19.843678,False,False,False,False,False,False,False,False,False,False,False,False,True,38738,1891.442505,59,64.533173,2846,1049.0,146,16,66291241,3.219490e+06,139122,109660.703125,4667692,1899009,228112,22306,0.222539,0.301408,1.857516,1.070073,6214,3935,-36.675251,1642.332946,1942.891737,18.300722
5,1,11.09.2014,20,55,4490,1,11,9,2014,76,1,4490,15,21,False,True,Software,4641,232.028351,7,8.861732,407,121,34,1,6574642,3.372164e+05,13640,13099.965820,558905,109064,93176,520,1642.940186,99,27900,685.0,817,19.270073,1667.660377,1998.585516,19.843678,False,False,False,False,False,False,False,False,False,False,False,False,True,38738,1891.442505,65,64.533173,2846,1049.0,146,16,66291241,3.219490e+06,124196,109660.703125,4667692,1899009,228112,22306,0.222539,0.301408,1.857516,1.070073,6214,3935,-36.675251,1642.332946,1942.891737,18.300722
6,1,20.10.2014,21,55,4490,1,20,10,2014,76,1,4490,15,21,False,True,Software,4641,232.028351,7,8.861732,407,121,34,1,6574642,3.372164e+05,14419,13099.965820,558905,109064,93176,520,1642.940186,99,27900,685.0,817,19.270073,1667.660377,1998.585516,19.843678,False,False,False,False,False,False,False,False,False,False,False,False,True,38738,1891.442505,51,64.533173,2846,1049.0,146,16,66291241,3.219490e+06,118434,109660.703125,4667692,1899009,228112,22306,0.222539,0.301408,1.857516,1.070073,6214,3935,-36.675251,1642.332946,1942.891737,18.300722
7,2,24.08.2014,19,54,58,1,24,8,2014,40,1,58,19,22,False,False,Cinema - DVD,303281,15002.385742,632,556.335815,22065,6779,1472,128,82941753,4.084407e+06,182757,152254.968750,5848777,1927394,391497,34996,264.395996,11,1399,96528.0,23415,-75.742790,317.183877,257.053302,-18.957640,False,False,False,False,False,True,False,False,False,False,False,False,False,303281,15002.385742,632,556.335815,22065,6779.0,1472,128,82941753,4.084407e+06,182757,152254.968750,5848777,1927394,391497,34996,14.542552,1.833400,14.542552,1.833400,96528,23415,-75.742790,317.183877,257.053302,-18.957640
8,2,12.11.2014,22,54,58,1,12,11,2014,40,1,58,19,22,False,True,Cinema - DVD,303281,15002.385742,267,556.335815,22065,6779,1472,128,82941753,4.084407e+06,69958,152254.968750,5848777,1927394,391497,34996,264.395996,11,1399,96528.0,23415,-75.742790,317.183877,257.053302,-18.957640,False,False,False,False,False,True,False,False,False,False,False,False,False,303281,15002.385742,267,556.335815,22065,6779.0,1472,128,82941753,4.084407e+06,69958,152254.968750,5848777,1927394,391497,34996,14.542552,1.833400,14.542552,1.833400,96528,23415,-75.742790,317.183877,257.053302,-18.957640
9,3,05.07.2014,18,54,100,1,5,7,2014,40,1,100,18,19,False,True,Cinema - DVD,303281,15002.385742,630,556.335815,22065,6779,1472,128,82941753,4.084407e+06,178180,152254.968750,5848777,1927394,391497,34996,264.395996,11,1399,96528.0,23415,-75.742790,317.183877,257.053302,-18.957640,False,False,False,False,False,True,False,False,False,False,False,False,False,303281,15002.385742,630,556.335815,22065,6779.0,1472,128,82941753,4.084407e+06,178180,152254.968750,5848777,1927394,391497,34996,14.542552,1.833400,14.542552,1.833400,96528,23415,-75.742790,317.183877,257.053302,-18.957640


In [221]:
#del training
transactions_items = pd.read_pickle("pickled/transactions_items")
training = pd.merge(, transactions_items[transactions_items_columns].drop_duplicates('item_id'), on=['item_id'], how='left', copy=False)

del transactions_items
gc.collect()

35

In [223]:
transactions_items = pd.read_pickle("pickled/transactions_items")
transactions_items.item_longest_stretch_block_with_activity.dtype

dtype('uint8')

In [73]:
transactions_items_blocks_columns = ['item_id', 'date_block_num', 'item_block_units', 'item_block_turnover', 'item_mean_price_block']

In [74]:
gc.collect()
training = pd.merge(training, transactions_items_blocks[transactions_items_blocks_columns]\
                    .drop_duplicates(['item_id', 'date_block_num']), on=['item_id','date_block_num'], how='left', copy=False)

In [267]:
a = pd.merge(transactions_items[transactions_items_columns]\
                    .drop_duplicates(['item_id']),all_combos, on=['item_id'], how='left', copy=False)

In [270]:
len(a)

7984658

In [261]:
training = pd.DataFrame()
for column in transactions_items_columns:
    training[column]=np.nan
    training[column]=.astype(transactions_items_columns[column].dtype)



TypeError: list indices must be integers or slices, not str

In [256]:

training['item_first_day'] = pd.to_numeric(training['item_first_day'].astype(int),downcast='unsigned', errors='coerce')


ValueError: Cannot convert non-finite values (NA or inf) to integer

In [249]:
training['item_first_day'].dtype

dtype('float32')

In [257]:
transactions_items['item_first_day'].dtype

dtype('uint16')

In [253]:
np.nan.as_type('int')

AttributeError: 'float' object has no attribute 'as_type'

In [77]:
transactions_categories_columns = [
       'item_category_id',
       'category_units', 'category_mean_units_block',
       'category_day_units', 'category_mean_units_day',
       'category_max_units_block', 'category_min_units_block',
       'category_max_units_day', 'category_min_units_day',
       'category_turnover', 'category_mean_turnover_block',
       'category_day_turnover', 'category_mean_turnover_day',
       'category_max_turnover_block', 'category_min_turnover_block',
       'category_max_turnover_day', 'category_min_turnover_day',
       'category_mean_price', 'category_min_price', 'category_max_price',
       'category_first_two_blocks_units',
       'category_last_two_blocks_units',
       'category_fluctuation_units_first_last_blocks',
       'category_first_two_blocks_mean_price',
       'category_last_two_blocks_mean_price',
       'category_fluctuation_price_first_last_blocks', 'subcategory',
       'video_game', 'gaming_old_gen', 'gaming_new_gen', 'pc_games',
       'payment_cards', 'movies', 'movies_niche', 'books', 'music',
       'music_CD', 'music_vinyl', 'gifts', 'software',
       'subcategory_units',
       'subcategory_mean_units_block', 'subcategory_day_units',
       'subcategory_mean_units_day', 'subcategory_max_units_block',
       'subcategory_min_units_block', 'subcategory_max_units_day',
       'subcategory_min_units_day', 'subcategory_turnover', 'subcategory_mean_turnover_block',
       'subcategory_day_turnover', 'subcategory_mean_turnover_day',
       'subcategory_max_turnover_block', 'subcategory_min_turnover_block',
       'subcategory_max_turnover_day', 'subcategory_min_turnover_day',
       'category_share_of_total_units',
       'category_share_of_total_turnover',
       'subcategory_share_of_total_units',
       'subcategory_share_of_total_turnover',
       'subcategory_first_two_blocks_units',
       'subcategory_last_two_blocks_units',
       'subcategory_fluctuation_units_first_last_blocks',
       'subcategory_first_two_blocks_mean_price',
       'subcategory_last_two_blocks_mean_price',
       'subcategory_fluctuation_price_first_last_blocks']


In [78]:
gc.collect()
training = pd.merge(training, transactions_categories[transactions_categories_columns]\
                    .drop_duplicates('item_category_id'), on=['item_category_id'], how='left', copy=False)

In [79]:
transactions_categories_blocks_columns = ['item_category_id', 'date_block_num', 'category_block_units',
 'category_block_turnover',
 'category_mean_price_block', 'subcategory_block_units',
 'subcategory_block_turnover',
 'subcategory_mean_price_block']

In [80]:
gc.collect()
training = pd.merge(training, transactions_categories_blocks[transactions_categories_blocks_columns]\
                    .drop_duplicates(['item_category_id', 'date_block_num']), on=['item_category_id','date_block_num'], how='left', copy=False)

In [83]:
transactions_shops_columns = ['shop_id', 
       'shop_units', 'shop_mean_units_block', 'shop_day_units',
       'shop_mean_units_day', 'shop_max_units_block',
       'shop_min_units_block', 'shop_max_units_day', 'shop_min_units_day',
       'shop_turnover', 'shop_mean_turnover_block', 'shop_day_turnover',
       'shop_mean_turnover_day', 'shop_max_turnover_block',
       'shop_min_turnover_block', 'shop_max_turnover_day',
       'shop_min_turnover_day', 'shop_mean_price',
       'shop_first_two_blocks_units', 'shop_last_two_blocks_units',
       'shop_fluctuation_units_first_last_blocks',
       'shop_first_two_blocks_mean_price',
       'shop_last_two_blocks_mean_price',
       'shop_fluctuation_price_first_last_blocks', 'shop_share_of_units',
       'shop_share_of_turnover', 'shop_TC', 'shop_TRK', 'shop_SEC',
       'shop_shopping_center', 'shop_moscow', 'max_category_units',
       'max_category_turnover', 'area', 'area_units',
       'area_mean_units_block', 'area_day_units', 'area_mean_units_day',
       'area_max_units_block', 'area_min_units_block',
       'area_max_units_day', 'area_min_units_day', 'area_turnover',
     'area_mean_turnover_block',
       'area_day_turnover', 'area_mean_turnover_day',
       'area_max_turnover_block', 'area_min_turnover_block',
       'area_max_turnover_day', 'area_min_turnover_day',
       'area_mean_price',
       'area_first_two_blocks_units', 'area_last_two_blocks_units',
       'area_fluctuation_units_first_last_blocks',
       'area_first_two_blocks_mean_price',
       'area_last_two_blocks_mean_price',
       'area_fluctuation_price_first_last_blocks']

In [84]:
gc.collect()
training = pd.merge(training, transactions_shops[transactions_shops_columns]\
                    .drop_duplicates('shop_id'), on=['shop_id'], how='left', copy=False)

In [85]:
transactions_shops_blocks_columns = ['shop_id', 'date_block_num',  'shop_block_units',
 'shop_block_turnover',
 'shop_mean_price_block', 'area_block_units',
 'area_block_turnover',
 'area_mean_price_block']

In [86]:
gc.collect()
training = pd.merge(training, transactions_shops_blocks[transactions_shops_blocks_columns]\
                    .drop_duplicates(['shop_id', 'date_block_num']), on=['shop_id', 'date_block_num'], how='left', copy=False)

In [87]:
transactions_shops_categories_columns = [ 'shop_id', 
       'item_category_id',
       'shop_category_units', 'shop_category_mean_units_block',
       'shop_category_day_units', 'shop_category_mean_units_day',
       'shop_category_max_units_block', 'shop_category_min_units_block',
       'shop_category_max_units_day', 'shop_category_min_units_day',
       'shop_category_turnover', 'shop_category_mean_turnover_block',
       'shop_category_day_turnover', 'shop_category_mean_turnover_day',
       'shop_category_max_turnover_block',
       'shop_category_min_turnover_block',
       'shop_category_max_turnover_day', 'shop_category_min_turnover_day',
       'shop_category_mean_price']

In [88]:
gc.collect()
training = pd.merge(training, transactions_shops_categories[transactions_shops_categories_columns]\
                    .drop_duplicates(['shop_id','item_category_id']), on=['shop_id','item_category_id'], how='left', copy=False)

In [89]:
transactions_shops_categories_blocks_columns = ['shop_id', 'item_category_id', 'date_block_num',   'shop_category_block_units',
 'shop_category_block_turnover',
 'shop_category_mean_price_block']

In [90]:
gc.collect()
training = pd.merge(training, transactions_shops_categories_blocks[transactions_shops_categories_blocks_columns]\
                    .drop_duplicates(['shop_id','item_category_id','date_block_num']), \
                    on=['shop_id','item_category_id','date_block_num'], how='left', copy=False)

In [91]:
len(training)

8333930

In [92]:
gc.collect()
training = pd.merge(training, transactions[['item_id','shop_id','date_block_num','y']]\
                    .drop_duplicates(['item_id','shop_id','date_block_num']),\
                    on=['item_id','shop_id','date_block_num'], how='left', copy=False)

In [93]:
gc.collect()
training.fillna(0,inplace=True)

In [1]:
import pickle as pickle

#pickle.dump(training, open( "training", "wb"), protocol=4)

training = pickle.load( open( "training", "rb" ) )

In [94]:
lag_columns = [
 'item_block_units',
 'item_block_turnover',
 'item_mean_price_block',
 'category_block_units',
 'category_block_turnover',
 'category_mean_price_block',
 'subcategory_block_units',
 'subcategory_block_turnover',
 'subcategory_mean_price_block',
 'shop_block_units',
 'shop_block_turnover',
 'shop_mean_price_block',
 'area_block_units',
 'area_block_turnover',
 'area_mean_price_block',
 'shop_category_block_units',
 'shop_category_turnover',
 'shop_category_mean_price_block'
 ]

In [97]:
lags = [1,2,3]

In [100]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

lag_columns = [
 'item_block_units',
 'item_block_turnover',
 'item_mean_price_block',
 ]

merge_columns = ['lagged_block','item_id']

for lag in lags:
    print(lag)
    lagged = training[['date_block_num', 'item_id']+lag_columns].copy()
    lagged.rename(columns={'date_block_num':'lagged_block'},inplace=True)
    training['lagged_block'] = training['date_block_num'] - lag
    lagged_names = [lagged_name(c,lag) for c in lag_columns]
    lag_mapping = dict(zip(lag_columns, lagged_names))
    lagged.rename(columns=lag_mapping,inplace=True)
    training = pd.merge(training,lagged[lagged_names+merge_columns],on=merge_columns,how='left')
    del lagged
    gc.collect()

1


MemoryError: 

In [None]:
TODO: month & year 1hot