In [375]:
# Stretch the notebook container to the full browser width so wide DataFrames are readable.
# `display` and `HTML` are imported from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [376]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb

In [377]:
# Load the raw tables from the current working directory.
# The two larger tables are read straight from their gzip archives.
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [378]:
# NOTE: this binds a second name to the same DataFrame (no copy), so the columns added
# on the next line also appear on `sales_train`.
transactions = sales_train
# Split the 'dd.mm.yyyy' date strings into integer day / month / year columns.
transactions[['day','month', 'year']] = transactions['date'].str.split('.', expand=True).astype(int)


In [379]:
# Attach the item metadata (item_name, item_category_id) to every transaction via item_id.
transactions = transactions.set_index('item_id').join(items.set_index('item_id')).reset_index()
# Keep everything except 2013. The explicit copy makes the filtered frame independent, so the
# column assignments below (and in later cells) neither raise SettingWithCopyWarning nor risk
# writing to a temporary view.
transactions = transactions[transactions['year'] != 2013].copy()
# Target: units sold per (month block, item, shop), clipped to the range [0, 20].
transactions['y'] = transactions.groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day'].transform('sum').clip(0,20)


In [380]:
# Sanity check: row count after the 2013 filter, followed by a preview of the joined frame.
print(len(transactions))
transactions.head()

1668287


Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,day,month,year,item_name,item_category_id,y
0,0,01.09.2014,20,54,58.0,1.0,1,9,2014,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,1.0
1,1,04.04.2014,15,55,4490.0,1.0,4,4,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,2.0
2,1,02.04.2014,15,55,4490.0,1.0,2,4,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,2.0
3,1,06.07.2014,18,55,4490.0,1.0,6,7,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,1.0
4,1,04.08.2014,19,55,4490.0,1.0,4,8,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,1.0


In [381]:
# Row count of the filtered transactions frame (same value as printed in the previous cell).
len(transactions)

1668287

In [382]:
# Money moved by each transaction row: unit price times units sold that day.
transactions['turnover'] = transactions['item_price'] * transactions['item_cnt_day']

In [383]:
# First and last month block in which each item appears, broadcast to every row of the item.
transactions['item_first_block'] = transactions.groupby('item_id')['date_block_num'].transform('min')
transactions['item_last_block'] = transactions.groupby('item_id')['date_block_num'].transform('max')

# Row-wise window flags. The comparison must be element-wise: Series.isin() with a list of two
# Series tests membership against those two Series *objects*, not against each row's own
# first/last block. Working on the per-row offset makes the check row-specific.
# "First two" deliberately means the 2nd and 3rd block after the first appearance
# (offsets 1 and 2); "last two" means the final block and the one before it (offsets 0 and 1).
blocks_after_first = transactions['date_block_num'] - transactions['item_first_block']
blocks_before_last = transactions['item_last_block'] - transactions['date_block_num']
transactions['is_first_two_blocks'] = blocks_after_first.isin([1, 2])
transactions['is_last_two_blocks'] = blocks_before_last.isin([0, 1])


In [384]:
# Global summary figures for the filtered transactions; total_sales and total_turnover are
# reused later as denominators for the "share of total" features.
number_of_items = transactions['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = transactions['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = transactions['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
# Hard-coded: two 365-day years minus one 30-day and one 31-day month.
# NOTE(review): presumably the last two months of the second year are not in the data - confirm.
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = transactions['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = transactions['item_cnt_day'].sum()
print("total_sales:", total_sales)
total_turnover = transactions['turnover'].sum()
print("total_turnover:", total_turnover)
# Unweighted mean over transaction rows (not weighted by units sold).
average_price = transactions['item_price'].mean()
print("average_price:", average_price)

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
total_turnover: 2181401610.589987
average_price: 1015.5023073770728


#ITEM

-UNITS
item_units
item_block_units
item_mean_units_block
item_day_units
item_mean_units_day
item_max_units_block
item_min_units_block
item_max_units_day
item_min_units_day

-TURNOVER
item_turnover
item_block_turnover
item_mean_turnover_block
item_day_turnover
item_mean_turnover_day
item_max_turnover_block
item_min_turnover_block
item_max_turnover_day
item_min_turnover_day


-TIME
item_days_of_activity
item_blocks_of_activity
item_mean_day_between_activity
item_longest_stretch_days_without_activity
item_longest_stretch_blocks_without_activity
item_longest_stretch_block_with_activity
item_number_of_consecutive_days_with_activity
item_days_between_start_and_first_activity
item_blocks_between_start_and_first_activity
item_first_block
item_last_block
item_first_day
item_last_day
item_activity_on_all_blocks


-PRICE
item_mean_price
item_mean_price_block
item_min_price
item_max_price
item_number_different_prices
item_price_amplitude ((max price - min price) as a percentage of the min price)
item_deviation_mean_category_price


-TREND
is_first_two_blocks (the first two *full* blocks: actually the second and third blocks, to make sure we have a "full" block if the item was a new release)
is_last_two_blocks
item_first_two_blocks_units
item_last_two_blocks_units
item_fluctuation_units_first_last_blocks
item_first_two_blocks_mean_price
item_last_two_blocks_mean_price
item_fluctuation_price_first_last_blocks

-ENCODINGS
item_share_of_total_units
item_share_of_total_turnover
item_share_of_category_units
item_share_of_category_turnover

In [439]:
# Free memory, then take two independent working copies for the item-level features:
# one for whole-history aggregates and one for per-(item, block) aggregates.
gc.collect()
transactions_items = transactions.copy()
transactions_items_blocks = transactions.copy()

In [440]:
# Per-(item, month block) aggregates, broadcast back onto every transaction row.
# Aggregations are named by string: pandas already dispatches np.sum / np.mean to its own
# group-wise implementations, and newer releases emit a FutureWarning for the callable form.
transactions_items_blocks['item_block_units'] = transactions_items_blocks.groupby(['item_id','date_block_num'])['item_cnt_day'].transform('sum')
transactions_items_blocks['item_block_turnover'] = transactions_items_blocks.groupby(['item_id','date_block_num'])['turnover'].transform('sum')
transactions_items_blocks['item_mean_price_block'] = transactions_items_blocks.groupby(['item_id', 'date_block_num'])['item_price'].transform('mean')

In [443]:
# Item-level unit features, broadcast to every row of the item.
# String aggregation names are used instead of NumPy callables (deprecated in groupby.transform).
# Block-level inputs come from transactions_items_blocks, which shares this frame's index.
# NOTE(review): the "mean" features average over transaction rows, so blocks/days with more
# rows weigh more than quiet ones - confirm this weighting is intended.
transactions_items['item_units'] = transactions_items.groupby(['item_id'])['item_cnt_day'].transform('sum')
transactions_items['item_mean_units_block'] = transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform('mean')
transactions_items['item_day_units'] = transactions_items.groupby(['item_id','date'])['item_cnt_day'].transform('sum')
transactions_items['item_mean_units_day'] = transactions_items.groupby(['item_id'])['item_day_units'].transform('mean')
transactions_items['item_max_units_block'] = transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform('max')
transactions_items['item_min_units_block'] = transactions_items_blocks.groupby(['item_id'])['item_block_units'].transform('min')
transactions_items['item_max_units_day'] = transactions_items.groupby(['item_id'])['item_day_units'].transform('max')
transactions_items['item_min_units_day'] = transactions_items.groupby(['item_id'])['item_day_units'].transform('min')

In [444]:
# Item-level turnover features, mirroring the unit features cell.
# String aggregation names are used instead of NumPy callables (deprecated in groupby.transform).
transactions_items['item_turnover'] = transactions_items.groupby(['item_id'])['turnover'].transform('sum')
transactions_items['item_mean_turnover_block'] = transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform('mean')
transactions_items['item_day_turnover'] = transactions_items.groupby(['item_id','date'])['turnover'].transform('sum')
# Fixed: average the per-day turnover (item_day_turnover), not the per-row turnover, so this
# feature is built the same way as item_mean_units_day and category_mean_turnover_day.
transactions_items['item_mean_turnover_day'] = transactions_items.groupby(['item_id'])['item_day_turnover'].transform('mean')
transactions_items['item_max_turnover_block'] = transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform('max')
transactions_items['item_min_turnover_block'] = transactions_items_blocks.groupby(['item_id'])['item_block_turnover'].transform('min')
transactions_items['item_max_turnover_day'] = transactions_items.groupby(['item_id'])['item_day_turnover'].transform('max')
transactions_items['item_min_turnover_day'] = transactions_items.groupby(['item_id'])['item_day_turnover'].transform('min')

In [445]:
# Number of distinct dates / distinct month blocks on which each item has at least one row.
transactions_items['item_days_of_activity'] = transactions_items.groupby(['item_id'])['date'].transform("nunique")
transactions_items['item_blocks_of_activity'] = transactions_items.groupby(['item_id'])['date_block_num'].transform("nunique")

def get_number_of_days_since_start(day,month, year):
    """Return the 1-based day number of a calendar date, counted from 1 January 2014.

    1 January 2014 maps to 1, 31 December 2014 to 365 and 1 January 2015 to 366.
    Dates before 2014 yield zero or negative numbers.

    The previous hand-rolled arithmetic assumed every pair of months is 61 days long and
    *subtracted* the day of month in even months, so day numbers ran backwards inside
    February, April, ... and collided with neighbouring months. Real calendar arithmetic
    keeps the result strictly increasing with the date.

    Parameters
    ----------
    day, month, year : int-like
        Calendar components; they are coerced with ``int`` so NumPy integers are accepted.

    Returns
    -------
    int
        Number of days elapsed since 31 December 2013.

    Raises
    ------
    ValueError
        If the components do not form a valid calendar date.
    """
    from datetime import date  # local import: the notebook's import cell does not provide it

    return (date(int(year), int(month), int(day)) - date(2014, 1, 1)).days + 1

# Day number of every transaction, computed with get_number_of_days_since_start.
# The three columns are walked in lockstep instead of using DataFrame.apply(axis=1): apply
# builds a full Series object for every row, which dominates the runtime on a frame this
# large, while the helper only needs three scalars per row.
transactions_items['item_days_since_start'] = [
    get_number_of_days_since_start(day, month, year)
    for day, month, year in zip(transactions_items['day'], transactions_items['month'], transactions_items['year'])
]

def get_average_days_between_sales(days):
    """Return the mean gap between distinct activity days, divided by the number of distinct days.

    Duplicates in ``days`` are ignored. Two sentinels cover the cases where no gap exists:
    9999 for an empty input and 999 for a single distinct day.

    NOTE(review): the mean gap is additionally divided by the count of distinct days, so the
    value is not a plain "average days between sales" - confirm this scaling is intended.
    """
    distinct_days = np.unique(days)  # np.unique already returns the values in ascending order
    day_count = len(distinct_days)
    if day_count == 0:
        return 9999
    if day_count == 1:
        return 999
    gaps = np.ediff1d(distinct_days)
    return np.mean(gaps) / day_count

# Collect each item's activity days into a list, reduce every list to a single number with
# get_average_days_between_sales, then broadcast the per-item value back through item_id.
activity_days_per_item = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list)
average_days_between_sales = activity_days_per_item.apply(get_average_days_between_sales)

transactions_items['item_mean_day_between_activity'] = transactions_items['item_id'].map(average_days_between_sales)


def get_max_stretch_without_sales_days(days):
    """Return the largest difference between two consecutive distinct activity days.

    Duplicates are ignored and the input does not need to be sorted. With fewer than two
    distinct days there is no gap to measure and 0 is returned; previously an empty input
    fell off the end of the loop and returned ``None``.

    Note that two directly adjacent days give a difference of 1, not 0.
    """
    distinct_days = np.unique(days)  # sorted and de-duplicated
    if len(distinct_days) < 2:
        return 0
    return np.diff(distinct_days).max()
            

        
# Longest gap (in days) between consecutive activity days of each item, broadcast via item_id.
item_activity_days = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list)
max_stretch_without_sales_day = item_activity_days.apply(get_max_stretch_without_sales_days)

transactions_items['item_longest_stretch_days_without_activity'] = transactions_items['item_id'].map(max_stretch_without_sales_day)

In [446]:
gc.collect()

def get_max_stretch_without_sales_block(blocks):
    """Return the largest difference between two consecutive distinct active month blocks.

    Duplicates are ignored and the input does not need to be sorted. With fewer than two
    distinct blocks there is no gap to measure and 0 is returned; previously an empty input
    fell off the end of the loop and returned ``None``.

    Note that two directly adjacent blocks give a difference of 1, not 0.
    """
    distinct_blocks = np.unique(blocks)  # sorted and de-duplicated
    if len(distinct_blocks) < 2:
        return 0
    return np.diff(distinct_blocks).max()
            

        
# Longest gap (in month blocks) between consecutive active blocks of each item, broadcast via item_id.
item_active_blocks = transactions_items.groupby(['item_id'])['date_block_num'].apply(list)
item_longest_stretch_blocks_without_activity = item_active_blocks.apply(get_max_stretch_without_sales_block)

transactions_items['item_longest_stretch_blocks_without_activity'] = transactions_items['item_id'].map(item_longest_stretch_blocks_without_activity)



def get_longest_stretch(following_pairs, n=1,new_n=1):
    """Return the length of the longest chain of linked pairs.

    ``following_pairs`` is a sequence of ``[a, b]`` pairs. Two neighbouring pairs are linked
    when the second pair starts where the first one ends (``next[0] == previous[1]``). The
    result is the number of pairs in the longest run of linked neighbours: 0 for an empty
    sequence, at least 1 otherwise.

    ``n`` (best run so far) and ``new_n`` (run in progress) are kept for backward
    compatibility with the former recursive implementation and normally stay at 1.

    The scan is iterative: the recursive version copied the tail of the list on every step
    (quadratic work) and raised RecursionError once the input exceeded the interpreter's
    recursion limit.
    """
    if len(following_pairs) == 0:
        return 0
    best_run, current_run = n, new_n
    for previous_pair, next_pair in zip(following_pairs, following_pairs[1:]):
        if next_pair[0] == previous_pair[1]:
            current_run += 1
        else:
            # Chain broken: remember the finished run and start counting a new one.
            best_run = max(best_run, current_run)
            current_run = 1
    return max(best_run, current_run)


# Inline regression checks for get_longest_stretch: empty input, a leading isolated pair,
# ties between equally long chains, and the longest chain at the start / middle / end.
assert(get_longest_stretch([]) == 0)
assert(get_longest_stretch([[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[-1, 0],[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[20, 21], [25,26]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[14, 15], [15,16]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[14, 15], [15,16],[18,19] ,[22,23], [23,24],[24, 25]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5], [7, 8]]) == 1)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [15, 16]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [15, 16], [16, 17]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [14, 15], [15, 16], [16, 17]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [20, 21], [21, 22], [22,23],[23,24]]) == 4)


def get_following_pairs(pairs):
    """Return every ``[value, value + 1]`` pair of directly consecutive distinct values.

    The input (despite its name, a flat collection of numbers) is de-duplicated and sorted
    first, so order and repeats do not matter. The pairs are returned in ascending order.

    An empty or single-value input yields an empty list. Previously an empty input fell off
    the end of the loop and returned ``None``, which broke callers that take ``len()`` of
    the result.
    """
    values = np.unique(pairs)  # sorted and de-duplicated
    return [
        [current, following]
        for current, following in zip(values[:-1], values[1:])
        if following == current + 1
    ]
        
# Inline regression checks for get_following_pairs: only directly consecutive values form a
# pair, and isolated values (e.g. the trailing 15 or 10) contribute nothing.
assert(get_following_pairs([1,2,5,6,7,8,9,11,12,15]) == [[1, 2], [5, 6], [6, 7], [7, 8], [8, 9], [11, 12]])
assert(get_following_pairs([1,2,5,6,7,10]) == [[1, 2], [5, 6], [6, 7]])
assert(get_following_pairs([1,2,4,5,7,9,10]) == [[1, 2], [4, 5], [9,10]])
assert(get_following_pairs([1,2,4,5,7,9,10,11,12,15]) == [[1, 2], [4, 5], [9,10],[10,11],[11,12]])


# Longest chain of consecutive active month blocks per item (counted in linked pairs),
# broadcast back to every row through item_id.
blocks_per_item = transactions_items.groupby(['item_id'])['date_block_num'].apply(list)
item_longest_stretch_block_with_activity = blocks_per_item.apply(
    lambda block_list: get_longest_stretch(get_following_pairs(block_list))
)

transactions_items['item_longest_stretch_block_with_activity'] = transactions_items['item_id'].map(item_longest_stretch_block_with_activity)


# Number of (day, next day) pairs on which the item was active on both days.
days_per_item = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list)
item_number_of_consecutive_days_with_activity = days_per_item.apply(
    lambda day_list: len(get_following_pairs(day_list))
)

transactions_items['item_number_of_consecutive_days_with_activity'] = transactions_items['item_id'].map(item_number_of_consecutive_days_with_activity)

In [447]:
def get_units_between_first_and_last(units):
    """Return the spread of ``units``: its largest value minus its smallest value."""
    values = np.asarray(units)
    return values.max() - values.min()

# Span between each item's earliest and latest activity, in days and in month blocks.
# NOTE(review): the stored column names say "between start and first activity", but the value
# computed here is last activity minus first activity - confirm which one is wanted.
span_days_per_item = transactions_items.groupby(['item_id'])['item_days_since_start'].apply(list)
item_days_between_start_and_first_activity = span_days_per_item.apply(get_units_between_first_and_last)
transactions_items['item_days_between_start_and_first_activity'] = transactions_items['item_id'].map(item_days_between_start_and_first_activity)

span_blocks_per_item = transactions_items.groupby(['item_id'])['date_block_num'].apply(list)
item_blocks_between_start_and_first_activity = span_blocks_per_item.apply(get_units_between_first_and_last)
transactions_items['item_blocks_between_start_and_first_activity'] = transactions_items['item_id'].map(item_blocks_between_start_and_first_activity)

In [448]:

# First and last activity day of each item, broadcast to every row of the item.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_items['item_first_day'] = transactions_items.groupby('item_id')['item_days_since_start'].transform('min')
transactions_items['item_last_day'] = transactions_items.groupby('item_id')['item_days_since_start'].transform('max')

# True when the item has at least one row in every month block present in the data.
item_activity_on_all_blocks = transactions_items.groupby('item_id')['date_block_num'].nunique().eq(number_of_blocks)
transactions_items['item_activity_on_all_blocks'] = transactions_items['item_id'].map(item_activity_on_all_blocks)

In [449]:
# Item-level price features; means are unweighted over transaction rows.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_items['item_mean_price'] = transactions_items.groupby('item_id')['item_price'].transform('mean')
transactions_items['item_min_price'] = transactions_items.groupby('item_id')['item_price'].transform('min')
transactions_items['item_max_price'] = transactions_items.groupby('item_id')['item_price'].transform('max')
transactions_items['item_number_different_prices'] = transactions_items.groupby('item_id')['item_price'].transform('nunique')
# Price range as a percentage of the minimum price (inf / NaN if the minimum price is 0).
transactions_items['item_price_amplitude'] = ((transactions_items['item_max_price'] - transactions_items['item_min_price'] ) / transactions_items['item_min_price']) * 100
transactions_items['category_mean_price'] = transactions_items.groupby('item_category_id')['item_price'].transform('mean')
# How far the item's mean price sits above (+) or below (-) its category's mean price, in percent.
transactions_items['item_deviation_mean_category_price'] =  ((transactions_items['item_mean_price'] - transactions_items['category_mean_price'] ) / transactions_items['category_mean_price']) * 100

In [450]:
# Transactions that fall inside the early / late block windows flagged on each row.
item_early_rows = transactions_items[transactions_items['is_first_two_blocks']]
item_late_rows = transactions_items[transactions_items['is_last_two_blocks']]

# Units sold per item inside each window; items without rows in a window map to NaN.
item_first_two_blocks_units = item_early_rows.groupby('item_id')['item_cnt_day'].sum()
transactions_items['item_first_two_blocks_units'] = transactions_items['item_id'].map(item_first_two_blocks_units)

item_last_two_blocks_units = item_late_rows.groupby('item_id')['item_cnt_day'].sum()
transactions_items['item_last_two_blocks_units'] = transactions_items['item_id'].map(item_last_two_blocks_units)

# Percentage change from the early window to the late window (positive = growth).
early_units = transactions_items['item_first_two_blocks_units']
late_units = transactions_items['item_last_two_blocks_units']
transactions_items['item_fluctuation_units_first_last_blocks'] = ((early_units - late_units) / early_units) * 100 * -1


# Same construction for the mean transaction price inside each window.
item_first_two_blocks_mean_price = item_early_rows.groupby('item_id')['item_price'].mean()
transactions_items['item_first_two_blocks_mean_price'] = transactions_items['item_id'].map(item_first_two_blocks_mean_price)

item_last_two_blocks_mean_price = item_late_rows.groupby('item_id')['item_price'].mean()
transactions_items['item_last_two_blocks_mean_price'] = transactions_items['item_id'].map(item_last_two_blocks_mean_price)

early_price = transactions_items['item_first_two_blocks_mean_price']
late_price = transactions_items['item_last_two_blocks_mean_price']
transactions_items['item_fluctuation_price_first_last_blocks'] = ((early_price - late_price) / early_price) * 100 * -1

In [451]:
# Item shares (in percent) of the global totals and of the item's own category totals.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_items['item_share_of_total_units'] = transactions_items['item_units'] * 100 / total_sales

transactions_items['item_share_of_total_turnover'] = transactions_items['item_turnover'] * 100 / total_turnover

transactions_items['category_units'] = transactions_items.groupby('item_category_id')['item_cnt_day'].transform('sum')
transactions_items['item_share_of_category_units'] = transactions_items['item_units'] * 100 / transactions_items['category_units']

transactions_items['category_turnover'] = transactions_items.groupby('item_category_id')['turnover'].transform('sum')
transactions_items['item_share_of_category_turnover'] = transactions_items['item_turnover'] * 100 / transactions_items['category_turnover']


#CATEGORY

-UNITS
category_units
category_block_units
category_mean_units_block
category_day_units
category_mean_units_day
category_max_units_block
category_min_units_block
category_max_units_day
category_min_units_day

-TURNOVER
category_turnover
category_block_turnover
category_mean_turnover_block
category_day_turnover
category_mean_turnover_day
category_max_turnover_block
category_min_turnover_block
category_max_turnover_day
category_min_turnover_day


-PRICE
category_mean_price
category_mean_price_block
category_min_price
category_max_price


-TREND
category_first_two_blocks_units
category_last_two_blocks_units
category_fluctuation_units_first_last_blocks
category_first_two_blocks_mean_price
category_last_two_blocks_mean_price
category_fluctuation_price_first_last_blocks

-SUBCATEGORY
subcategory
subcategory 1hot

-UNITS
subcategory_units
subcategory_block_units
subcategory_mean_units_block
subcategory_day_units
subcategory_mean_units_day
subcategory_max_units_block
subcategory_min_units_block
subcategory_max_units_day
subcategory_min_units_day

-TURNOVER
subcategory_turnover
subcategory_block_turnover
subcategory_mean_turnover_block
subcategory_day_turnover
subcategory_mean_turnover_day
subcategory_max_turnover_block
subcategory_min_turnover_block
subcategory_max_turnover_day
subcategory_min_turnover_day

-ENCODINGS
category_share_of_total_units
category_share_of_total_turnover
subcategory_share_of_total_units
subcategory_share_of_total_turnover

-TREND
subcategory_first_two_blocks_units
subcategory_last_two_blocks_units
subcategory_fluctuation_units_first_last_blocks
subcategory_first_two_blocks_mean_price
subcategory_last_two_blocks_mean_price
subcategory_fluctuation_price_first_last_blocks

In [452]:
# Free memory, then take two independent working copies for the category-level features:
# one for whole-history aggregates and one for per-(category, block) aggregates.
gc.collect()
transactions_categories = transactions.copy()
transactions_categories_blocks = transactions.copy()

In [453]:
# Per-(category, month block) aggregates, broadcast back onto every transaction row.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_categories_blocks['category_block_units'] = transactions_categories_blocks.groupby(['item_category_id','date_block_num'])['item_cnt_day'].transform('sum')
transactions_categories_blocks['category_block_turnover'] = transactions_categories_blocks.groupby(['item_category_id','date_block_num'])['turnover'].transform('sum')
transactions_categories_blocks['category_mean_price_block'] = transactions_categories_blocks.groupby(['item_category_id', 'date_block_num'])['item_price'].transform('mean')


In [454]:
# Category-level unit features, broadcast to every row of the category.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
# Block-level inputs come from transactions_categories_blocks, which shares this frame's index.
transactions_categories['category_units'] = transactions_categories.groupby(['item_category_id'])['item_cnt_day'].transform('sum')
transactions_categories['category_mean_units_block'] = transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform('mean')
transactions_categories['category_day_units'] = transactions_categories.groupby(['item_category_id','date'])['item_cnt_day'].transform('sum')
transactions_categories['category_mean_units_day'] = transactions_categories.groupby(['item_category_id'])['category_day_units'].transform('mean')
transactions_categories['category_max_units_block'] = transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform('max')
transactions_categories['category_min_units_block'] = transactions_categories_blocks.groupby(['item_category_id'])['category_block_units'].transform('min')
transactions_categories['category_max_units_day'] = transactions_categories.groupby(['item_category_id'])['category_day_units'].transform('max')
transactions_categories['category_min_units_day'] = transactions_categories.groupby(['item_category_id'])['category_day_units'].transform('min')

In [455]:
# Category-level turnover features, mirroring the category unit features.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_categories['category_turnover'] = transactions_categories.groupby(['item_category_id'])['turnover'].transform('sum')
transactions_categories['category_mean_turnover_block'] = transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform('mean')
transactions_categories['category_day_turnover'] = transactions_categories.groupby(['item_category_id','date'])['turnover'].transform('sum')
transactions_categories['category_mean_turnover_day'] = transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform('mean')
transactions_categories['category_max_turnover_block'] = transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform('max')
transactions_categories['category_min_turnover_block'] = transactions_categories_blocks.groupby(['item_category_id'])['category_block_turnover'].transform('min')
transactions_categories['category_max_turnover_day'] = transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform('max')
transactions_categories['category_min_turnover_day'] = transactions_categories.groupby(['item_category_id'])['category_day_turnover'].transform('min')

In [456]:
# Category-level price statistics over all transaction rows of the category.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_categories['category_mean_price'] = transactions_categories.groupby('item_category_id')['item_price'].transform('mean')
transactions_categories['category_min_price'] = transactions_categories.groupby('item_category_id')['item_price'].transform('min')
transactions_categories['category_max_price'] = transactions_categories.groupby('item_category_id')['item_price'].transform('max')

In [457]:
# Transactions inside the early / late windows. The window flags are defined per item, so a
# category's window totals pool the early (or late) rows of all of its items.
category_early_rows = transactions_categories[transactions_categories['is_first_two_blocks']]
category_late_rows = transactions_categories[transactions_categories['is_last_two_blocks']]

# Units sold per category inside each window; categories without rows in a window map to NaN.
category_first_two_blocks_units = category_early_rows.groupby('item_category_id')['item_cnt_day'].sum()
transactions_categories['category_first_two_blocks_units'] = transactions_categories['item_category_id'].map(category_first_two_blocks_units)

category_last_two_blocks_units = category_late_rows.groupby('item_category_id')['item_cnt_day'].sum()
transactions_categories['category_last_two_blocks_units'] = transactions_categories['item_category_id'].map(category_last_two_blocks_units)

# Percentage change from the early window to the late window (positive = growth).
early_units = transactions_categories['category_first_two_blocks_units']
late_units = transactions_categories['category_last_two_blocks_units']
transactions_categories['category_fluctuation_units_first_last_blocks'] = ((early_units - late_units) / early_units) * 100 * -1


# Same construction for the mean transaction price inside each window.
category_first_two_blocks_mean_price = category_early_rows.groupby('item_category_id')['item_price'].mean()
transactions_categories['category_first_two_blocks_mean_price'] = transactions_categories['item_category_id'].map(category_first_two_blocks_mean_price)

category_last_two_blocks_mean_price = category_late_rows.groupby('item_category_id')['item_price'].mean()
transactions_categories['category_last_two_blocks_mean_price'] = transactions_categories['item_category_id'].map(category_last_two_blocks_mean_price)

early_price = transactions_categories['category_first_two_blocks_mean_price']
late_price = transactions_categories['category_last_two_blocks_mean_price']
transactions_categories['category_fluctuation_price_first_last_blocks'] = ((early_price - late_price) / early_price) * 100 * -1

In [458]:
# Hand-made grouping of item_category_id values into coarse subcategory labels.
# Each entry is (first id, last id, label) with both ids inclusive. Id 0 and ids above 83
# are intentionally left unmapped, exactly as before.
_SUBCATEGORY_SPANS = [
    (1, 7, "Accessories"),
    (8, 8, "Tickets"),
    (9, 9, "Delivery of goods"),
    (10, 17, "Consoles"),
    (18, 24, "Game for Consoles"),
    (25, 25, "Accessories for Games"),
    (26, 26, "Android Games"),
    (27, 27, "MAC Games"),
    (28, 31, "PC Games"),
    (32, 36, "Payment Cards"),
    (37, 39, "Cinema - Blu-ray"),
    (40, 40, "Cinema - DVD"),
    (41, 41, "Cinema - Collectible"),
    (42, 45, "Audiobooks"),
    (46, 54, "Books"),
    (55, 56, "Music - CD"),
    (57, 57, "Music - MP3"),
    (58, 58, "Music - Vinyl"),
    (59, 59, "Music - Music Video"),
    (60, 60, "Music - Gift Edition"),
    # Id 73 ends up as "Software": the Software span starts at 73, so Gifts stops at 72.
    (61, 72, "Gifts"),
    (73, 78, "Software"),
    (79, 79, "Utility"),
    (80, 83, "Misc"),
]

sub_cats = {
    category_id: label
    for first_id, last_id, label in _SUBCATEGORY_SPANS
    for category_id in range(first_id, last_id + 1)
}
    
    
# Coarse subcategory label for every transaction; raises KeyError for an id missing from sub_cats.
transactions_categories['subcategory'] = transactions_categories['item_category_id'].apply(lambda x: sub_cats[x])

# Boolean "family" flags derived from item_category_id. Ranges are half-open, as usual for range().
category_ids = transactions_categories["item_category_id"]
transactions_categories['video_game'] = category_ids.isin(list(range(18,32)))
transactions_categories['gaming_old_gen'] = category_ids.isin([10,11,15,18,19,23])
transactions_categories['gaming_new_gen'] = category_ids.isin([12,14,16,20,22,24])
transactions_categories['pc_games'] = category_ids.isin(list(range(27,32)))
transactions_categories['payment_cards'] = category_ids.isin(list(range(32,37)))
transactions_categories['movies'] = category_ids.isin(list(range(37,42)))
transactions_categories['movies_niche'] = category_ids.isin([38,39])
# Fixed: the previous isin([42,55]) matched only the two ids 42 and 55 (55 is "Music - CD" in
# sub_cats). The audiobook/book ids in sub_cats run from 42 to 54, hence range(42,55).
transactions_categories['books'] = category_ids.isin(list(range(42,55)))
transactions_categories['music'] = category_ids.isin(list(range(55,61)))
transactions_categories['music_CD'] = category_ids.isin([55,56])
transactions_categories['music_vinyl'] = category_ids.isin([58])
# Fixed: sub_cats labels ids 61..72 as "Gifts"; the previous range(61,72) left out id 72.
transactions_categories['gifts'] = category_ids.isin(list(range(61,73)))
transactions_categories['software'] = category_ids.isin(list(range(73,79)))

In [459]:
# Subcategory-level unit features, broadcast to every row of the subcategory.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_categories['subcategory_units'] = transactions_categories.groupby(['subcategory'])['item_cnt_day'].transform('sum')
transactions_categories['subcategory_block_units'] = transactions_categories.groupby(['subcategory','date_block_num'])['item_cnt_day'].transform('sum')
transactions_categories['subcategory_mean_units_block'] = transactions_categories.groupby(['subcategory'])['subcategory_block_units'].transform('mean')
transactions_categories['subcategory_day_units'] = transactions_categories.groupby(['subcategory','date'])['item_cnt_day'].transform('sum')
transactions_categories['subcategory_mean_units_day'] = transactions_categories.groupby(['subcategory'])['subcategory_day_units'].transform('mean')
transactions_categories['subcategory_max_units_block'] = transactions_categories.groupby(['subcategory'])['subcategory_block_units'].transform('max')
transactions_categories['subcategory_min_units_block'] = transactions_categories.groupby(['subcategory'])['subcategory_block_units'].transform('min')
transactions_categories['subcategory_max_units_day'] = transactions_categories.groupby(['subcategory'])['subcategory_day_units'].transform('max')
transactions_categories['subcategory_min_units_day'] = transactions_categories.groupby(['subcategory'])['subcategory_day_units'].transform('min')

In [460]:
# Subcategory-level turnover features, mirroring the subcategory unit features.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_categories['subcategory_turnover'] = transactions_categories.groupby(['subcategory'])['turnover'].transform('sum')
transactions_categories['subcategory_block_turnover'] = transactions_categories.groupby(['subcategory','date_block_num'])['turnover'].transform('sum')
transactions_categories['subcategory_mean_turnover_block'] = transactions_categories.groupby(['subcategory'])['subcategory_block_turnover'].transform('mean')
transactions_categories['subcategory_day_turnover'] = transactions_categories.groupby(['subcategory','date'])['turnover'].transform('sum')
transactions_categories['subcategory_mean_turnover_day'] = transactions_categories.groupby(['subcategory'])['subcategory_day_turnover'].transform('mean')
transactions_categories['subcategory_max_turnover_block'] = transactions_categories.groupby(['subcategory'])['subcategory_block_turnover'].transform('max')
transactions_categories['subcategory_min_turnover_block'] = transactions_categories.groupby(['subcategory'])['subcategory_block_turnover'].transform('min')
transactions_categories['subcategory_max_turnover_day'] = transactions_categories.groupby(['subcategory'])['subcategory_day_turnover'].transform('max')
transactions_categories['subcategory_min_turnover_day'] = transactions_categories.groupby(['subcategory'])['subcategory_day_turnover'].transform('min')

In [461]:
# Category and subcategory shares (in percent) of the global unit and turnover totals.
# String aggregation names replace the NumPy callables (deprecated in groupby.transform).
transactions_categories['category_share_of_total_units'] = transactions_categories['category_units'] * 100 / total_sales
transactions_categories['category_share_of_total_turnover'] = transactions_categories['category_turnover']* 100 / total_turnover

# The subcategory totals are recomputed here so this cell does not depend on the
# subcategory unit / turnover cells having been run first.
transactions_categories['subcategory_units'] = transactions_categories.groupby("subcategory")['item_cnt_day'].transform('sum')
transactions_categories['subcategory_share_of_total_units'] = transactions_categories['subcategory_units'] * 100 / total_sales
transactions_categories['subcategory_turnover'] = transactions_categories.groupby("subcategory")['turnover'].transform('sum')
transactions_categories['subcategory_share_of_total_turnover'] = transactions_categories['subcategory_turnover']* 100 / total_turnover

In [462]:
# Transactions inside the early / late windows. The window flags are defined per item, so a
# subcategory's window totals pool the early (or late) rows of all of its items.
subcategory_early_rows = transactions_categories[transactions_categories['is_first_two_blocks']]
subcategory_late_rows = transactions_categories[transactions_categories['is_last_two_blocks']]

# Units sold per subcategory inside each window; subcategories without rows in a window map to NaN.
subcategory_first_two_blocks_units = subcategory_early_rows.groupby('subcategory')['item_cnt_day'].sum()
transactions_categories['subcategory_first_two_blocks_units'] = transactions_categories['subcategory'].map(subcategory_first_two_blocks_units)

subcategory_last_two_blocks_units = subcategory_late_rows.groupby('subcategory')['item_cnt_day'].sum()
transactions_categories['subcategory_last_two_blocks_units'] = transactions_categories['subcategory'].map(subcategory_last_two_blocks_units)

# Percentage change from the early window to the late window (positive = growth).
early_units = transactions_categories['subcategory_first_two_blocks_units']
late_units = transactions_categories['subcategory_last_two_blocks_units']
transactions_categories['subcategory_fluctuation_units_first_last_blocks'] = ((early_units - late_units) / early_units) * 100 * -1


# Same construction for the mean transaction price inside each window.
subcategory_first_two_blocks_mean_price = subcategory_early_rows.groupby('subcategory')['item_price'].mean()
transactions_categories['subcategory_first_two_blocks_mean_price'] = transactions_categories['subcategory'].map(subcategory_first_two_blocks_mean_price)

subcategory_last_two_blocks_mean_price = subcategory_late_rows.groupby('subcategory')['item_price'].mean()
transactions_categories['subcategory_last_two_blocks_mean_price'] = transactions_categories['subcategory'].map(subcategory_last_two_blocks_mean_price)

early_price = transactions_categories['subcategory_first_two_blocks_mean_price']
late_price = transactions_categories['subcategory_last_two_blocks_mean_price']
transactions_categories['subcategory_fluctuation_price_first_last_blocks'] = ((early_price - late_price) / early_price) * 100 * -1

# SHOP

-UNITS
shop_units
shop_block_units
shop_mean_units_block
shop_day_units
shop_mean_units_day
shop_max_units_block
shop_min_units_block
shop_max_units_day
shop_min_units_day

-TURNOVER
shop_turnover
shop_block_turnover
shop_mean_turnover_block
shop_day_turnover
shop_mean_turnover_day
shop_max_turnover_block
shop_min_turnover_block
shop_max_turnover_day
shop_min_turnover_day

-PRICE
shop_mean_price
shop_mean_price_block


-TREND
shop_first_two_blocks_units
shop_last_two_blocks_units
shop_fluctuation_units_first_last_blocks
shop_first_two_blocks_mean_price
shop_last_two_blocks_mean_price
shop_fluctuation_price_first_last_blocks

-ENCODINGS
shop_share_of_units
shop_share_of_turnover

-MISC
shop_ids_TC
shop_ids_TRK
shop_ids_SEC
shop_ids_shopping_center
shop_ids_moscow

-CATEGORY
shop_top_category_units
shop_top_category_turnover
shop_top_subcategory_units
shop_top_subcategory_turnover

In [463]:
# Free memory left over from the previous feature family, then take two
# independent working copies: one for shop-level features, one for
# shop-per-block features.
gc.collect()
transactions_shops, transactions_shops_blocks = transactions.copy(), transactions.copy()

In [464]:
# Aggregates per (shop, month block), broadcast back onto every transaction row.
shop_block_keys = ['shop_id', 'date_block_num']
for new_col, source_col, how in (
    ('shop_block_units', 'item_cnt_day', 'sum'),
    ('shop_block_turnover', 'turnover', 'sum'),
    ('shop_mean_price_block', 'item_price', 'mean'),
):
    transactions_shops_blocks[new_col] = transactions_shops_blocks.groupby(shop_block_keys)[source_col].transform(how)

In [465]:
# Unit-count statistics per shop.
# Block-level statistics read `shop_block_units` from transactions_shops_blocks,
# which shares its row index with transactions_shops (both are copies of
# `transactions`), so the assignment aligns row by row.
# NOTE(review): the *_mean_* columns average over transaction rows, so blocks/days
# with more transactions weigh more than quiet ones — confirm this is intended.
shop_unit_specs = (
    # (new column, frame aggregated, group keys, source column, aggregation)
    ('shop_units', transactions_shops, ['shop_id'], 'item_cnt_day', 'sum'),
    ('shop_mean_units_block', transactions_shops_blocks, ['shop_id'], 'shop_block_units', 'mean'),
    ('shop_day_units', transactions_shops, ['shop_id', 'date'], 'item_cnt_day', 'sum'),
    ('shop_mean_units_day', transactions_shops, ['shop_id'], 'shop_day_units', 'mean'),
    ('shop_max_units_block', transactions_shops_blocks, ['shop_id'], 'shop_block_units', 'max'),
    ('shop_min_units_block', transactions_shops_blocks, ['shop_id'], 'shop_block_units', 'min'),
    ('shop_max_units_day', transactions_shops, ['shop_id'], 'shop_day_units', 'max'),
    ('shop_min_units_day', transactions_shops, ['shop_id'], 'shop_day_units', 'min'),
)
for new_col, frame, keys, source_col, how in shop_unit_specs:
    transactions_shops[new_col] = frame.groupby(keys)[source_col].transform(how)

In [466]:
# Turnover statistics per shop; mirrors the unit statistics but aggregates the
# `turnover` column (price * units) instead of `item_cnt_day`.
shop_turnover_specs = (
    # (new column, frame aggregated, group keys, source column, aggregation)
    ('shop_turnover', transactions_shops, ['shop_id'], 'turnover', 'sum'),
    ('shop_mean_turnover_block', transactions_shops_blocks, ['shop_id'], 'shop_block_turnover', 'mean'),
    ('shop_day_turnover', transactions_shops, ['shop_id', 'date'], 'turnover', 'sum'),
    ('shop_mean_turnover_day', transactions_shops, ['shop_id'], 'shop_day_turnover', 'mean'),
    ('shop_max_turnover_block', transactions_shops_blocks, ['shop_id'], 'shop_block_turnover', 'max'),
    ('shop_min_turnover_block', transactions_shops_blocks, ['shop_id'], 'shop_block_turnover', 'min'),
    ('shop_max_turnover_day', transactions_shops, ['shop_id'], 'shop_day_turnover', 'max'),
    ('shop_min_turnover_day', transactions_shops, ['shop_id'], 'shop_day_turnover', 'min'),
)
for new_col, frame, keys, source_col, how in shop_turnover_specs:
    transactions_shops[new_col] = frame.groupby(keys)[source_col].transform(how)

In [467]:
# Average transaction price observed in each shop.
price_by_shop = transactions_shops.groupby('shop_id')['item_price']
transactions_shops['shop_mean_price'] = price_by_shop.transform('mean')


In [468]:
# --- units sold per shop in the first-two-blocks / last-two-blocks windows ---
units_by_shop_first = transactions_shops.groupby(['shop_id', 'is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
shop_first_two_blocks_units = (
    units_by_shop_first.loc[units_by_shop_first['is_first_two_blocks'] == True]
    .set_index('shop_id')['item_cnt_day']
)
transactions_shops['shop_first_two_blocks_units'] = transactions_shops['shop_id'].map(shop_first_two_blocks_units)

units_by_shop_last = transactions_shops.groupby(['shop_id', 'is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
shop_last_two_blocks_units = (
    units_by_shop_last.loc[units_by_shop_last['is_last_two_blocks'] == True]
    .set_index('shop_id')['item_cnt_day']
)
transactions_shops['shop_last_two_blocks_units'] = transactions_shops['shop_id'].map(shop_last_two_blocks_units)

# Percentage change from the first window to the last (positive = growth).
shop_units_first = transactions_shops['shop_first_two_blocks_units']
shop_units_last = transactions_shops['shop_last_two_blocks_units']
transactions_shops['shop_fluctuation_units_first_last_blocks'] = ((shop_units_first - shop_units_last) / shop_units_first) * 100 * -1


# --- mean price per shop in the same two windows ---
price_by_shop_first = transactions_shops.groupby(['shop_id', 'is_first_two_blocks'], as_index=False)['item_price'].mean()
shop_first_two_blocks_mean_price = (
    price_by_shop_first.loc[price_by_shop_first['is_first_two_blocks'] == True]
    .set_index('shop_id')['item_price']
)
transactions_shops['shop_first_two_blocks_mean_price'] = transactions_shops['shop_id'].map(shop_first_two_blocks_mean_price)

price_by_shop_last = transactions_shops.groupby(['shop_id', 'is_last_two_blocks'], as_index=False)['item_price'].mean()
shop_last_two_blocks_mean_price = (
    price_by_shop_last.loc[price_by_shop_last['is_last_two_blocks'] == True]
    .set_index('shop_id')['item_price']
)
transactions_shops['shop_last_two_blocks_mean_price'] = transactions_shops['shop_id'].map(shop_last_two_blocks_mean_price)

shop_price_first = transactions_shops['shop_first_two_blocks_mean_price']
shop_price_last = transactions_shops['shop_last_two_blocks_mean_price']
transactions_shops['shop_fluctuation_price_first_last_blocks'] = ((shop_price_first - shop_price_last) / shop_price_first) * 100 * -1

In [469]:
# Each shop's share (in percent) of the overall units sold and overall turnover.
# `total_sales` and `total_turnover` are computed earlier in the notebook.
shop_units_total = transactions_shops['shop_units']
shop_turnover_total = transactions_shops['shop_turnover']
transactions_shops['shop_share_of_units'] = shop_units_total.mul(100).div(total_sales)
transactions_shops['shop_share_of_turnover'] = shop_turnover_total.mul(100).div(total_turnover)

In [470]:
# Hand-curated groups of shop ids by venue type / location.
shop_ids_TC = [1, 2, 13, 14, 16, 23, 24, 26, 28, 31, 37, 38, 42, 43, 44, 46, 50, 54, 58]
shop_ids_TRK = [3, 33, 39, 40]
shop_ids_SEC = [7, 34, 36, 47, 48, 49, 56]
shop_ids_shopping_center = [4, 5, 8, 15, 17, 18, 19, 27, 29, 30, 32, 41, 45, 51, 53, 59]
shop_ids_moscow = list(range(20, 33))

# One boolean membership column per group.
for flag_column, member_ids in (
    ('shop_TC', shop_ids_TC),
    ('shop_TRK', shop_ids_TRK),
    ('shop_SEC', shop_ids_SEC),
    ('shop_shopping_center', shop_ids_shopping_center),
    ('shop_moscow', shop_ids_moscow),
):
    transactions_shops[flag_column] = transactions_shops['shop_id'].isin(member_ids)

In [471]:
# Best-selling category (by units) of every shop, attached as `max_category_units`.
#
# The previous implementation joined the per-shop maximum back onto the
# per-(shop, category) totals by *value*. When two categories tied for the
# maximum, that produced several rows per shop and the final merge silently
# duplicated transaction rows (inflating every aggregate computed afterwards).
# `idxmax` returns exactly one row per shop (the first category on a tie).
category_units_by_shop = transactions_shops.groupby(['shop_id', 'item_category_id'], as_index=False)['item_cnt_day'].sum()
top_units_rows = category_units_by_shop.groupby('shop_id')['item_cnt_day'].idxmax()
top_category_units = (
    category_units_by_shop.loc[top_units_rows, ['shop_id', 'item_category_id']]
    .rename(columns={'item_category_id': 'max_category_units'})
)

# Drop a column left by a previous run of this cell so re-running does not
# create max_category_units_x / _y; validate guards the one-row-per-shop invariant.
transactions_shops = (
    transactions_shops
    .drop(columns=['max_category_units'], errors='ignore')
    .merge(top_category_units, on='shop_id', how='left', validate='many_to_one')
)

In [472]:
# Highest-grossing category (by turnover) of every shop, attached as
# `max_category_turnover`.
#
# Selecting the winning row with `idxmax` (instead of joining the maximum back
# by its float value) guarantees exactly one row per shop, so the merge below
# can never duplicate transaction rows when two categories tie.
category_turnover_by_shop = transactions_shops.groupby(['shop_id', 'item_category_id'], as_index=False)['turnover'].sum()
top_turnover_rows = category_turnover_by_shop.groupby('shop_id')['turnover'].idxmax()
top_category_turnover = (
    category_turnover_by_shop.loc[top_turnover_rows, ['shop_id', 'item_category_id']]
    .rename(columns={'item_category_id': 'max_category_turnover'})
)

# Drop a column left by a previous run of this cell so re-running does not
# create max_category_turnover_x / _y; validate guards the one-row-per-shop invariant.
transactions_shops = (
    transactions_shops
    .drop(columns=['max_category_turnover'], errors='ignore')
    .merge(top_category_turnover, on='shop_id', how='left', validate='many_to_one')
)

-AREA
area



-UNITS
area_units
area_block_units
area_mean_units_block
area_day_units
area_mean_units_day
area_max_units_block
area_min_units_block
area_max_units_day
area_min_units_day

-TURNOVER
area_turnover
area_block_turnover
area_mean_turnover_block
area_day_turnover
area_mean_turnover_day
area_max_turnover_block
area_min_turnover_block
area_max_turnover_day
area_min_turnover_day

-PRICE
area_mean_price
area_mean_price_block


-TREND
area_first_two_blocks_units
area_last_two_blocks_units
area_fluctuation_units_first_last_blocks
area_first_two_blocks_mean_price
area_last_two_blocks_mean_price
area_fluctuation_price_first_last_blocks

-ENCODINGS
area_share_of_total_units
area_share_of_total_gross

In [473]:
# Shop id -> area name. Each entry covers the half-open id range [start, stop).
shop_area_ranges = (
    (0, 2, "Yakutsk"),
    (2, 3, "Adygea"),
    (3, 4, "Balashikha"),
    (4, 5, "Volga"),
    (5, 6, "Vologda"),
    (6, 9, "Voronezh"),
    (9, 10, "Outbound Trading"),
    (10, 12, "Zhukovsky"),
    (12, 13, "Online store emergency"),
    (13, 15, "Kazan"),
    (15, 16, "Kaluga"),
    (16, 17, "Kolomna"),
    (17, 19, "Krasnoyarsk"),
    (19, 20, "Kursk"),
    (20, 33, "Moscow"),
    (33, 34, "Mytishchi"),
    (34, 36, "N.Novgorod"),
    (36, 38, "Novosibirsk"),
    (38, 39, "Omsk"),
    (39, 42, "RostovNaDonu"),
    (42, 44, "St. Petersburg"),
    (44, 46, "Samara"),
    (46, 47, "Sergiev Posad"),
    (47, 48, "Surgut"),
    (48, 49, "Tomsk"),
    (49, 52, "Tyumen TC"),
    (52, 54, "Ufa"),
    (54, 55, "Khimki"),
    (55, 56, "Digital warehouse"),
    (56, 57, "Chekhov"),
    (57, 59, "Yakutsk"),
    (59, 60, "Yaroslavl"),
)
shop_areas = {
    shop_id: area_name
    for start, stop, area_name in shop_area_ranges
    for shop_id in range(start, stop)
}

# Direct dict lookup per row: an unknown shop id raises KeyError rather than
# silently producing a missing area.
transactions_shops['area'] = transactions_shops['shop_id'].map(shop_areas.__getitem__)




In [474]:
# Unit-count statistics per area (all shops of an area pooled together).
# Order matters: the block/day totals must exist before their mean/max/min.
# NOTE(review): the *_mean_* columns average over transaction rows, not over
# distinct blocks/days — confirm this weighting is intended.
for new_col, keys, source_col, how in (
    ('area_units', ['area'], 'item_cnt_day', 'sum'),
    ('area_block_units', ['area', 'date_block_num'], 'item_cnt_day', 'sum'),
    ('area_mean_units_block', ['area'], 'area_block_units', 'mean'),
    ('area_day_units', ['area', 'date'], 'item_cnt_day', 'sum'),
    ('area_mean_units_day', ['area'], 'area_day_units', 'mean'),
    ('area_max_units_block', ['area'], 'area_block_units', 'max'),
    ('area_min_units_block', ['area'], 'area_block_units', 'min'),
    ('area_max_units_day', ['area'], 'area_day_units', 'max'),
    ('area_min_units_day', ['area'], 'area_day_units', 'min'),
):
    transactions_shops[new_col] = transactions_shops.groupby(keys)[source_col].transform(how)

In [475]:
# Turnover statistics per area; same layout as the area unit statistics.
for new_col, keys, source_col, how in (
    ('area_turnover', ['area'], 'turnover', 'sum'),
    ('area_block_turnover', ['area', 'date_block_num'], 'turnover', 'sum'),
    ('area_mean_turnover_block', ['area'], 'area_block_turnover', 'mean'),
    ('area_day_turnover', ['area', 'date'], 'turnover', 'sum'),
    ('area_mean_turnover_day', ['area'], 'area_day_turnover', 'mean'),
    ('area_max_turnover_block', ['area'], 'area_block_turnover', 'max'),
    ('area_min_turnover_block', ['area'], 'area_block_turnover', 'min'),
    ('area_max_turnover_day', ['area'], 'area_day_turnover', 'max'),
    ('area_min_turnover_day', ['area'], 'area_day_turnover', 'min'),
):
    transactions_shops[new_col] = transactions_shops.groupby(keys)[source_col].transform(how)

In [476]:
# Average transaction price per area, overall and per month block.
for new_col, keys in (
    ('area_mean_price', 'area'),
    ('area_mean_price_block', ['area', 'date_block_num']),
):
    transactions_shops[new_col] = transactions_shops.groupby(keys)['item_price'].transform('mean')


In [477]:
# --- units sold per area in the first-two-blocks / last-two-blocks windows ---
units_by_area_first = transactions_shops.groupby(['area', 'is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
area_first_two_blocks_units = (
    units_by_area_first.loc[units_by_area_first['is_first_two_blocks'] == True]
    .set_index('area')['item_cnt_day']
)
transactions_shops['area_first_two_blocks_units'] = transactions_shops['area'].map(area_first_two_blocks_units)

units_by_area_last = transactions_shops.groupby(['area', 'is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
area_last_two_blocks_units = (
    units_by_area_last.loc[units_by_area_last['is_last_two_blocks'] == True]
    .set_index('area')['item_cnt_day']
)
transactions_shops['area_last_two_blocks_units'] = transactions_shops['area'].map(area_last_two_blocks_units)

# Percentage change from the first window to the last (positive = growth).
area_units_first = transactions_shops['area_first_two_blocks_units']
area_units_last = transactions_shops['area_last_two_blocks_units']
transactions_shops['area_fluctuation_units_first_last_blocks'] = ((area_units_first - area_units_last) / area_units_first) * 100 * -1


# --- mean price per area in the same two windows ---
price_by_area_first = transactions_shops.groupby(['area', 'is_first_two_blocks'], as_index=False)['item_price'].mean()
area_first_two_blocks_mean_price = (
    price_by_area_first.loc[price_by_area_first['is_first_two_blocks'] == True]
    .set_index('area')['item_price']
)
transactions_shops['area_first_two_blocks_mean_price'] = transactions_shops['area'].map(area_first_two_blocks_mean_price)

price_by_area_last = transactions_shops.groupby(['area', 'is_last_two_blocks'], as_index=False)['item_price'].mean()
area_last_two_blocks_mean_price = (
    price_by_area_last.loc[price_by_area_last['is_last_two_blocks'] == True]
    .set_index('area')['item_price']
)
transactions_shops['area_last_two_blocks_mean_price'] = transactions_shops['area'].map(area_last_two_blocks_mean_price)

area_price_first = transactions_shops['area_first_two_blocks_mean_price']
area_price_last = transactions_shops['area_last_two_blocks_mean_price']
transactions_shops['area_fluctuation_price_first_last_blocks'] = ((area_price_first - area_price_last) / area_price_first) * 100 * -1

shop_category


-UNITS
shop_category_units
shop_category_block_units
shop_category_mean_units_block
shop_category_day_units
shop_category_mean_units_day
shop_category_max_units_block
shop_category_min_units_block
shop_category_max_units_day
shop_category_min_units_day

-TURNOVER
shop_category_turnover
shop_category_block_turnover
shop_category_mean_turnover_block
shop_category_day_turnover
shop_category_mean_turnover_day
shop_category_max_turnover_block
shop_category_min_turnover_block
shop_category_max_turnover_day
shop_category_min_turnover_day

-PRICE
shop_category_mean_price
shop_category_mean_price_block


-TREND
shop_category_first_two_blocks_units
shop_category_last_two_blocks_units
shop_category_fluctuation_units_first_last_blocks
shop_category_first_two_blocks_mean_price
shop_category_last_two_blocks_mean_price
shop_category_fluctuation_price_first_last_blocks

-ENCODINGS
shop_category_share_of_total_units
shop_category_share_of_total_gross

In [478]:
# Fresh working copies for the (shop, category) feature family: one for static
# features, one for per-block features.
gc.collect()
transactions_shops_categories, transactions_shops_categories_blocks = transactions.copy(), transactions.copy()

In [479]:
# Aggregates per (shop, category, month block), broadcast onto every row.
shop_category_block_keys = ['shop_id', 'item_category_id', 'date_block_num']
for new_col, source_col, how in (
    ('shop_category_block_units', 'item_cnt_day', 'sum'),
    ('shop_category_block_turnover', 'turnover', 'sum'),
    ('shop_category_mean_price_block', 'item_price', 'mean'),
):
    transactions_shops_categories_blocks[new_col] = (
        transactions_shops_categories_blocks.groupby(shop_category_block_keys)[source_col].transform(how)
    )

In [480]:
# Unit-count statistics per (shop, category).
# Block-level inputs come from transactions_shops_categories_blocks, which
# shares its row index with transactions_shops_categories (both are copies of
# `transactions`), so results align row by row.
shop_category_keys = ['shop_id', 'item_category_id']
shop_category_unit_specs = (
    # (new column, frame aggregated, group keys, source column, aggregation)
    ('shop_category_units', transactions_shops_categories, shop_category_keys, 'item_cnt_day', 'sum'),
    ('shop_category_mean_units_block', transactions_shops_categories_blocks, shop_category_keys, 'shop_category_block_units', 'mean'),
    ('shop_category_day_units', transactions_shops_categories, shop_category_keys + ['date'], 'item_cnt_day', 'sum'),
    ('shop_category_mean_units_day', transactions_shops_categories, shop_category_keys, 'shop_category_day_units', 'mean'),
    ('shop_category_max_units_block', transactions_shops_categories_blocks, shop_category_keys, 'shop_category_block_units', 'max'),
    ('shop_category_min_units_block', transactions_shops_categories_blocks, shop_category_keys, 'shop_category_block_units', 'min'),
    ('shop_category_max_units_day', transactions_shops_categories, shop_category_keys, 'shop_category_day_units', 'max'),
    ('shop_category_min_units_day', transactions_shops_categories, shop_category_keys, 'shop_category_day_units', 'min'),
)
for new_col, frame, keys, source_col, how in shop_category_unit_specs:
    transactions_shops_categories[new_col] = frame.groupby(keys)[source_col].transform(how)


In [481]:
# Turnover statistics per (shop, category); same layout as the unit statistics.
shop_category_keys = ['shop_id', 'item_category_id']
shop_category_turnover_specs = (
    # (new column, frame aggregated, group keys, source column, aggregation)
    ('shop_category_turnover', transactions_shops_categories, shop_category_keys, 'turnover', 'sum'),
    ('shop_category_mean_turnover_block', transactions_shops_categories_blocks, shop_category_keys, 'shop_category_block_turnover', 'mean'),
    ('shop_category_day_turnover', transactions_shops_categories, shop_category_keys + ['date'], 'turnover', 'sum'),
    ('shop_category_mean_turnover_day', transactions_shops_categories, shop_category_keys, 'shop_category_day_turnover', 'mean'),
    ('shop_category_max_turnover_block', transactions_shops_categories_blocks, shop_category_keys, 'shop_category_block_turnover', 'max'),
    ('shop_category_min_turnover_block', transactions_shops_categories_blocks, shop_category_keys, 'shop_category_block_turnover', 'min'),
    ('shop_category_max_turnover_day', transactions_shops_categories, shop_category_keys, 'shop_category_day_turnover', 'max'),
    ('shop_category_min_turnover_day', transactions_shops_categories, shop_category_keys, 'shop_category_day_turnover', 'min'),
)
for new_col, frame, keys, source_col, how in shop_category_turnover_specs:
    transactions_shops_categories[new_col] = frame.groupby(keys)[source_col].transform(how)

In [482]:
# Average transaction price per (shop, category).
price_by_shop_category = transactions_shops_categories.groupby(['shop_id', 'item_category_id'])['item_price']
transactions_shops_categories['shop_category_mean_price'] = price_by_shop_category.transform('mean')


In [535]:
###
# DEBUG: widen the pandas display so wide feature frames can be inspected.
###

# Show every column, never wrap a frame across several output blocks, and do
# not truncate long cell contents. `None` is the supported "unlimited" value
# for display.max_colwidth; the old `-1` sentinel is deprecated in pandas 1.x
# and rejected by newer releases.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)
#transactions.sample(10).sort_values(by=['item_units'], ascending=False)
#transactions[transactions['item_category_id'] == 58].sample(10).sort_values(by=['total_sales_units'], ascending=False)

In [429]:
# The four static feature frames, in the order they are merged into `training`.
chunks = [
    transactions_items,
    transactions_categories,
    transactions_shops,
    transactions_shops_categories,
]

# Column inventory of the last chunk (the shop/category frame).
print(chunks[-1].columns.values)

['item_id' 'date' 'date_block_num' 'shop_id' 'item_price' 'item_cnt_day'
 'day' 'month' 'year' 'item_name' 'item_category_id' 'y' 'turnover'
 'item_first_block' 'item_last_block' 'is_first_two_blocks'
 'is_last_two_blocks' 'shop_category_units' 'shop_category_block_units'
 'shop_category_mean_units_block' 'shop_category_day_units'
 'shop_category_mean_units_day' 'shop_category_max_units_block'
 'shop_category_min_units_block' 'shop_category_max_units_day'
 'shop_category_min_units_day' 'shop_category_turnover'
 'shop_category_block_turnover' 'shop_category_mean_turnover_block'
 'shop_category_day_turnover' 'shop_category_mean_turnover_day'
 'shop_category_max_turnover_block' 'shop_category_min_turnover_block'
 'shop_category_max_turnover_day' 'shop_category_min_turnover_day'
 'shop_category_mean_price' 'shop_category_mean_price_block']


In [483]:
# Distinct ids seen in the training transactions and in the test set.
train_item_ids, train_shop_ids = transactions['item_id'].unique(), transactions['shop_id'].unique()
test_item_ids, test_shop_ids = test['item_id'].unique(), test['shop_id'].unique()
train_blocks = transactions['date_block_num'].unique()

# Sorted union of train and test ids.
all_item_ids = np.union1d(test_item_ids, train_item_ids)
all_shop_ids = np.union1d(train_shop_ids, test_shop_ids)

In [484]:
# Build every [item, shop, block] triple for the items ever associated with a
# shop (in train or test), across all training blocks.
combinations = []
for shop in all_shop_ids:
    train_ids = transactions.loc[transactions['shop_id'] == shop, 'item_id'].unique()
    test_ids = test.loc[test['shop_id'] == shop, 'item_id'].unique()
    shop_items = np.unique(np.append(train_ids, test_ids))
    combinations.extend([item, shop, block] for item in shop_items for block in train_blocks)

In [545]:
# Stack the triples into an (n, 3) array, de-duplicate/sort the rows and wrap
# them in a frame that becomes the skeleton of the training grid.
combo_columns = ['item_id', 'shop_id', 'date_block_num']
all_combos = pd.DataFrame(
    data=np.unique(np.vstack([combinations]), axis=0),
    columns=combo_columns,
)

In [546]:
# Peek at the first rows of the (item, shop, block) grid.
all_combos.head(n=5)

Unnamed: 0,item_id,shop_id,date_block_num
0,0,54,12
1,0,54,13
2,0,54,14
3,0,54,15
4,0,54,16


In [547]:
# Number of rows in the (item, shop, block) grid.
all_combos.shape[0]

8333930

In [548]:
# Attach each item's category id (left join keeps grid rows for unknown items).
item_category_lookup = items[['item_id', 'item_category_id']]
all_combos = all_combos.merge(item_category_lookup, how='left', on='item_id')

In [549]:
# One row per distinct (block, month, year) combination found in the transactions.
dates = transactions[['date_block_num', 'month', 'year']].drop_duplicates()

# date_block_num -> {"month": ..., "year": ...}
dates_dict = {
    block: {"month": month, "year": year}
    for block, month, year in dates.values
}

dates_dict

{20: {'month': 9, 'year': 2014},
 15: {'month': 4, 'year': 2014},
 18: {'month': 7, 'year': 2014},
 19: {'month': 8, 'year': 2014},
 21: {'month': 10, 'year': 2014},
 22: {'month': 11, 'year': 2014},
 23: {'month': 12, 'year': 2014},
 24: {'month': 1, 'year': 2015},
 27: {'month': 4, 'year': 2015},
 25: {'month': 2, 'year': 2015},
 12: {'month': 1, 'year': 2014},
 14: {'month': 3, 'year': 2014},
 16: {'month': 5, 'year': 2014},
 17: {'month': 6, 'year': 2014},
 13: {'month': 2, 'year': 2014},
 26: {'month': 3, 'year': 2015},
 28: {'month': 5, 'year': 2015},
 29: {'month': 6, 'year': 2015},
 30: {'month': 7, 'year': 2015},
 31: {'month': 8, 'year': 2015},
 32: {'month': 9, 'year': 2015},
 33: {'month': 10, 'year': 2015}}

In [550]:
# Calendar month and year of every grid row, looked up from its block number
# (an unknown block raises KeyError).
grid_blocks = all_combos['date_block_num'].tolist()
all_combos['month'] = [dates_dict[block]['month'] for block in grid_blocks]
all_combos['year'] = [dates_dict[block]['year'] for block in grid_blocks]

In [551]:
# Static per-item feature columns taken from `transactions_items`
# (first entry is the join key).
transactions_items_columns = [
    'item_id',
    # lifetime window markers
    'item_first_block', 'item_last_block',
    'is_first_two_blocks', 'is_last_two_blocks',
    # units
    'item_units', 'item_mean_units_block', 'item_day_units', 'item_mean_units_day',
    'item_max_units_block', 'item_min_units_block',
    'item_max_units_day', 'item_min_units_day',
    # turnover
    'item_turnover', 'item_mean_turnover_block', 'item_day_turnover', 'item_mean_turnover_day',
    'item_max_turnover_block', 'item_min_turnover_block',
    'item_max_turnover_day', 'item_min_turnover_day',
    # activity
    'item_days_of_activity', 'item_blocks_of_activity', 'item_days_since_start',
    'item_mean_day_between_activity',
    'item_longest_stretch_days_without_activity',
    'item_longest_stretch_blocks_without_activity',
    'item_longest_stretch_block_with_activity',
    'item_number_of_consecutive_days_with_activity',
    'item_days_between_start_and_first_activity',
    'item_blocks_between_start_and_first_activity',
    'item_first_day', 'item_last_day', 'item_activity_on_all_blocks',
    # price
    'item_mean_price', 'item_min_price', 'item_max_price',
    'item_number_different_prices', 'item_price_amplitude',
    'category_mean_price', 'item_deviation_mean_category_price',
    # trend
    'item_first_two_blocks_units', 'item_last_two_blocks_units',
    'item_fluctuation_units_first_last_blocks',
    'item_first_two_blocks_mean_price', 'item_last_two_blocks_mean_price',
    'item_fluctuation_price_first_last_blocks',
    # shares
    'item_share_of_total_units', 'item_share_of_total_turnover',
    'category_units', 'item_share_of_category_units',
    'category_turnover', 'item_share_of_category_turnover',
]

In [553]:
# Release a `training` frame left over from a previous run before building a
# new one. On a fresh kernel the name does not exist yet, so a bare
# `del training` would abort the cell with NameError.
try:
    del training
except NameError:
    pass
gc.collect()

# Start the training grid: every (item, shop, block) row gets the static
# per-item features (one row per item on the right-hand side).
training = pd.merge(all_combos, transactions_items[transactions_items_columns].drop_duplicates('item_id'), on=['item_id'], how='left', copy=False)

In [554]:
# Per-(item, block) feature columns; the first two entries are the join keys.
transactions_items_blocks_columns = [
    'item_id',
    'date_block_num',
    'item_block_units',
    'item_block_turnover',
    'item_mean_price_block',
]

In [555]:
gc.collect()
# One row per (item, block) with that block's item-level aggregates.
item_block_keys = ['item_id', 'date_block_num']
item_block_features = transactions_items_blocks[transactions_items_blocks_columns].drop_duplicates(item_block_keys)
training = training.merge(item_block_features, on=item_block_keys, how='left', copy=False)

In [556]:
# Static per-category (and per-subcategory) feature columns taken from
# `transactions_categories`; the first entry is the join key.
transactions_categories_columns = [
    'item_category_id',
    # category units
    'category_units', 'category_mean_units_block', 'category_day_units', 'category_mean_units_day',
    'category_max_units_block', 'category_min_units_block',
    'category_max_units_day', 'category_min_units_day',
    # category turnover
    'category_turnover', 'category_mean_turnover_block', 'category_day_turnover', 'category_mean_turnover_day',
    'category_max_turnover_block', 'category_min_turnover_block',
    'category_max_turnover_day', 'category_min_turnover_day',
    # category price
    'category_mean_price', 'category_min_price', 'category_max_price',
    # category trend
    'category_first_two_blocks_units', 'category_last_two_blocks_units',
    'category_fluctuation_units_first_last_blocks',
    'category_first_two_blocks_mean_price', 'category_last_two_blocks_mean_price',
    'category_fluctuation_price_first_last_blocks',
    # subcategory label and category-type flags
    'subcategory',
    'video_game', 'gaming_old_gen', 'gaming_new_gen', 'pc_games',
    'payment_cards', 'movies', 'movies_niche', 'books',
    'music', 'music_CD', 'music_vinyl', 'gifts', 'software',
    # subcategory units
    'subcategory_units', 'subcategory_block_units', 'subcategory_mean_units_block',
    'subcategory_day_units', 'subcategory_mean_units_day',
    'subcategory_max_units_block', 'subcategory_min_units_block',
    'subcategory_max_units_day', 'subcategory_min_units_day',
    # subcategory turnover
    'subcategory_turnover', 'subcategory_block_turnover', 'subcategory_mean_turnover_block',
    'subcategory_day_turnover', 'subcategory_mean_turnover_day',
    'subcategory_max_turnover_block', 'subcategory_min_turnover_block',
    'subcategory_max_turnover_day', 'subcategory_min_turnover_day',
    # shares
    'category_share_of_total_units', 'category_share_of_total_turnover',
    'subcategory_share_of_total_units', 'subcategory_share_of_total_turnover',
    # subcategory trend
    'subcategory_first_two_blocks_units', 'subcategory_last_two_blocks_units',
    'subcategory_fluctuation_units_first_last_blocks',
    'subcategory_first_two_blocks_mean_price', 'subcategory_last_two_blocks_mean_price',
    'subcategory_fluctuation_price_first_last_blocks',
]


In [None]:
gc.collect()
# One row per category with its static features.
category_features = transactions_categories[transactions_categories_columns].drop_duplicates('item_category_id')

# Some category columns (category_mean_price, category_units, category_turnover)
# are already in `training` from the item-level merge. Merging them again would
# yield *_x / *_y duplicates and break every later reference to the plain name.
# Keep the category-level copy: it is keyed by category, so it is also filled
# for grid items that never appear in the transactions (test-only items).
# The same drop makes the cell safe to re-run.
stale_columns = [
    col for col in category_features.columns
    if col != 'item_category_id' and col in training.columns
]
if stale_columns:
    training = training.drop(columns=stale_columns)

training = pd.merge(training, category_features, on=['item_category_id'], how='left', copy=False)

In [None]:
# Per-(category, block) feature columns; the first two entries are the join keys.
transactions_categories_blocks_columns = [
    'item_category_id', 'date_block_num',
    # category level
    'category_block_units', 'category_block_turnover', 'category_mean_price_block',
    # subcategory level
    'subcategory_block_units', 'subcategory_block_turnover', 'subcategory_mean_price_block',
]

In [None]:
gc.collect()
category_block_keys = ['item_category_id', 'date_block_num']
# One row per (category, block) with that block's aggregates.
category_block_features = (
    transactions_categories_blocks[transactions_categories_blocks_columns]
    .drop_duplicates(category_block_keys)
)

# subcategory_block_units / subcategory_block_turnover are also listed among the
# static category columns, so `training` may already hold a (block-agnostic)
# copy of them. Drop those first: otherwise the merge emits *_x / *_y columns
# and the plain names used for the lag features no longer exist. This also
# makes the cell safe to re-run.
stale_columns = [
    col for col in category_block_features.columns
    if col not in category_block_keys and col in training.columns
]
if stale_columns:
    training = training.drop(columns=stale_columns)

training = pd.merge(training, category_block_features, on=category_block_keys, how='left', copy=False)

In [None]:
# Static per-shop (and per-area) feature columns taken from `transactions_shops`;
# the first entry is the join key.
#
# The per-block area columns (area_block_units, area_block_turnover,
# area_mean_price_block) are deliberately NOT listed here: this list is merged
# after de-duplicating on shop_id alone, which would stamp one arbitrary block's
# value on every month, and the same names are merged again per
# (shop, block) via transactions_shops_blocks_columns, which would produce
# *_x / *_y duplicates.
# NOTE(review): the *_day_* totals below are per-(shop, date) values as well and
# suffer from the same arbitrary-row selection — confirm whether they should stay.
transactions_shops_columns = [
    'shop_id',
    # shop units
    'shop_units', 'shop_mean_units_block', 'shop_day_units', 'shop_mean_units_day',
    'shop_max_units_block', 'shop_min_units_block',
    'shop_max_units_day', 'shop_min_units_day',
    # shop turnover
    'shop_turnover', 'shop_mean_turnover_block', 'shop_day_turnover', 'shop_mean_turnover_day',
    'shop_max_turnover_block', 'shop_min_turnover_block',
    'shop_max_turnover_day', 'shop_min_turnover_day',
    # shop price
    'shop_mean_price',
    # shop trend
    'shop_first_two_blocks_units', 'shop_last_two_blocks_units',
    'shop_fluctuation_units_first_last_blocks',
    'shop_first_two_blocks_mean_price', 'shop_last_two_blocks_mean_price',
    'shop_fluctuation_price_first_last_blocks',
    # shop shares
    'shop_share_of_units', 'shop_share_of_turnover',
    # shop type flags
    'shop_TC', 'shop_TRK', 'shop_SEC', 'shop_shopping_center', 'shop_moscow',
    # top categories
    'max_category_units', 'max_category_turnover',
    # area label
    'area',
    # area units
    'area_units', 'area_mean_units_block', 'area_day_units', 'area_mean_units_day',
    'area_max_units_block', 'area_min_units_block',
    'area_max_units_day', 'area_min_units_day',
    # area turnover
    'area_turnover', 'area_mean_turnover_block', 'area_day_turnover', 'area_mean_turnover_day',
    'area_max_turnover_block', 'area_min_turnover_block',
    'area_max_turnover_day', 'area_min_turnover_day',
    # area price
    'area_mean_price',
    # area trend
    'area_first_two_blocks_units', 'area_last_two_blocks_units',
    'area_fluctuation_units_first_last_blocks',
    'area_first_two_blocks_mean_price', 'area_last_two_blocks_mean_price',
    'area_fluctuation_price_first_last_blocks',
]

In [None]:
gc.collect()
# One row per shop with its static shop/area features.
shop_features = transactions_shops[transactions_shops_columns].drop_duplicates('shop_id')
training = training.merge(shop_features, on=['shop_id'], how='left', copy=False)

In [None]:
# Per-(shop, block) feature columns; the first two entries are the join keys.
transactions_shops_blocks_columns = [
    'shop_id', 'date_block_num',
    # shop level
    'shop_block_units', 'shop_block_turnover', 'shop_mean_price_block',
    # area level
    'area_block_units', 'area_block_turnover', 'area_mean_price_block',
]

In [None]:
gc.collect()
shop_block_keys = ['shop_id', 'date_block_num']

# The shop_* block columns live on transactions_shops_blocks, but the area_*
# block columns are computed on transactions_shops. Selecting all of them from
# transactions_shops_blocks raises KeyError, so take each column from the frame
# that actually holds it and combine them per (shop, block).
columns_in_blocks = [
    col for col in transactions_shops_blocks_columns
    if col in transactions_shops_blocks.columns
]
columns_in_shops = [
    col for col in transactions_shops_blocks_columns
    if col not in columns_in_blocks
]
shop_block_features = transactions_shops_blocks[columns_in_blocks].drop_duplicates(shop_block_keys)
if columns_in_shops:
    shop_block_features = shop_block_features.merge(
        transactions_shops[shop_block_keys + columns_in_shops].drop_duplicates(shop_block_keys),
        on=shop_block_keys, how='left',
    )

# Drop copies of these columns that an earlier (per-shop) merge or a previous
# run of this cell may have left in `training`, so no *_x / *_y pairs appear.
stale_columns = [
    col for col in shop_block_features.columns
    if col not in shop_block_keys and col in training.columns
]
if stale_columns:
    training = training.drop(columns=stale_columns)

training = pd.merge(training, shop_block_features, on=shop_block_keys, how='left', copy=False)

In [None]:
# Static per-(shop, category) feature columns; the first two entries are the join keys.
transactions_shops_categories_columns = [
    'shop_id', 'item_category_id',
    # units
    'shop_category_units', 'shop_category_mean_units_block',
    'shop_category_day_units', 'shop_category_mean_units_day',
    'shop_category_max_units_block', 'shop_category_min_units_block',
    'shop_category_max_units_day', 'shop_category_min_units_day',
    # turnover
    'shop_category_turnover', 'shop_category_mean_turnover_block',
    'shop_category_day_turnover', 'shop_category_mean_turnover_day',
    'shop_category_max_turnover_block', 'shop_category_min_turnover_block',
    'shop_category_max_turnover_day', 'shop_category_min_turnover_day',
    # price
    'shop_category_mean_price',
]

In [None]:
gc.collect()
# One row per (shop, category) with its static features.
shop_category_keys = ['shop_id', 'item_category_id']
shop_category_features = transactions_shops_categories[transactions_shops_categories_columns].drop_duplicates(shop_category_keys)
training = training.merge(shop_category_features, on=shop_category_keys, how='left', copy=False)

In [None]:
# Per-(shop, category, block) feature columns; the first three entries are the join keys.
transactions_shops_categories_blocks_columns = [
    'shop_id',
    'item_category_id',
    'date_block_num',
    'shop_category_block_units',
    'shop_category_block_turnover',
    'shop_category_mean_price_block',
]

In [None]:
gc.collect()
# One row per (shop, category, block) with that block's aggregates.
shop_category_block_keys = ['shop_id', 'item_category_id', 'date_block_num']
shop_category_block_features = (
    transactions_shops_categories_blocks[transactions_shops_categories_blocks_columns]
    .drop_duplicates(shop_category_block_keys)
)
training = training.merge(shop_category_block_features, on=shop_category_block_keys, how='left', copy=False)

In [None]:
# Peek at the first rows of the assembled training grid.
training.head(n=5)

In [None]:
# Row count of the training grid.
training.shape[0]

In [None]:
# Per-block (time-varying) features for which lagged copies are built.
# Every entry must be a block-level column: units, turnover and mean price for
# each of item, category, subcategory, shop, area and shop/category.
lag_columns = [
 # item
 'item_block_units',
 'item_block_turnover',
 'item_mean_price_block',
 # category
 'category_block_units',
 'category_block_turnover',
 'category_mean_price_block',
 # subcategory
 'subcategory_block_units',
 'subcategory_block_turnover',
 'subcategory_mean_price_block',
 # shop
 'shop_block_units',
 'shop_block_turnover',
 'shop_mean_price_block',
 # area
 'area_block_units',
 'area_block_turnover',
 'area_mean_price_block',
 # shop x category
 'shop_category_block_units',
 # was 'shop_category_turnover' (the static all-time total, identical in every
 # block, so lagging it carries no information); the per-block column is meant
 'shop_category_block_turnover',
 'shop_category_mean_price_block'
 ]