In [73]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [74]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb

In [75]:
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [76]:
transactions = sales_train
transactions[['day','month', 'year']] = transactions['date'].str.split('.', expand=True).astype(int)


In [77]:
transactions = transactions.set_index('item_id').join(items.set_index('item_id'))
transactions.reset_index(inplace=True)
transactions = transactions[transactions['year'] != 2013]
transactions['y'] = transactions.groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day'].transform('sum').clip(0,20)


In [78]:
print(len(transactions))
transactions.head()

1668287


Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,day,month,year,item_name,item_category_id,y
0,0,01.09.2014,20,54,58.0,1.0,1,9,2014,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,1.0
1,1,04.04.2014,15,55,4490.0,1.0,4,4,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,2.0
2,1,02.04.2014,15,55,4490.0,1.0,2,4,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,2.0
3,1,06.07.2014,18,55,4490.0,1.0,6,7,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,1.0
4,1,04.08.2014,19,55,4490.0,1.0,4,8,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,1.0


In [79]:
len(transactions)

1668287

In [80]:
transactions['gross'] = transactions['item_price'] * transactions['item_cnt_day']

In [212]:
number_of_items = transactions['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = transactions['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = transactions['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = transactions['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = transactions['item_cnt_day'].sum()
print("total_sales:", total_sales)
total_gross = transactions['gross'].sum()
print("total_gross:", total_gross)
average_price = transactions['item_price'].mean()
print("average_price:", average_price)

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
total_gross: 2181401610.589987
average_price: 1015.5023073770728


In [82]:
transactions['total_sales_units'] = transactions.groupby(['item_id'])['item_cnt_day'].transform(np.sum)
transactions['block_sales_units'] = transactions.groupby(['item_id','date_block_num'])['item_cnt_day'].transform(np.sum)

In [83]:
def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

transactions['number_of_days_since_beginning'] = transactions.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1)

In [84]:
transactions['first_block_sale'] = transactions.groupby(['item_id'])['date_block_num'].transform(np.min)
transactions['last_block_sale'] = transactions.groupby(['item_id'])['date_block_num'].transform(np.max)
transactions['first_day_sale'] = transactions.groupby(['item_id'])['number_of_days_since_beginning'].transform(np.min)
transactions['last_day_sale'] = transactions.groupby(['item_id'])['number_of_days_since_beginning'].transform(np.max)
transactions['first_year_sale'] = transactions.groupby(['item_id'])['year'].transform(np.min)
transactions['last_year_sale'] = transactions.groupby(['item_id'])['year'].transform(np.max)

In [85]:
transactions['sold_two_years'] = transactions['last_year_sale'] > transactions['first_year_sale']

In [86]:
transactions['total_days_of_sales'] = transactions['last_day_sale'] - transactions['first_day_sale'] + 1
transactions['average_sales_units_day'] = transactions['total_sales_units'] / transactions['total_days_of_sales']

In [87]:
transactions['total_blocks_of_sales'] = transactions['last_block_sale'] - transactions['first_block_sale'] + 1
transactions['average_sales_units_block'] = transactions['total_sales_units'] / transactions['total_blocks_of_sales']

In [89]:
number_of_days_with_a_sale = transactions.groupby(['item_id', 'date'], as_index=False).first().groupby('item_id').size()

In [90]:
transactions['number_of_days_with_a_sale'] = transactions['item_id'].map(number_of_days_with_a_sale)

In [91]:
number_of_blocks_with_a_sale = transactions.groupby(['item_id', 'date_block_num'], as_index=False).first().groupby('item_id').size()
transactions['number_of_blocks_with_a_sale'] = transactions['item_id'].map(number_of_blocks_with_a_sale)

In [92]:
max_day_sale = transactions.groupby(['item_id', 'date'],as_index=False)['item_cnt_day'].sum().groupby(['item_id'])['item_cnt_day'].max()
transactions['max_day_sale'] = transactions['item_id'].map(max_day_sale)

min_day_sale = transactions.groupby(['item_id', 'date'],as_index=False)['item_cnt_day'].sum().groupby(['item_id'])['item_cnt_day'].min()
transactions['min_day_sale'] = transactions['item_id'].map(min_day_sale)


In [93]:
max_block_sale = transactions.groupby(['item_id', 'date_block_num'],as_index=False)['item_cnt_day'].sum().groupby(['item_id'])['item_cnt_day'].max()
transactions['max_block_sale'] = transactions['item_id'].map(max_block_sale)

min_block_sale = transactions.groupby(['item_id', 'date_block_num'],as_index=False)['item_cnt_day'].sum().groupby(['item_id'])['item_cnt_day'].min()
transactions['min_block_sale'] = transactions['item_id'].map(min_block_sale)


In [94]:
gc.collect()

def get_max_stretch_without_sales_days(days):
    days = np.unique(days)
    max_stretch = 0
    len_days = len(days)
    for index,day in enumerate(sorted(days)):
        if index == len_days - 1:
            return max_stretch
        next_day = days[index+1]
        stretch = next_day - day
        if stretch > max_stretch:
            max_stretch = stretch
            

        
max_stretch_without_sales = transactions.groupby(['item_id', 'date'], as_index=False).first().groupby(['item_id'])['number_of_days_since_beginning']\
                                    .apply(list).apply(lambda x: get_max_stretch_without_sales_days(x))

transactions['max_stretch_in_days_without_sales'] = transactions['item_id'].map(max_stretch_without_sales)

In [95]:
gc.collect()

def get_max_stretch_without_sales_block(blocks):
    blocks = np.unique(blocks)
    max_stretch = 0
    len_blocks = len(blocks)
    for index,block in enumerate(sorted(blocks)):
        if index == len_blocks - 1:
            return max_stretch
        next_block = blocks[index+1]
        stretch = next_block - block
        if stretch > max_stretch:
            max_stretch = stretch
            

        
max_stretch_without_sales_block = transactions.groupby(['item_id'])['date_block_num']\
                                    .apply(list).apply(lambda x: get_max_stretch_without_sales_block(x))

transactions['max_stretch_in_blocks_without_sales'] = transactions['item_id'].map(max_stretch_without_sales_block)

In [96]:
def get_following_pairs(days):
    days = np.unique(days)
    len_days = len(days)
    following = []
    for index,day in enumerate(sorted(days)):
        if index == len_days - 1:
            return following
        next_day = days[index+1]
        if next_day == day + 1:
            following.append([day, next_day])
        
assert(get_following_pairs([1,2,5,6,7,8,9,11,12,15]) == [[1, 2], [5, 6], [6, 7], [7, 8], [8, 9], [11, 12]])
assert(get_following_pairs([1,2,5,6,7,10]) == [[1, 2], [5, 6], [6, 7]])
assert(get_following_pairs([1,2,4,5,7,9,10]) == [[1, 2], [4, 5], [9,10]])
assert(get_following_pairs([1,2,4,5,7,9,10,11,12,15]) == [[1, 2], [4, 5], [9,10],[10,11],[11,12]])

In [97]:
def get_longest_stretch(following_pairs, n=1,new_n=1):
    #print("following_pairs", following_pairs, " n: ", n, " new_n: ", new_n)
    len_pairs = len(following_pairs)
    if len_pairs == 0:
        return 0
    if len_pairs == 1:
        if new_n > n:
            return new_n
        return n
    if following_pairs[1][0] == following_pairs[0][1]:
        new_n+=1
    else:
        if new_n > n:
            n=new_n
        new_n=1
    return get_longest_stretch(following_pairs[1:], n,new_n)


assert(get_longest_stretch([]) == 0)
assert(get_longest_stretch([[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[-1, 0],[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[20, 21], [25,26]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[14, 15], [15,16]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5] ,[8,9], [9,10],[10, 11],[14, 15], [15,16],[18,19] ,[22,23], [23,24],[24, 25]]) == 3)
assert(get_longest_stretch([[1, 2], [4,5], [7, 8]]) == 1)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [15, 16]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [15, 16], [16, 17]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [14, 15], [15, 16], [16, 17]]) == 3)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15], [20, 21], [21, 22], [22,23],[23,24]]) == 4)

In [98]:
gc.collect()

max_stretch_with_sales_days = transactions.groupby(['item_id', 'date'], as_index=False).first().groupby(['item_id'])['number_of_days_since_beginning']\
                                    .apply(list).apply(lambda x: get_longest_stretch(get_following_pairs(x)))

transactions['max_stretch_in_days_with_sales'] = transactions['item_id'].map(max_stretch_with_sales_days)

gc.collect()

max_stretch_with_sales_blocks = transactions.groupby(['item_id'])['date_block_num']\
                                    .apply(list).apply(lambda x: get_longest_stretch(get_following_pairs(x)))

transactions['max_stretch_in_blocks_with_sales'] = transactions['item_id'].map(max_stretch_with_sales_blocks)

In [99]:
def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = transactions.groupby(['item_id', 'date'], as_index=False).first().groupby(['item_id'])['number_of_days_since_beginning']\
                                    .apply(list).apply(lambda x: get_average_days_between_sales(x))

transactions['average_days_between_sales'] = transactions['item_id'].map(average_days_between_sales)

In [100]:
transactions['possibly_released_during_period'] = ((transactions['max_stretch_in_blocks_without_sales'] > 8) & (transactions['number_of_days_with_a_sale'] > 200)).astype(bool)

In [155]:
transactions['share_of_total_sold'] = transactions['total_sales_units'] * 100 / total_sales 

transactions['item_gross'] = transactions.groupby('item_id')['gross'].transform(np.sum)
transactions['share_of_total_gross'] = transactions['item_gross'] * 100 / total_gross 

In [102]:
transactions['item_min_price'] = transactions.groupby('item_id')['item_price'].transform(np.min)
transactions['item_max_price'] = transactions.groupby('item_id')['item_price'].transform(np.max)
transactions['price_fluctuation'] = ((transactions['item_max_price'] - transactions['item_min_price'] ) / transactions['item_min_price']) * 100
transactions['number_of_different_prices'] = transactions.groupby('item_id')['item_price'].transform('nunique')
transactions['item_average_price'] = transactions.groupby('item_id')['item_price'].transform(np.mean)



In [222]:
a = transactions.groupby(['item_id','is_first_two_blocks'], as_index=False)['item_price'].mean()
b = a[a['is_first_two_blocks'] == True].set_index('item_id').iloc[:,1]

transactions['item_average_price_first_two_blocks'] = transactions['item_id'].map(b)

c = transactions.groupby(['item_id','is_last_two_blocks'], as_index=False)['item_price'].mean()
d = c[c['is_last_two_blocks'] == True].set_index('item_id').iloc[:,1]

transactions['item_average_price_last_two_blocks'] = transactions['item_id'].map(d)

transactions['fluctuation_item_average_price_first_last_two_blocks'] =  ((transactions['item_average_price_first_two_blocks'] \
                - transactions['item_average_price_last_two_blocks'] ) / transactions['item_average_price_first_two_blocks']) * 100 * -1

In [103]:
transactions['category_average_price'] = transactions.groupby(['item_category_id'])['item_price'].transform(np.mean)
transactions['deviation_category_price'] =  ((transactions['item_average_price'] - transactions['category_average_price'] ) / transactions['category_average_price']) * 100

In [126]:
list(range(1,8))

[1, 2, 3, 4, 5, 6, 7]

In [105]:

def is_first_two_blocks(first_block_sale, block):
    return block in [first_block_sale, first_block_sale+1]
def is_last_two_blocks(last_block_sale, block):
    return block in [last_block_sale - 1, last_block_sale]


transactions['is_first_two_blocks'] = transactions.apply(lambda row: is_first_two_blocks(row['first_block_sale'], row['date_block_num']),axis=1)
transactions['is_last_two_blocks'] = transactions.apply(lambda row: is_last_two_blocks(row['last_block_sale'], row['date_block_num']),axis=1)

In [106]:
a = transactions.groupby(['item_id','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
b = a[a['is_first_two_blocks'] == True].set_index('item_id').iloc[:,1]

transactions['first_two_blocks_sales_units'] = transactions['item_id'].map(b)

c = transactions.groupby(['item_id','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
d = c[c['is_last_two_blocks'] == True].set_index('item_id').iloc[:,1]

transactions['last_two_blocks_sales_units'] = transactions['item_id'].map(d)

In [235]:
transactions['number_of_blocks_between_target_and_first'] = 34 - transactions['first_block_sale']
transactions['fluctuation_first_last_two_blocks'] =  ((transactions['first_two_blocks_sales_units'] - transactions['last_two_blocks_sales_units'] ) / transactions['first_two_blocks_sales_units']) * 100 * -1

In [108]:
transactions['new_release'] = 34 - transactions['first_block_sale'] < 4

In [139]:
sub_cats = {}
for i in range(1,8):
    sub_cats[i] = "Accessories"
sub_cats[8] = "Tickets "
sub_cats[9] = "Delivery of goods"
for i in range(10,18):
    sub_cats[i] = "Consoles"
for i in range(18,25):
    sub_cats[i] = "Game for Consoles"
sub_cats[25] = "Accessories for Games"
sub_cats[26] = "Android Games"
sub_cats[27] = "MAC Games"
for i in range(28,32):
    sub_cats[i] = "PC Games"
for i in range(32,37):
    sub_cats[i] = "Payment Cards"
for i in range(37,40):
    sub_cats[i] = "Cinema - Blu-ray"
sub_cats[40] = "Cinema - DVD"
sub_cats[41] = "Cinema - Collectible"
for i in range(42,46):
    sub_cats[i] = "Audiobooks"
for i in range(46,55):
    sub_cats[i] = "Books"
for i in range(55,57):
    sub_cats[i] = "Music - CD"
sub_cats[57] = "Music - MP3"
sub_cats[58] = "Music - Vinyl"
sub_cats[59] = "Music - Music Video"
sub_cats[60] = "Music - Gift Edition"
for i in range(61,74):
    sub_cats[i] = "Gifts"
for i in range(73,79):
    sub_cats[i] = "Software"
sub_cats[79] = "Utility"
for i in range(80,84):
    sub_cats[i] = "Misc"

In [140]:
transactions['subcategory'] = transactions['item_category_id'].apply(lambda x: sub_cats[x])

In [148]:
transactions['video_game'] = transactions["item_category_id"].isin(list(range(18,32)))
transactions['gaming_old_gen'] = transactions["item_category_id"].isin([10,11,15,18,19,23])
transactions['gaming_new_gen'] = transactions["item_category_id"].isin([12,14,16,20,22,24])
transactions['pc_games'] = transactions["item_category_id"].isin(list(range(27,32)))
transactions['payment_cards'] = transactions["item_category_id"].isin(list(range(32,37)))
transactions['movies'] = transactions["item_category_id"].isin(list(range(37,42)))
transactions['movies_niche'] = transactions["item_category_id"].isin([38,39])
transactions['books'] = transactions["item_category_id"].isin([42,55])
transactions['music'] = transactions["item_category_id"].isin(list(range(55,61)))
transactions['music_CD'] = transactions["item_category_id"].isin([55,56])
transactions['music_vinyl'] = transactions["item_category_id"].isin([58])
transactions['gifts'] = transactions["item_category_id"].isin(list(range(61,72)))
transactions['software'] = transactions["item_category_id"].isin(list(range(73,79)))

In [156]:
transactions['category_total_sales_units'] = transactions.groupby('item_category_id')['item_cnt_day'].transform(np.sum)
transactions['category_share_of_total_sold'] = transactions['category_total_sales_units'] * 100 / total_sales

transactions['category_gross'] = transactions.groupby('item_category_id')['gross'].transform(np.sum)
transactions['category_share_of_total_gross'] = transactions['category_gross'] / total_gross * 100

In [157]:
transactions['subcategory_total_sales_units'] = transactions.groupby('subcategory')['item_cnt_day'].transform(np.sum)
transactions['subcategory_share_of_total_sold'] = transactions['subcategory_total_sales_units'] / total_sales * 100

transactions['subcategory_gross'] = transactions.groupby('subcategory')['gross'].transform(np.sum)
transactions['subcategory_share_of_total_gross'] = transactions['subcategory_gross'] / total_gross * 100

In [160]:
a = transactions.groupby(['item_category_id','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
b = a[a['is_first_two_blocks'] == True].set_index('item_category_id').iloc[:,1]

transactions['first_two_blocks_category_sales_units'] = transactions['item_category_id'].map(b)

c = transactions.groupby(['item_category_id','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
d = c[c['is_last_two_blocks'] == True].set_index('item_category_id').iloc[:,1]

transactions['last_two_blocks_category_sales_units'] = transactions['item_category_id'].map(d)

In [237]:
transactions['fluctuation_category_first_last_two_blocks'] =  ((transactions['first_two_blocks_category_sales_units'] - transactions['last_two_blocks_category_sales_units'] ) / transactions['first_two_blocks_category_sales_units']) * 100 * -1

In [163]:
a = transactions.groupby(['subcategory','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
b = a[a['is_first_two_blocks'] == True].set_index('subcategory').iloc[:,1]

transactions['first_two_blocks_subcategory_sales_units'] = transactions['subcategory'].map(b)

c = transactions.groupby(['subcategory','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
d = c[c['is_last_two_blocks'] == True].set_index('subcategory').iloc[:,1]

transactions['last_two_blocks_subcategory_sales_units'] = transactions['subcategory'].map(d)

In [239]:
transactions['fluctuation_subcategory_first_last_two_blocks'] =  ((transactions['first_two_blocks_subcategory_sales_units'] - transactions['last_two_blocks_subcategory_sales_units'] ) / transactions['first_two_blocks_subcategory_sales_units']) * 100 * -1

In [184]:
shop_areas = {}
for i in range(0,2):
    shop_areas[i] = "Yakutsk"
shop_areas[2] = "Adygea"
shop_areas[3] = "Balashikha"
shop_areas[4] = "Volga"
shop_areas[5] = "Vologda"
for i in range(6,9):
    shop_areas[i] = "Voronezh"
shop_areas[9] = "Outbound Trading"
for i in range(10,12):
    shop_areas[i] = "Zhukovsky"
shop_areas[12] = "Online store emergency"
for i in range(13,15):
    shop_areas[i] = "Kazan"
shop_areas[15] = "Kaluga"
shop_areas[16] = "Kolomna"
for i in range(17,19):
    shop_areas[i] = "Krasnoyarsk"
shop_areas[19] = "Kursk"
for i in range(20,33):
    shop_areas[i] = "Moscow"
shop_areas[33] = "Mytishchi"
for i in range(34,36):
    shop_areas[i] = "N.Novgorod"
for i in range(36,38):
    shop_areas[i] = "Novosibirsk"
shop_areas[38] = "Omsk"
for i in range(39,42):
    shop_areas[i] = "RostovNaDonu"
for i in range(42,44):
    shop_areas[i] = "St. Petersburg"
for i in range(44,46):
    shop_areas[i] = "Samara"
shop_areas[46] = "Sergiev Posad"
shop_areas[47] = "Surgut"
shop_areas[48] = "Tomsk"
for i in range(49,52):
    shop_areas[i] = "Tyumen TC"
for i in range(52,54):
    shop_areas[i] = "Ufa"
shop_areas[54] = "Khimki"
shop_areas[55] = "Digital warehouse"
shop_areas[56] = "Chekhov"
for i in range(57,59):
    shop_areas[i] = "Yakutsk"
shop_areas[59] = "Yaroslavl"

In [185]:
transactions['area'] = transactions['shop_id'].apply(lambda x: shop_areas[x])

In [170]:
shop_ids_TC = [1,2,13,14,16,23,24,26,28,31,37,38,42,43,44,46,50,54,58]
shop_ids_TRK = [3,33,39,40]
shop_ids_SEC = [7,34,36,47,48,49,56]
shop_ids_shopping_center = [4,5,8,15,17,18,19,27,29,30,32,41,45,51,53,59]
shop_ids_moscow = list(range(20,33))

In [171]:
transactions['shop_TC'] = transactions['shop_id'].isin(shop_ids_TC)
transactions['shop_TRK'] = transactions['shop_id'].isin(shop_ids_TRK)
transactions['shop_SEC'] = transactions['shop_id'].isin(shop_ids_SEC)
transactions['shop_shopping_center'] = transactions['shop_id'].isin(shop_ids_shopping_center)
transactions['shop_moscow'] = transactions['shop_id'].isin(shop_ids_moscow)

In [190]:
transactions['shop_total_sales_units'] = transactions.groupby(['shop_id'])['item_cnt_day'].transform(np.sum)
transactions['shop_block_sales_units'] = transactions.groupby(['shop_id','date_block_num'])['item_cnt_day'].transform(np.sum)

In [192]:
transactions['area_total_sales_units'] = transactions.groupby(['area'])['item_cnt_day'].transform(np.sum)
transactions['area_block_sales_units'] = transactions.groupby(['area','date_block_num'])['item_cnt_day'].transform(np.sum)

In [194]:
transactions['shop_share_of_total_sold'] = transactions['shop_total_sales_units'] * 100 / total_sales 

transactions['shop_gross'] = transactions.groupby('shop_id')['gross'].transform(np.sum)
transactions['shop_share_of_total_gross'] = transactions['shop_gross'] * 100 / total_gross 

In [195]:
transactions['area_share_of_total_sold'] = transactions['area_total_sales_units'] * 100 / total_sales 

transactions['area_gross'] = transactions.groupby('area')['gross'].transform(np.sum)
transactions['area_share_of_total_gross'] = transactions['area_gross'] * 100 / total_gross 

In [213]:
transactions['shop_average_price'] =  transactions.groupby('shop_id')['item_price'].transform(np.mean)
transactions['shop_price_fluctuation'] = ((transactions['shop_average_price'] - average_price ) / average_price) * 100



In [220]:
a = transactions.groupby(['shop_id','is_first_two_blocks'], as_index=False)['item_price'].mean()
b = a[a['is_first_two_blocks'] == True].set_index('shop_id').iloc[:,1]

transactions['shop_average_price_first_two_blocks'] = transactions['shop_id'].map(b)

c = transactions.groupby(['shop_id','is_last_two_blocks'], as_index=False)['item_price'].mean()
d = c[c['is_last_two_blocks'] == True].set_index('shop_id').iloc[:,1]

transactions['shop_average_price_last_two_blocks'] = transactions['shop_id'].map(d)

transactions['fluctuation_shop_average_price_first_last_two_blocks'] =  ((transactions['shop_average_price_first_two_blocks'] \
                - transactions['shop_average_price_last_two_blocks'] ) / transactions['shop_average_price_first_two_blocks']) * 100 * -1

In [206]:
day_mean = transactions.groupby(['shop_id','date'],as_index=False)['item_cnt_day'].sum().groupby('shop_id')['item_cnt_day'].mean()
transactions['shop_average_sales_units_day'] = transactions['shop_id'].map(day_mean)

day_gross = transactions.groupby(['shop_id','date'],as_index=False)['gross'].sum().groupby('shop_id')['gross'].mean()
transactions['shop_average_gross_day'] = transactions['shop_id'].map(day_gross)

block_mean = transactions.groupby(['shop_id','date_block_num'],as_index=False)['item_cnt_day'].sum().groupby('shop_id')['item_cnt_day'].mean()
transactions['shop_average_sales_units_block'] = transactions['shop_id'].map(block_mean)

block_gross = transactions.groupby(['shop_id','date_block_num'],as_index=False)['gross'].sum().groupby('shop_id')['gross'].mean()
transactions['shop_average_gross_block'] = transactions['shop_id'].map(block_gross)

In [205]:
transactions.groupby(['shop_id','date_block_num'],as_index=False)['item_cnt_day'].sum().groupby('shop_id')['item_cnt_day'].mean()

shop_id
2     939.045455 
3     825.272727 
4     1133.636364
5     1275.727273
6     2449.545455
7     1766.909091
9     3113.333333
10    637.666667 
11    572.000000 
12    2467.136364
13    779.250000 
14    1195.181818
15    1769.227273
16    1430.090909
17    1226.384615
18    1450.272727
19    1850.136364
20    2936.000000
21    1970.681818
22    1515.090909
24    1634.000000
25    6334.000000
26    1667.000000
27    4111.550000
28    4813.636364
29    1633.470588
30    1951.214286
31    8104.090909
33    609.111111 
34    403.187500 
35    1861.318182
36    330.000000 
37    1118.954545
38    1513.545455
39    830.850000 
40    449.363636 
41    1119.500000
42    4145.545455
43    1877.615385
44    1086.909091
45    1013.045455
46    1960.590909
47    2061.181818
48    1311.000000
49    767.863636 
50    1562.681818
51    1075.714286
52    1257.227273
53    1621.681818
54    5590.937500
55    2543.409091
56    1977.090909
57    3679.272727
58    2252.818182
59    1212.954545
Na

In [230]:
print(transactions[(transactions['item_id'] == 3076) & (transactions['date_block_num'] == 12)]['item_price'].mean())
print(transactions[(transactions['item_id'] == 3076) & (transactions['date_block_num'] == 32)]['item_price'].mean())

710.2181800766264
1191.3970588235295


In [240]:
###
#DEBUG
###


pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
transactions.sample(10).sort_values(by=['total_sales_units'], ascending=False)
#transactions[transactions['item_category_id'] == 58].sample(10).sort_values(by=['total_sales_units'], ascending=False)

Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,day,month,year,item_name,item_category_id,y,gross,total_sales_units,block_sales_units,number_of_days_since_beginning,first_block_sale,last_block_sale,first_day_sale,last_day_sale,first_year_sale,last_year_sale,sold_two_years,total_days_of_sales,average_sales_units_day,total_blocks_of_sales,average_sales_units_block,number_of_days_with_a_sale,number_of_blocks_with_a_sale,max_day_sale,min_day_sale,max_block_sale,min_block_sale,max_stretch_in_days_without_sales,max_stretch_in_blocks_without_sales,max_stretch_in_days_with_sales,max_stretch_in_blocks_with_sales,average_days_between_sales,possibly_released_during_period,share_of_total_sold,item_gross,share_of_total_gross,item_min_price,item_max_price,price_fluctuation,number_of_different_prices,item_average_price,category_average_price,deviation_category_price,is_first_two_blocks,is_last_two_blocks,first_two_blocks_sales_units,last_two_blocks_sales_units,number_of_blocks_between_target_and_first,fluctuation_first_last_two_blocks,new_release,subcategory,video_game,gaming_old_gen,gaming_new_gen,pc_games,payment_cards,movies,movies_niche,books,music,music_CD,music_vinyl,gifts,software,subcategory_total_sales_units,subcategory_share_of_total_sold,subcategory_gross,subcategory_share_of_total_gross,category_total_sales_units,category_share_of_total_sold,category_gross,category_share_of_total_gross,first_two_blocks_category_sales_units,last_two_blocks_category_sales_units,fluctuation_category_first_last_two_blocks,first_two_blocks_subcategory_sales_units,last_two_blocks_subcategory_sales_units,fluctuation_subcategory_first_last_two_blocks,shop_TC,shop_TRK,shop_SEC,shop_shopping_center,shop_moscow,area,shop_total_sales_units,shop_block_sales_units,area_total_sales_units,area_block_sales_units,shop_share_of_total_sold,shop_gross,shop_share_of_total_gross,area_share_of_total_sold,area_gross,area_share_of_total_gross,shop_average_price,shop_average_sales_units_day,shop_average_gross_day,shop_average_sales_units_blocl,shop_average_gross_block,shop_price_fluctuation,shop_average_price_first_two_blocks,shop_average_price_last_two_blocks,fluctuation_shop_average_price_first_last_two_blocks,item_average_price_first_two_blocks,item_average_price_last_two_blocks,fluctuation_item_average_price_first_last_two_blocks
488039,3335,19.07.2015,30,27,1199.0,1.0,19,7,2015,"FIFA 14 [Xbox 360, русская версия]",23,2.0,1199.0,1935.0,17.0,567,12,33,1,634,2014,2015,True,634,3.05205,22,87.954545,486,22,26.0,-1.0,301.0,2.0,33,1,31,21,0.0072,False,0.092785,3492389.09,0.160098,599.0,2499.0,317.195326,60.0,1806.501504,1438.573719,25.575873,False,False,487.0,12.0,22,-97.535934,False,Game for Consoles,True,True,False,False,False,False,False,False,False,False,False,False,False,380925.0,18.26564,747750600.0,34.278445,99947.0,4.792534,144823700.0,6.639021,27782.0,6355.0,-77.125477,151918.0,31797.0,-79.06963,False,False,False,True,True,Moscow,82231.0,2478.0,716036.0,21508.0,3.943038,93551750.0,4.288607,34.334465,713504200.0,32.70852,1134.321412,145.028219,164994.272698,4111.55,4677588.0,11.700525,1221.497558,690.786534,-43.447571,1697.588846,926.318182,-45.433302
535072,3476,18.06.2014,17,16,799.0,1.0,18,6,2014,Fight Night Champion [Xbox 360],23,3.0,799.0,1163.0,62.0,135,12,31,1,558,2014,2015,True,558,2.084229,20,58.15,383,20,33.0,0.0,234.0,2.0,34,1,30,19,0.009792,False,0.055767,744678.39,0.034138,399.3,799.0,100.100175,28.0,667.887354,1438.573719,-53.572949,False,False,331.0,4.0,22,-98.791541,False,Game for Consoles,True,True,False,False,False,False,False,False,False,False,False,False,False,380925.0,18.26564,747750600.0,34.278445,99947.0,4.792534,144823700.0,6.639021,27782.0,6355.0,-77.125477,151918.0,31797.0,-79.06963,True,False,False,False,False,Kolomna,31462.0,1517.0,31462.0,1517.0,1.508627,35641980.0,1.633903,1.508627,35641980.0,1.633903,1132.070151,47.24024,53516.481441,1430.090909,1620090.0,11.478836,1299.006586,1396.913656,7.537073,532.813732,799.0,49.958598
1236903,7213,07.01.2014,12,47,899.0,1.0,7,1,2014,"Uncharted 2: Among Thieves (Essentials) [PS3, русская версия]",19,4.0,899.0,871.0,96.0,7,12,33,1,638,2014,2015,True,638,1.365204,22,39.590909,420,22,10.0,-1.0,96.0,3.0,34,1,30,21,0.008578,False,0.041765,823990.95,0.037773,614.06,1199.0,95.257792,14.0,947.380846,1509.64516,-37.2448,True,False,154.0,14.0,22,-90.909091,False,Game for Consoles,True,True,False,False,False,False,False,False,False,False,False,False,False,380925.0,18.26564,747750600.0,34.278445,120462.0,5.776244,185603000.0,8.508428,37696.0,7369.0,-80.451507,151918.0,31797.0,-79.06963,False,False,True,False,False,Surgut,45346.0,2153.0,45346.0,2153.0,2.174375,55157650.0,2.528542,2.174375,55157650.0,2.528542,1226.19919,67.985007,82695.123043,2061.181818,2507166.0,20.748046,1463.512282,1258.989796,-13.974771,895.751634,1199.0,33.854068
610031,3835,13.09.2015,32,35,1191.16,1.0,13,9,2015,"Halo 4. Game of the Year Edition [Xbox 360, русская версия]",23,4.0,1191.16,863.0,21.0,622,13,33,3,636,2014,2015,True,634,1.361199,21,41.095238,371,21,18.0,-1.0,118.0,6.0,46,1,30,20,0.009621,False,0.041381,1099212.04,0.05039,555.54,2199.0,295.831083,69.0,1297.657993,1438.573719,-9.795516,False,True,130.0,36.0,21,-72.307692,False,Game for Consoles,True,True,False,False,False,False,False,False,False,False,False,False,False,380925.0,18.26564,747750600.0,34.278445,99947.0,4.792534,144823700.0,6.639021,27782.0,6355.0,-77.125477,151918.0,31797.0,-79.06963,False,False,False,False,False,N.Novgorod,40949.0,1548.0,47400.0,1968.0,1.963535,46926860.0,2.151225,2.272866,55509680.0,2.54468,1123.398274,61.209268,70144.787549,1861.318182,2133039.0,10.624886,1315.300184,1181.609849,-10.164245,1398.947952,1746.082895,24.814
2893763,21874,03.01.2014,12,54,299.0,1.0,3,1,2014,ШКОЛА МОНСТРОВ. 13 ЖЕЛАНИЙ,40,5.0,299.0,576.0,55.0,3,12,28,1,489,2014,2015,True,489,1.177914,17,33.882353,315,17,8.0,-1.0,55.0,1.0,33,1,24,16,0.010222,False,0.02762,108671.09,0.004982,57.73,299.0,417.928287,9.0,187.491818,264.403257,-29.088688,True,False,100.0,10.0,22,-90.0,False,Cinema - DVD,False,False,False,False,False,True,False,False,False,False,False,False,False,303281.0,14.542552,82944180.0,3.802334,303281.0,14.542552,82944180.0,3.802334,136357.0,23415.0,-82.828164,136357.0,23415.0,-82.828164,True,False,False,False,False,Khimki,89455.0,8198.0,89455.0,8198.0,4.289435,77924570.0,3.572225,4.289435,77924570.0,3.572225,879.595257,193.625541,168667.910346,5590.9375,4870286.0,-13.383234,961.209063,467.050998,-51.410051,293.505914,149.0,-49.234413
2293037,16244,02.01.2014,12,14,249.0,1.0,2,1,2014,Настольная игра Шашки Для путешествий,65,1.0,249.0,301.0,35.0,2,12,30,2,557,2014,2015,True,556,0.541367,19,15.842105,200,19,6.0,0.0,46.0,1.0,50,1,12,18,0.024181,False,0.014433,76748.8,0.003518,199.2,449.0,125.401606,5.0,255.629391,395.272419,-35.328301,True,False,81.0,3.0,22,-96.296296,False,Gifts,False,False,False,False,False,False,False,False,False,False,False,True,False,383999.0,18.413041,226567000.0,10.386305,47775.0,2.290847,18018210.0,0.825992,12032.0,3613.0,-69.971742,77401.0,32212.0,-58.382967,True,False,False,False,False,Kazan,26294.0,1432.0,32528.0,2392.0,1.260817,30568750.0,1.401335,1.559742,32696810.0,1.49889,1144.32619,39.421289,45830.209895,1195.181818,1389489.0,12.68573,1415.529657,1203.796036,-14.957908,247.513433,299.0,20.801524
1411232,8658,10.06.2015,29,25,149.0,1.0,10,6,2015,БАРБОСКИНЫ ЛУЧШИЕ СЕРИИ 3 (регион),40,3.0,149.0,284.0,28.0,508,22,33,306,639,2014,2015,True,334,0.850299,12,23.666667,188,12,5.0,1.0,34.0,17.0,34,1,8,11,0.016399,False,0.013618,43229.91,0.001982,79.3,169.0,113.114754,8.0,152.240816,264.403257,-42.420976,False,False,49.0,41.0,12,-16.326531,False,Cinema - DVD,False,False,False,False,False,True,False,False,False,False,False,False,False,303281.0,14.542552,82944180.0,3.802334,303281.0,14.542552,82944180.0,3.802334,136357.0,23415.0,-82.828164,136357.0,23415.0,-82.828164,False,False,False,False,True,Moscow,139348.0,5093.0,716036.0,23312.0,6.681841,140927400.0,6.460408,34.334465,713504200.0,32.70852,950.265404,209.231231,211602.758243,6334.0,6405793.0,-6.424102,1098.049107,996.565808,-9.242146,145.081633,167.146341,15.208478
1566540,10346,15.07.2015,30,46,239.2,1.0,15,7,2015,ГАРРИ ПОТТЕР И УЗНИК АЗКАБАНА WB (BD),37,1.0,239.2,78.0,17.0,563,14,33,82,639,2014,2015,True,558,0.139785,20,3.9,55,5,3.0,1.0,24.0,1.0,466,16,10,3,0.269082,False,0.00374,23522.4,0.001078,219.0,499.0,127.853881,4.0,301.569231,428.44288,-29.612734,False,False,1.0,36.0,20,3500.0,False,Cinema - Blu-ray,False,False,False,False,False,True,False,False,False,False,False,False,False,145606.0,6.981917,73316030.0,3.36096,116758.0,5.598634,51147920.0,2.344727,45505.0,10380.0,-77.18932,56379.0,12851.0,-77.206052,True,False,False,False,False,Sergiev Posad,43133.0,1642.0,43133.0,1642.0,2.06826,46989670.0,2.154104,2.06826,46989670.0,2.154104,1065.215065,64.473842,70238.662182,1960.590909,2135894.0,4.895386,1251.668232,1179.98513,-5.727005,499.0,296.777778,-40.525495
1967958,13932,02.03.2014,14,27,149.0,1.0,2,3,2014,ЛИМБ (регион),40,1.0,149.0,52.0,15.0,63,13,29,10,507,2014,2015,True,498,0.104418,17,3.058824,43,11,4.0,1.0,15.0,1.0,133,4,2,8,0.31859,False,0.002493,7609.0,0.000349,79.0,149.0,88.607595,3.0,146.326923,264.403257,-44.65767,True,False,24.0,1.0,21,-95.833333,False,Cinema - DVD,False,False,False,False,False,True,False,False,False,False,False,False,False,303281.0,14.542552,82944180.0,3.802334,303281.0,14.542552,82944180.0,3.802334,136357.0,23415.0,-82.828164,136357.0,23415.0,-82.828164,False,False,False,True,True,Moscow,82231.0,4475.0,716036.0,39825.0,3.943038,93551750.0,4.288607,34.334465,713504200.0,32.70852,1134.321412,145.028219,164994.272698,4111.55,4677588.0,11.700525,1221.497558,690.786534,-43.447571,149.0,79.0,-46.979866
1384067,8341,27.01.2014,12,27,149.0,1.0,27,1,2014,АРИФМЕТИКА ДЛЯ МАЛЫШЕЙ (регион),40,1.0,149.0,8.0,5.0,27,12,17,2,145,2014,2014,False,144,0.055556,6,1.333333,8,4,1.0,1.0,5.0,1.0,64,3,0,2,2.553571,False,0.000384,1141.0,5.2e-05,98.0,149.0,52.040816,2.0,142.625,264.403257,-46.057775,True,False,6.0,1.0,22,-83.333333,False,Cinema - DVD,False,False,False,False,False,True,False,False,False,False,False,False,False,303281.0,14.542552,82944180.0,3.802334,303281.0,14.542552,82944180.0,3.802334,136357.0,23415.0,-82.828164,136357.0,23415.0,-82.828164,False,False,False,True,True,Moscow,82231.0,5451.0,716036.0,41374.0,3.943038,93551750.0,4.288607,34.334465,713504200.0,32.70852,1134.321412,145.028219,164994.272698,4111.55,4677588.0,11.700525,1221.497558,690.786534,-43.447571,149.0,98.0,-34.228188


In [233]:
print(transactions.columns.tolist())
len(transactions.columns)

['item_id', 'date', 'date_block_num', 'shop_id', 'item_price', 'item_cnt_day', 'day', 'month', 'year', 'item_name', 'item_category_id', 'y', 'gross', 'total_sales_units', 'block_sales_units', 'number_of_days_since_beginning', 'first_block_sale', 'last_block_sale', 'first_day_sale', 'last_day_sale', 'first_year_sale', 'last_year_sale', 'sold_two_years', 'total_days_of_sales', 'average_sales_units_day', 'total_blocks_of_sales', 'average_sales_units_block', 'number_of_days_with_a_sale', 'number_of_blocks_with_a_sale', 'max_day_sale', 'min_day_sale', 'max_block_sale', 'min_block_sale', 'max_stretch_in_days_without_sales', 'max_stretch_in_blocks_without_sales', 'max_stretch_in_days_with_sales', 'max_stretch_in_blocks_with_sales', 'average_days_between_sales', 'possibly_released_during_period', 'share_of_total_sold', 'item_gross', 'share_of_total_gross', 'item_min_price', 'item_max_price', 'price_fluctuation', 'number_of_different_prices', 'item_average_price', 'category_average_price', 'dev

112

In [168]:
print(shops.to_string())

                                          shop_name  shop_id
0   !Якутск Орджоникидзе, 56 фран                    0      
1   !Якутск ТЦ "Центральный" фран                    1      
2   Адыгея ТЦ "Мега"                                 2      
3   Балашиха ТРК "Октябрь-Киномир"                   3      
4   Волжский ТЦ "Волга Молл"                         4      
5   Вологда ТРЦ "Мармелад"                           5      
6   Воронеж (Плехановская, 13)                       6      
7   Воронеж ТРЦ "Максимир"                           7      
8   Воронеж ТРЦ Сити-Парк "Град"                     8      
9   Выездная Торговля                                9      
10  Жуковский ул. Чкалова 39м?                       10     
11  Жуковский ул. Чкалова 39м²                       11     
12  Интернет-магазин ЧС                              12     
13  Казань ТЦ "Бехетле"                              13     
14  Казань ТЦ "ПаркХаус" II                          14     
15  Калуга ТРЦ "XXI век"