In [207]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [208]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb

In [209]:
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [210]:
transactions = sales_train
transactions[['day','month', 'year']] = transactions['date'].str.split('.', expand=True).astype(int)


In [211]:
transactions = transactions.set_index('item_id').join(items.set_index('item_id'))
transactions.reset_index(inplace=True)
transactions = transactions[transactions['year'] != 2013]
transactions['y'] = transactions.groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day'].transform('sum').clip(0,20)


In [212]:
print(len(transactions))
transactions.head()

1668287


Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,day,month,year,item_name,item_category_id,y
0,0,01.09.2014,20,54,58.0,1.0,1,9,2014,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,1.0
1,1,04.04.2014,15,55,4490.0,1.0,4,4,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,2.0
2,1,02.04.2014,15,55,4490.0,1.0,2,4,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,2.0
3,1,06.07.2014,18,55,4490.0,1.0,6,7,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,1.0
4,1,04.08.2014,19,55,4490.0,1.0,4,8,2014,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,1.0


In [213]:
len(transactions)

1668287

In [242]:
transactions['gross'] = transactions['item_price'] * transactions['item_cnt_day']

In [243]:
number_of_items = transactions['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = transactions['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = transactions['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = transactions['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = transactions['item_cnt_day'].sum()
print("total_sales:", total_sales)
total_gross = transactions['gross'].sum()
print("total_gross:", total_gross)

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
total_gross: 2181401610.589987


In [216]:
transactions['total_sales_units'] = transactions.groupby(['item_id'])['item_cnt_day'].transform(np.sum)
transactions['block_sales_units'] = transactions.groupby(['item_id','date_block_num'])['item_cnt_day'].transform(np.sum)

In [217]:
def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

transactions['number_of_days_since_beginning'] = transactions.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1)

In [218]:
transactions['first_block_sale'] = transactions.groupby(['item_id'])['date_block_num'].transform(np.min)
transactions['last_block_sale'] = transactions.groupby(['item_id'])['date_block_num'].transform(np.max)
transactions['first_day_sale'] = transactions.groupby(['item_id'])['number_of_days_since_beginning'].transform(np.min)
transactions['last_day_sale'] = transactions.groupby(['item_id'])['number_of_days_since_beginning'].transform(np.max)
transactions['first_year_sale'] = transactions.groupby(['item_id'])['year'].transform(np.min)
transactions['last_year_sale'] = transactions.groupby(['item_id'])['year'].transform(np.max)

In [249]:
transactions['sold_two_years'] = transactions['last_year_sale'] > transactions['first_year_sale']

In [219]:
transactions['total_days_of_sales'] = transactions['last_day_sale'] - transactions['first_day_sale'] + 1
transactions['average_sales_units_day'] = transactions['total_sales_units'] / transactions['total_days_of_sales']

In [220]:
transactions['total_blocks_of_sales'] = transactions['last_block_sale'] - transactions['first_block_sale'] + 1
transactions['average_sales_units_block'] = transactions['total_sales_units'] / transactions['total_blocks_of_sales']

In [221]:
transactions.sample(10)

Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,day,month,year,item_name,item_category_id,y,total_sales_units,block_sales_units,number_of_days_since_beginning,first_block_sale,last_block_sale,first_day_sale,last_day_sale,first_year_sale,last_year_sale,total_days_of_sales,average_sales_units_day,total_blocks_of_sales,average_sales_units_block
1577350,10429,14.08.2015,31,2,449.0,1.0,14,8,2015,ГНЕЗДО ДРАКОНА,40,1.0,232.0,22.0,565,28,33,488,634,2015,2015,147,1.578231,6,38.666667
1379456,8293,16.12.2014,23,43,149.0,1.0,16,12,2014,АНГЛИЙСКИЙ ПАЦИЕНТ (BD),37,1.0,148.0,30.0,320,12,29,2,502,2014,2015,501,0.295409,18,8.222222
2516857,18315,22.08.2014,19,56,299.0,1.0,22,8,2014,СБ. Дискотека 80-х (mp3-CD) (jewel),57,2.0,202.0,13.0,192,12,33,1,636,2014,2015,636,0.31761,22,9.181818
2899429,21902,18.04.2015,27,48,99.0,1.0,18,4,2015,ШРЭК (BD),37,1.0,155.0,9.0,439,12,33,3,638,2014,2015,636,0.243711,22,7.045455
1816030,12600,01.02.2014,13,22,149.0,1.0,1,2,2014,КНИГА ДЖУНГЛЕЙ М/Ф (регион),40,1.0,317.0,25.0,30,12,31,1,577,2014,2015,577,0.549393,20,15.85
1005165,6027,24.12.2014,23,3,299.0,1.0,24,12,2014,RED HOT CHILI PEPPERS Californication,55,1.0,486.0,46.0,312,12,33,3,638,2014,2015,636,0.764151,22,22.090909
2146584,15256,28.09.2015,32,2,399.0,1.0,28,9,2015,Мягкая игрушка Angry Birds Красная птица 30см арт. АВР12,63,5.0,542.0,154.0,637,12,33,1,639,2014,2015,639,0.8482,22,24.636364
707442,4351,03.01.2015,24,25,399.0,1.0,3,1,2015,"LEGO Marvel Super Heroes [PC, Jewel, русские субтитры]",30,7.0,3033.0,128.0,368,12,33,1,639,2014,2015,639,4.746479,22,137.863636
704616,4349,19.09.2015,32,57,1399.0,1.0,19,9,2015,LEGO Indiana Jones 2: The Adventure Continues [Xbox 360],23,3.0,273.0,31.0,628,26,33,428,639,2015,2015,212,1.287736,8,34.125
131562,1496,10.09.2014,20,48,2199.0,1.0,10,9,2014,"Assassin's Creed IV. Черный флаг [PS4, русская версия]",20,1.0,1038.0,73.0,254,16,33,123,639,2014,2015,517,2.007737,18,57.666667


In [222]:
number_of_days_with_a_sale = transactions.groupby(['item_id', 'date'], as_index=False).first().groupby('item_id').size()

In [223]:
transactions['number_of_days_with_a_sale'] = transactions['item_id'].map(number_of_days_with_a_sale)

In [224]:
number_of_blocks_with_a_sale = transactions.groupby(['item_id', 'date_block_num'], as_index=False).first().groupby('item_id').size()
transactions['number_of_blocks_with_a_sale'] = transactions['item_id'].map(number_of_blocks_with_a_sale)

In [225]:
max_day_sale = transactions.groupby(['item_id', 'date'],as_index=False)['item_cnt_day'].sum().groupby(['item_id'])['item_cnt_day'].max()
transactions['max_day_sale'] = transactions['item_id'].map(max_day_sale)

min_day_sale = transactions.groupby(['item_id', 'date'],as_index=False)['item_cnt_day'].sum().groupby(['item_id'])['item_cnt_day'].min()
transactions['min_day_sale'] = transactions['item_id'].map(min_day_sale)


In [226]:
max_block_sale = transactions.groupby(['item_id', 'date_block_num'],as_index=False)['item_cnt_day'].sum().groupby(['item_id'])['item_cnt_day'].max()
transactions['max_block_sale'] = transactions['item_id'].map(max_block_sale)

min_block_sale = transactions.groupby(['item_id', 'date_block_num'],as_index=False)['item_cnt_day'].sum().groupby(['item_id'])['item_cnt_day'].min()
transactions['min_block_sale'] = transactions['item_id'].map(min_block_sale)


In [233]:
gc.collect()

def get_max_stretch_without_sales_days(days):
    days = np.unique(days)
    max_stretch = 0
    len_days = len(days)
    for index,day in enumerate(sorted(days)):
        if index == len_days - 1:
            return max_stretch
        next_day = days[index+1]
        stretch = next_day - day
        if stretch > max_stretch:
            max_stretch = stretch
            

        
max_stretch_without_sales = transactions.groupby(['item_id', 'date'], as_index=False).first().groupby(['item_id'])['number_of_days_since_beginning']\
                                    .apply(list).apply(lambda x: get_max_stretch_without_sales_days(x))

transactions['max_stretch_in_days_without_sales'] = transactions['item_id'].map(max_stretch_without_sales)

In [234]:
gc.collect()

def get_max_stretch_without_sales_block(blocks):
    blocks = np.unique(blocks)
    max_stretch = 0
    len_blocks = len(blocks)
    for index,block in enumerate(sorted(blocks)):
        if index == len_blocks - 1:
            return max_stretch
        next_block = blocks[index+1]
        stretch = next_block - block
        if stretch > max_stretch:
            max_stretch = stretch
            

        
max_stretch_without_sales_block = transactions.groupby(['item_id'])['date_block_num']\
                                    .apply(list).apply(lambda x: get_max_stretch_without_sales_block(x))

transactions['max_stretch_in_blocks_without_sales'] = transactions['item_id'].map(max_stretch_without_sales_block)

In [229]:
def get_following_pairs(days):
    days = np.unique(days)
    len_days = len(days)
    following = []
    for index,day in enumerate(sorted(days)):
        if index == len_days - 1:
            return following
        next_day = days[index+1]
        if next_day == day + 1:
            following.append([day, next_day])
        
assert(get_following_pairs([1,2,5,6,7,8,9,11,12,15]) == [[1, 2], [5, 6], [6, 7], [7, 8], [8, 9], [11, 12]])
assert(get_following_pairs([1,2,5,6,7,10]) == [[1, 2], [5, 6], [6, 7]])
assert(get_following_pairs([1,2,4,5,7,9,10]) == [[1, 2], [4, 5], [9,10]])
assert(get_following_pairs([1,2,4,5,7,9,10,11,12,15]) == [[1, 2], [4, 5], [9,10],[10,11],[11,12]])





def get_longest_stretch(following_pairs, n=1):
    len_pairs = len(following_pairs)
    if len_pairs == 0:
        return 0
    if len_pairs == 1:
        return n
    if following_pairs[1][0] == following_pairs[0][1]:
        n+=1
    return get_longest_stretch(following_pairs[1:], n)


assert(get_longest_stretch([]) == 0)
assert(get_longest_stretch([[1, 2], [2,3], [3, 4], [4,5] ,[8,9], [11, 12]]) == 4)
assert(get_longest_stretch([[1, 2], [4,5], [7, 8]]) == 1)
assert(get_longest_stretch([[1, 2], [5, 6], [6, 7], [7,8], [14, 15]]) == 3)

gc.collect()

max_stretch_with_sales_days = transactions.groupby(['item_id', 'date'], as_index=False).first().groupby(['item_id'])['number_of_days_since_beginning']\
                                    .apply(list).apply(lambda x: get_longest_stretch(get_following_pairs(x)))

transactions['max_stretch_in_days_with_sales'] = transactions['item_id'].map(max_stretch_with_sales_days)

gc.collect()

max_stretch_with_sales_blocks = transactions.groupby(['item_id'])['date_block_num']\
                                    .apply(list).apply(lambda x: get_longest_stretch(get_following_pairs(x)))

transactions['max_stretch_in_blocks_with_sales'] = transactions['item_id'].map(max_stretch_with_sales_blocks)


In [230]:
def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = transactions.groupby(['item_id', 'date'], as_index=False).first().groupby(['item_id'])['number_of_days_since_beginning']\
                                    .apply(list).apply(lambda x: get_average_days_between_sales(x))

transactions['average_days_between_sales'] = transactions['item_id'].map(average_days_between_sales)

In [231]:
transactions['possibly_released_during_period'] = ((transactions['max_stretch_in_blocks_without_sales'] > 8) & (transactions['number_of_days_with_a_sale'] > 200)).astype(bool)

In [244]:
transactions['share_of_total_sold'] = transactions['total_sales_units'] / total_sales * 100

transactions['item_gross'] = transactions.groupby('item_id')['gross'].transform(np.sum)
transactions['share_of_total_gross'] = transactions['item_gross'] / total_gross * 100

In [292]:
transactions['item_min_price'] = transactions.groupby('item_id')['item_price'].transform(np.min)
transactions['item_max_price'] = transactions.groupby('item_id')['item_price'].transform(np.max)
transactions['price_fluctuation'] = ((transactions['item_max_price'] - transactions['item_min_price'] ) / transactions['item_min_price']) * 100
transactions['number_of_different_prices'] = transactions.groupby('item_id')['item_price'].transform('nunique')
transactions['item_average_price'] = transactions.groupby('item_id')['item_price'].transform(np.mean)



In [288]:
transactions['category_average_price'] = transactions.groupby(['item_category_id'])['item_price'].transform(np.mean)
transactions['deviation_category_price'] =  ((transactions['item_average_price'] - transactions['category_average_price'] ) / transactions['category_average_price']) * 100

In [313]:
np.array_split(list(range(27,29)),4)

[array([27]), array([28]), array([], dtype=int32), array([], dtype=int32)]

In [318]:

def is_first_two_blocks(first_block_sale, block):
    return block in [first_block_sale, first_block_sale+1]
def is_last_two_blocks(last_block_sale, block):
    return block in [last_block_sale - 1, last_block_sale]


transactions['is_first_two_blocks'] = transactions.apply(lambda row: is_first_two_blocks(row['first_block_sale'], row['date_block_num']),axis=1)
transactions['is_last_two_blocks'] = transactions.apply(lambda row: is_last_two_blocks(row['last_block_sale'], row['date_block_num']),axis=1)

In [346]:
a = transactions.groupby(['item_id','is_first_two_blocks'], as_index=False)['item_cnt_day'].sum()
b = a[a['is_first_two_blocks'] == True].set_index('item_id').iloc[:,1]

transactions['first_two_blocks_sales_units'] = transactions['item_id'].map(b)

c = transactions.groupby(['item_id','is_last_two_blocks'], as_index=False)['item_cnt_day'].sum()
d = c[c['is_last_two_blocks'] == True].set_index('item_id').iloc[:,1]

transactions['last_two_blocks_sales_units'] = transactions['item_id'].map(d)

In [358]:
transactions['number_of_blocks_between_target_and_first'] = 34 - transactions['first_block_sale']
transactions['fluctuation_first_last_two_blocks'] =  ((transactions['first_two_blocks_sales_units'] - transactions['last_two_blocks_sales_units'] ) / transactions['first_two_blocks_sales_units']) * 100

In [361]:
transactions['new_release'] = 34 - transactions['first_block_sale'] < 4

In [365]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
transactions.sample(10).sort_values(by=['total_sales_units'], ascending=False)

Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,day,month,year,item_name,item_category_id,y,total_sales_units,block_sales_units,number_of_days_since_beginning,first_block_sale,last_block_sale,first_day_sale,last_day_sale,first_year_sale,last_year_sale,total_days_of_sales,average_sales_units_day,total_blocks_of_sales,average_sales_units_block,number_of_days_with_a_sale,number_of_blocks_with_a_sale,max_day_sale,min_day_sale,max_block_sale,min_block_sale,max_stretch_in_days_without_sales,max_stretch_in_blocks_without_sales,max_stretch_in_days_with_sales,max_stretch_in_blocks_with_sales,average_days_between_sales,possibly_released_during_period,share_of_total_sold,gross,item_gross,share_of_total_gross,sold_two_years,item_min_price,item_max_price,price_fluctuation,number_of_different_prices,item_average_price,category_average_price,deviation_category_price,is_first_two_blocks,is_last_two_blocks,first_two_blocks_sales_units,last_two_blocks_sales_units,number_of_blocks_between_target_and_first,fluctuation_first_last_two_blocks,new_release
2795072,20949,19.03.2014,14,43,5.0,2.0,19,3,2014,Фирменный пакет майка 1С Интерес белый (34*42) 45 мкм,71,20.0,123894.0,7717.0,80,12,33,1,639,2014,2015,639,193.887324,22,5631.545455,669,22,1137.0,24.0,12185.0,1924.0,31,1,322,21,0.005439,False,5.940811,10.0,612969.1,0.0281,True,1.83,5.0,173.224044,63.0,4.928906,4.941619,-0.257266,False,False,13876.0,7468.0,22,46.180455,False
1141388,6675,02.07.2014,18,35,22990.0,1.0,2,7,2014,Sony PlayStation 4 (500 Gb) Black (CUH-1008A/1108A/B01),12,16.0,6523.0,392.0,185,12,33,1,631,2014,2015,631,10.337559,22,296.5,566,22,164.0,-1.0,1359.0,2.0,35,1,275,21,0.006663,False,0.312783,22990.0,140782000.0,6.453742,True,14992.0,28990.0,93.369797,135.0,22050.054735,22926.667546,-3.823551,False,False,1642.0,4.0,22,99.756395,False
1869407,13071,02.05.2015,28,2,599.0,1.0,2,5,2015,Кабель универсальный HDMI Artplays 2 м (ver. 1.4),3,1.0,2559.0,23.0,489,12,32,2,637,2014,2015,636,4.023585,21,121.857143,485,21,33.0,1.0,330.0,2.0,35,1,237,20,0.008071,False,0.122706,599.0,1484273.0,0.068042,True,228.39,599.0,162.270677,13.0,579.609746,2296.794544,-74.764406,False,False,256.0,12.0,22,95.3125,False
1761864,12090,06.01.2015,24,31,399.0,3.0,6,1,2015,ИСЧЕЗНУВШАЯ,40,20.0,1093.0,556.0,371,23,33,305,639,2014,2015,335,3.262687,11,99.363636,205,11,53.0,1.0,556.0,13.0,61,1,93,10,0.019315,True,0.05241,1197.0,433453.7,0.01987,True,221.7,399.0,79.972936,14.0,395.987565,264.403257,49.766523,True,False,602.0,32.0,11,94.684385,False
1424556,8743,06.07.2015,30,26,169.0,1.0,6,7,2015,БЕЛАЯ ГВАРДИЯ (регион),40,1.0,488.0,19.0,554,12,33,1,639,2014,2015,639,0.763693,22,22.181818,312,22,6.0,1.0,41.0,9.0,37,1,110,21,0.011504,False,0.0234,169.0,74644.04,0.003422,True,46.43,169.0,263.9888,10.0,153.056807,264.403257,-42.112359,False,False,23.0,19.0,22,17.391304,False
1005139,6027,26.12.2014,23,57,299.0,1.0,26,12,2014,RED HOT CHILI PEPPERS Californication,55,1.0,486.0,46.0,310,12,33,3,638,2014,2015,636,0.764151,22,22.090909,326,22,5.0,1.0,46.0,8.0,33,1,128,21,0.010888,False,0.023304,299.0,144736.0,0.006635,True,183.0,299.0,63.387978,6.0,297.805785,296.502497,0.439554,False,False,19.0,34.0,22,-78.947368,False
2533170,18452,22.04.2014,15,50,199.0,1.0,22,4,2014,СБ. Союз 53,55,1.0,321.0,39.0,70,12,26,2,431,2014,2015,430,0.746512,15,21.4,165,14,7.0,1.0,76.0,1.0,49,2,74,11,0.032159,False,0.015392,199.0,60904.6,0.002792,True,110.0,199.0,80.909091,5.0,189.508553,296.502497,-36.085343,False,False,131.0,3.0,22,97.709924,False
2754254,20604,07.03.2014,14,54,1999.0,1.0,7,3,2014,"Фигурка Minecraft Creeper Vinyl 6""",72,4.0,166.0,11.0,68,12,33,3,639,2014,2015,637,0.260597,22,7.545455,133,19,6.0,1.0,26.0,2.0,99,4,31,16,0.044538,False,0.00796,1999.0,299295.6,0.01372,True,830.0,1999.0,140.843373,6.0,1808.13475,1342.537526,34.680388,False,False,17.0,31.0,22,-82.352941,False
2372774,16868,13.06.2014,17,31,399.0,1.0,13,6,2014,ПАПУА 3D - СЕКРЕТНЫЙ ОСТРОВ КАННИБАЛОВ (3D BD),38,1.0,65.0,6.0,140,12,28,3,500,2014,2015,498,0.130522,17,3.823529,59,16,2.0,-1.0,12.0,1.0,59,2,2,14,0.187406,False,0.003117,399.0,31339.0,0.001437,True,99.0,749.0,656.565657,7.0,490.104478,754.206261,-35.017183,False,False,21.0,2.0,22,90.47619,False
1798427,12447,08.01.2014,12,17,349.0,1.0,8,1,2014,КЕЙТ И ЛЕО,40,1.0,61.0,10.0,8,12,26,2,436,2014,2015,435,0.14023,15,4.066667,52,13,4.0,1.0,12.0,1.0,97,3,4,10,0.20074,False,0.002925,349.0,16175.0,0.000741,True,49.0,349.0,612.244898,5.0,265.163934,264.403257,0.287696,True,False,20.0,2.0,22,90.0,False


In [366]:
transactions.columns

Index(['item_id', 'date', 'date_block_num', 'shop_id', 'item_price',
       'item_cnt_day', 'day', 'month', 'year', 'item_name', 'item_category_id',
       'y', 'total_sales_units', 'block_sales_units',
       'number_of_days_since_beginning', 'first_block_sale', 'last_block_sale',
       'first_day_sale', 'last_day_sale', 'first_year_sale', 'last_year_sale',
       'total_days_of_sales', 'average_sales_units_day',
       'total_blocks_of_sales', 'average_sales_units_block',
       'number_of_days_with_a_sale', 'number_of_blocks_with_a_sale',
       'max_day_sale', 'min_day_sale', 'max_block_sale', 'min_block_sale',
       'max_stretch_in_days_without_sales',
       'max_stretch_in_blocks_without_sales', 'max_stretch_in_days_with_sales',
       'max_stretch_in_blocks_with_sales', 'average_days_between_sales',
       'possibly_released_during_period', 'share_of_total_sold', 'gross',
       'item_gross', 'share_of_total_gross', 'sold_two_years',
       'item_min_price', 'item_max_price',