# Librerías

In [128]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import os
import seaborn as sns

%matplotlib inline

# Datos

In [129]:
train = pd.read_csv('../data/prepro_train_pw.csv')
test = pd.read_csv('../data/prepro_test_pw.csv')

In [130]:
train.head()

Unnamed: 0,week,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position,category_id,price,sales
0,0,310130,1726,1383,34.811328,679611953,533441312,7,1,3.0,3,12.95,33
1,0,1178388,592,60,2.160247,732697347,691762817,4,1,19.0,1,49.95,0
2,0,1561460,1625,2373,55.438769,396066037,520569701,5,1,38.0,3,29.95,21
3,0,1874414,1135,1686,20.463906,744793598,811402796,6,1,12.0,6,25.95,24
4,0,2436420,779,245,23.377339,768025921,665805124,5,1,,0,25.95,0


In [131]:
test.head()

Unnamed: 0,week,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position,category_id,price,sales
0,11,151926,1969,2566,31.975592,396066037,335531561,5,1,5.0,2,25.95,-1
1,11,213413,1648,4951,37.013401,552529755,11509337,7,1,57.0,2,19.95,-1
2,11,310130,1726,3108,49.403947,679611953,533441312,7,1,46.0,1,12.95,-1
3,11,455200,1400,348,6.093475,998145072,490222156,3,1,53.0,2,29.95,-1
4,11,571044,1098,1741,41.099558,831347344,750943270,4,2,154.0,2,15.95,-1


In [132]:
test.tail()

Unnamed: 0,week,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position,category_id,price,sales
8887,11,999772605,1912,1585,22.077884,775013441,383177575,1,2,9.0,6,15.95,-1
8888,11,999794342,1015,18,0.504525,775013441,957377364,1,1,9.0,5,29.95,-1
8889,11,999816749,296,1283,16.369746,759754297,395002630,5,1,42.0,2,5.95,-1
8890,11,999862351,1794,950,4.288111,396066037,943122287,4,2,176.0,2,25.95,-1
8891,11,999936664,2218,1565,7.317276,396066037,739564025,8,1,31.0,1,15.95,-1


In [133]:
test[test.product_id == 151926]

Unnamed: 0,week,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position,category_id,price,sales
0,11,151926,1969,2566,31.975592,396066037,335531561,5,1,5.0,2,25.95,-1


#### Frequency encoding

Para las categorias `family_id` y `subfamily_id`

In [134]:
train = train.drop('week', axis=1)
test = test.drop('week', axis=1)

In [135]:
encoding_train = train.groupby('family_id').size()
encoding_train = encoding_train/len(train)
train['family_id'] = train.family_id.map(encoding_train)
encoding_train = train.groupby('subfamily_id').size()
encoding_train = encoding_train/len(train)
train['subfamily_id'] = train.subfamily_id.map(encoding_train)

encoding_test = test.groupby('family_id').size()
encoding_test = encoding_test/len(test)
test['family_id'] = test.family_id.map(encoding_test)
encoding_test = test.groupby('subfamily_id').size()
encoding_test = encoding_test/len(test)
test['subfamily_id'] = test.subfamily_id.map(encoding_test)

In [136]:
train['stock_lag1'] = train.groupby(['product_id'])['stock'].shift(1)
test['stock_lag1'] = test.groupby(['product_id'])['stock'].shift(1)
train['stock_lead1'] = train.groupby(['product_id'])['stock'].shift(-1)
test['stock_lead1'] = test.groupby(['product_id'])['stock'].shift(-1)

train['std_stock_shift1'] = train[['stock', 'stock_lag1', 'stock_lead1']].std(axis=1)
test['std_stock_shift1'] = test[['stock', 'stock_lag1', 'stock_lead1']].std(axis=1)

train['mean_stock_shift1'] = train[['stock', 'stock_lag1', 'stock_lead1']].mean(axis=1)
test['mean_stock_shift1'] = test[['stock', 'stock_lag1', 'stock_lead1']].mean(axis=1)

train['min_stock_shift1'] = train[['stock', 'stock_lag1', 'stock_lead1']].min(axis=1)
test['min_stock_shift1'] = test[['stock', 'stock_lag1', 'stock_lead1']].min(axis=1)

train['max_stock_shift1'] = train[['stock', 'stock_lag1', 'stock_lead1']].max(axis=1)
test['max_stock_shift1'] = test[['stock', 'stock_lag1', 'stock_lead1']].max(axis=1)

train['median_stock_shift1'] = train[['stock', 'stock_lag1', 'stock_lead1']].median(axis=1)
test['median_stock_shift1'] = test[['stock', 'stock_lag1', 'stock_lead1']].median(axis=1)

train['stock_lag2'] = train.groupby(['product_id'])['stock'].shift(2)
test['stock_lag2'] = test.groupby(['product_id'])['stock'].shift(2)
train['stock_lead2'] = train.groupby(['product_id'])['stock'].shift(-2)
test['stock_lead2'] = test.groupby(['product_id'])['stock'].shift(-2)

train['std_stock_shift2'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].std(axis=1)
test['std_stock_shift2'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].std(axis=1)

train['min_stock_shift2'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].min(axis=1)
test['min_stock_shift2'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].min(axis=1)

train['max_stock_shift2'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].max(axis=1)
test['max_stock_shift2'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].max(axis=1)

train['median_stock_shift2'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].median(axis=1)
test['median_stock_shift2'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].median(axis=1)

train['mean_stock_shift2'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].mean(axis=1)
test['mean_stock_shift2'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2']].mean(axis=1)

In [137]:
train['pos_lag1'] = train.groupby(['product_id'])['position'].shift(1)
test['pos_lag1'] = test.groupby(['product_id'])['position'].shift(1)
train['pos_lead1'] = train.groupby(['product_id'])['position'].shift(-1)
test['pos_lead1'] = test.groupby(['product_id'])['position'].shift(-1)

train['std_pos_shift1'] = train[['position', 'pos_lag1', 'pos_lead1']].std(axis=1)
test['std_pos_shift1'] = test[['position', 'pos_lag1', 'pos_lead1']].std(axis=1)

train['mean_pos_shift1'] = train[['position', 'pos_lag1', 'pos_lead1']].mean(axis=1)
test['mean_pos_shift1'] = test[['position', 'pos_lag1', 'pos_lead1']].mean(axis=1)

train['min_pos_shift1'] = train[['position', 'pos_lag1', 'pos_lead1']].min(axis=1)
test['min_pos_shift1'] = test[['position', 'pos_lag1', 'pos_lead1']].min(axis=1)

train['max_pos_shift1'] = train[['position', 'pos_lag1', 'pos_lead1']].max(axis=1)
test['max_pos_shift1'] = test[['position', 'pos_lag1', 'pos_lead1']].max(axis=1)

train['median_pos_shift1'] = train[['position', 'pos_lag1', 'pos_lead1']].median(axis=1)
test['median_pos_shift1'] = test[['position', 'pos_lag1', 'pos_lead1']].median(axis=1)

train['pos_lag2'] = train.groupby(['product_id'])['position'].shift(2)
test['pos_lag2'] = test.groupby(['product_id'])['position'].shift(2)
train['pos_lead2'] = train.groupby(['product_id'])['position'].shift(-2)
test['pos_lead2'] = test.groupby(['product_id'])['position'].shift(-2)

train['std_pos_shift2'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].std(axis=1)
test['std_pos_shift2'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].std(axis=1)

train['min_pos_shift2'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].min(axis=1)
test['min_pos_shift2'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].min(axis=1)

train['max_pos_shift2'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].max(axis=1)
test['max_pos_shift2'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].max(axis=1)

train['median_pos_shift2'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].median(axis=1)
test['median_pos_shift2'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].median(axis=1)

train['mean_pos_shift2'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].mean(axis=1)
test['mean_pos_shift2'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2']].mean(axis=1)

In [138]:
train['diff_stock_lead1'] = train.stock - train.stock_lead1
train['diff_stock_lead2'] = train.stock - train.stock_lead2
train['diff_stock_lag1'] = train.stock - train.stock_lag1
train['diff_stock_lag2'] = train.stock - train.stock_lag2

test['diff_stock_lead1'] = test.stock - test.stock_lead1
test['diff_stock_lead2'] = test.stock - test.stock_lead2
test['diff_stock_lag1'] = test.stock - test.stock_lag1
test['diff_stock_lag2'] = test.stock - test.stock_lag2

In [139]:
train['diff_pos_lead1'] = train.position - train.pos_lead1
train['diff_pos_lead2'] = train.position - train.pos_lead2
train['diff_pos_lag1'] = train.position - train.pos_lag1
train['diff_pos_lag2'] = train.position - train.pos_lag2

test['diff_pos_lead1'] = test.position - test.pos_lead1
test['diff_pos_lead2'] = test.position - test.pos_lead2
test['diff_pos_lag1'] = test.position - test.pos_lag1
test['diff_pos_lag2'] = test.position - test.pos_lag2

In [140]:
train['std_stock_shift3'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].std(axis=1)
test['std_stock_shift3'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].std(axis=1)

train['mean_stock_shift3'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].mean(axis=1)
test['mean_stock_shift3'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].mean(axis=1)

train['min_stock_shift3'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].min(axis=1)
test['min_stock_shift3'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].min(axis=1)

train['max_stock_shift3'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].max(axis=1)
test['max_stock_shift3'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].max(axis=1)

train['median_stock_shift3'] = train[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].median(axis=1)
test['median_stock_shift3'] = test[['stock', 'stock_lag1', 'stock_lead1',
                                   'stock_lag2', 'stock_lead2', 'diff_stock_lead1',
                                   'diff_stock_lead2', 'diff_stock_lag1', 'diff_stock_lag2']].median(axis=1)

In [141]:
train['std_pos_shift3'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].std(axis=1)
test['std_pos_shift3'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].std(axis=1)

train['mean_pos_shift3'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].mean(axis=1)
test['mean_pos_shift3'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].mean(axis=1)

train['min_pos_shift3'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].min(axis=1)
test['min_pos_shift3'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].min(axis=1)

train['max_pos_shift3'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].max(axis=1)
test['max_pos_shift3'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].max(axis=1)

train['median_pos_shift3'] = train[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].median(axis=1)
test['median_pos_shift3'] = test[['position', 'pos_lag1', 'pos_lead1',
                                   'pos_lag2', 'pos_lead2', 'diff_pos_lead1',
                                   'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2']].median(axis=1)

In [142]:
train['size_p_color'] = train.size_id * train.color_id
test['size_p_color'] = test.size_id * test.color_id

In [143]:
train['size_d_color'] = train.size_id / train.color_id
test['size_d_color'] = test.size_id / test.color_id

In [144]:
train['ratio_position'] = train.position / train.position.max()
test['ratio_position'] = test.position / test.position.max()

#### Ratios

*desctivado*

In [145]:
# def gen_ratios(df, c):
#     for col in c:
#         df['ratio_{}'.format(col)] = train[col] / train[col].max()
        
#     return df

In [146]:
# cols_ratios = ['median_stock_shift3', 'mean_stock_shift3', 'max_stock_shift3', 'min_stock_shift3',
#               'median_stock_shift2', 'mean_stock_shift2', 'max_stock_shift2', 'min_stock_shift2',
#               'median_stock_shift1', 'mean_stock_shift1', 'max_stock_shift1', 'min_stock_shift1',
#                'median_pos_shift3', 'mean_pos_shift3', 'max_pos_shift3', 'min_pos_shift3',
#               'median_pos_shift2', 'mean_pos_shift2', 'max_pos_shift2', 'min_pos_shift2',
#               'median_pos_shift1', 'mean_pos_shift1', 'max_pos_shift1', 'min_pos_shift1']

# train = gen_ratios(train, cols_ratios)
# test = gen_ratios(test, cols_ratios)

In [147]:
train.head()

Unnamed: 0,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position,category_id,...,max_stock_shift3,median_stock_shift3,std_pos_shift3,mean_pos_shift3,min_pos_shift3,max_pos_shift3,median_pos_shift3,size_p_color,size_d_color,ratio_position
0,310130,1726,1383,34.811328,0.007081,0.000583,7,1,3.0,3,...,14229.0,1383.0,5.357238,1.8,-6.0,9.0,2.0,7,7.0,0.003812
1,1178388,592,60,2.160247,0.049343,0.012578,4,1,19.0,1,...,564.0,60.0,60.977865,11.4,-76.0,95.0,19.0,4,4.0,0.024142
2,1561460,1625,2373,55.438769,0.123499,0.023558,5,1,38.0,3,...,16089.0,2373.0,51.001961,22.8,-44.0,82.0,38.0,5,5.0,0.048285
3,1874414,1135,1686,20.463906,0.11092,0.016119,6,1,12.0,6,...,12566.0,1686.0,7.049823,7.2,-1.0,13.0,12.0,6,6.0,0.015248
4,2436420,779,245,23.377339,0.025814,0.004422,5,1,,0,...,1743.0,245.0,,,,,,5,5.0,


In [148]:
# max_std_train1 = train.groupby('product_id').max()['std_stock_shift1']
# max_std_train1.name = 'max_std_stock_shift1'
# max_std_test1 = test.groupby('product_id').max()['std_stock_shift1']
# max_std_test1.name = 'max_std_stock_shift1'

# train = train.join(max_std_train1, on='product_id', how='left')
# test = test.join(max_std_test1, on='product_id', how='left')

# train['ratio_std_shift1'] = train.std_stock_shift1 / train.max_std_stock_shift1
# test['ratio_std_shift1'] = test.std_stock_shift1 / test.max_std_stock_shift1

# train = train.drop('max_std_stock_shift1', axis=1)
# test = test.drop('max_std_stock_shift1', axis=1)

# max_std_train2 = train.groupby('product_id').max()['std_stock_shift2']
# max_std_train2.name = 'max_std_stock_shift2'
# max_std_test2 = test.groupby('product_id').max()['std_stock_shift2']
# max_std_test2.name = 'max_std_stock_shift2'

# train = train.join(max_std_train2, on='product_id', how='left')
# test = test.join(max_std_test2, on='product_id', how='left')

# train['ratio_std_shift2'] = train.std_stock_shift2 / train.max_std_stock_shift2
# test['ratio_std_shift2'] = test.std_stock_shift2 / test.max_std_stock_shift2

# train = train.drop('max_std_stock_shift2', axis=1)
# test = test.drop('max_std_stock_shift2', axis=1)

In [149]:
train.head()

Unnamed: 0,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position,category_id,...,max_stock_shift3,median_stock_shift3,std_pos_shift3,mean_pos_shift3,min_pos_shift3,max_pos_shift3,median_pos_shift3,size_p_color,size_d_color,ratio_position
0,310130,1726,1383,34.811328,0.007081,0.000583,7,1,3.0,3,...,14229.0,1383.0,5.357238,1.8,-6.0,9.0,2.0,7,7.0,0.003812
1,1178388,592,60,2.160247,0.049343,0.012578,4,1,19.0,1,...,564.0,60.0,60.977865,11.4,-76.0,95.0,19.0,4,4.0,0.024142
2,1561460,1625,2373,55.438769,0.123499,0.023558,5,1,38.0,3,...,16089.0,2373.0,51.001961,22.8,-44.0,82.0,38.0,5,5.0,0.048285
3,1874414,1135,1686,20.463906,0.11092,0.016119,6,1,12.0,6,...,12566.0,1686.0,7.049823,7.2,-1.0,13.0,12.0,6,6.0,0.015248
4,2436420,779,245,23.377339,0.025814,0.004422,5,1,,0,...,1743.0,245.0,,,,,,5,5.0,


In [150]:
train[train.product_id==151926]

Unnamed: 0,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position,category_id,...,max_stock_shift3,median_stock_shift3,std_pos_shift3,mean_pos_shift3,min_pos_shift3,max_pos_shift3,median_pos_shift3,size_p_color,size_d_color,ratio_position
58550,151926,1969,5372,27.050797,0.123499,0.031162,5,1,62.0,4,...,5372.0,5372.0,,62.0,62.0,62.0,62.0,5,5.0,0.07878


#### Frequency encoding por grupos

Útil para series temporales

In [151]:
agrupacion = ['subfamily_id', 'size_id', 'color_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N1'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N1'
test = test.join(a, on=agrupacion, how='left')

In [152]:
agrupacion = ['subfamily_id', 'size_id', 'color_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N2'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N2'
test = test.join(a, on=agrupacion, how='left')

In [153]:
agrupacion = ['subfamily_id', 'family_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N3'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N3'
test = test.join(a, on=agrupacion, how='left')

In [154]:
agrupacion = ['product_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N5'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N5'
test = test.join(a, on=agrupacion, how='left')

In [155]:
agrupacion = ['product_id', 'size_id', 'color_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N6'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N6'
test = test.join(a, on=agrupacion, how='left')

In [156]:
agrupacion = ['size_id', 'color_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N7'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N7'
test = test.join(a, on=agrupacion, how='left')

In [157]:
agrupacion = ['product_id']
a = train.groupby(agrupacion)['family_id'].count()
a.name = 'N8'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['family_id'].count()
a.name = 'N8'
test = test.join(a, on=agrupacion, how='left')

In [158]:
agrupacion = ['price', 'family_id', 'subfamily_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N9'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N9'
test = test.join(a, on=agrupacion, how='left')

In [159]:
agrupacion = ['price', 'family_id', 'subfamily_id', 'size_id', 'color_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N10'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N10'
test = test.join(a, on=agrupacion, how='left')

In [160]:
agrupacion = ['block_id', 'size_id', 'color_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N12'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N12'
test = test.join(a, on=agrupacion, how='left')

In [161]:
agrupacion = ['family_id', 'subfamily_id', 'size_id', 'color_id']
a = train.groupby(agrupacion)['product_id'].count()
a.name = 'N13'
train = train.join(a, on=agrupacion, how='left')

a = test.groupby(agrupacion)['product_id'].count()
a.name = 'N13'
test = test.join(a, on=agrupacion, how='left')

In [162]:
# n = 'N12'
# train = train.drop(n, axis=1)
# test = test.drop(n, axis=1)

In [163]:
train.head()

Unnamed: 0,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position,category_id,...,N2,N3,N5,N6,N7,N8,N9,N10,N12,N13
0,310130,1726,1383,34.811328,0.007081,0.000583,7,1,3.0,3,...,13,39,10,10,6184,10,10,10,17,13
1,1178388,592,60,2.160247,0.049343,0.012578,4,1,19.0,1,...,412,756,10,10,10634,10,208,129,10,412
2,1561460,1625,2373,55.438769,0.123499,0.023558,5,1,38.0,3,...,480,1577,10,10,8208,10,1130,380,12,480
3,1874414,1135,1686,20.463906,0.11092,0.016119,6,1,12.0,6,...,445,954,10,10,5295,10,268,69,10,423
4,2436420,779,245,23.377339,0.025814,0.004422,5,1,,0,...,151,296,10,10,8208,10,27,17,19,151


In [164]:
train.to_csv('../data/final_train_pw.csv', index=None)
test.to_csv('../data/final_test_pw.csv', index=None)

# Prueba de modelo

In [91]:
train.columns

Index(['product_id', 'block_id', 'stock', 'std_stock', 'family_id',
       'subfamily_id', 'size_id', 'color_id', 'position', 'category_id',
       'price', 'sales', 'stock_lag1', 'stock_lead1', 'std_stock_shift1',
       'mean_stock_shift1', 'min_stock_shift1', 'max_stock_shift1',
       'median_stock_shift1', 'stock_lag2', 'stock_lead2', 'std_stock_shift2',
       'min_stock_shift2', 'max_stock_shift2', 'median_stock_shift2',
       'mean_stock_shift2', 'pos_lag1', 'pos_lead1', 'std_pos_shift1',
       'mean_pos_shift1', 'min_pos_shift1', 'max_pos_shift1',
       'median_pos_shift1', 'pos_lag2', 'pos_lead2', 'std_pos_shift2',
       'min_pos_shift2', 'max_pos_shift2', 'median_pos_shift2',
       'mean_pos_shift2', 'diff_stock_lead1', 'diff_stock_lead2',
       'diff_stock_lag1', 'diff_stock_lag2', 'diff_pos_lead1',
       'diff_pos_lead2', 'diff_pos_lag1', 'diff_pos_lag2', 'std_stock_shift3',
       'mean_stock_shift3', 'min_stock_shift3', 'max_stock_shift3',
       'median_stock_shi

In [92]:
drop_cols = ['sales', 'date_number', 'product_id', 'block_id','ratio_position', 'ratio_median_stock_shift3',
       'ratio_mean_stock_shift3', 'ratio_max_stock_shift3',
       'ratio_min_stock_shift3', 'ratio_median_stock_shift2',
       'ratio_mean_stock_shift2', 'ratio_max_stock_shift2',
       'ratio_min_stock_shift2', 'ratio_median_stock_shift1',
       'ratio_mean_stock_shift1', 'ratio_max_stock_shift1',
       'ratio_min_stock_shift1', 'ratio_median_pos_shift3',
       'ratio_mean_pos_shift3', 'ratio_max_pos_shift3', 'ratio_min_pos_shift3',
       'ratio_median_pos_shift2', 'ratio_mean_pos_shift2',
       'ratio_max_pos_shift2', 'ratio_min_pos_shift2',
       'ratio_median_pos_shift1', 'ratio_mean_pos_shift1',
       'ratio_max_pos_shift1', 'ratio_min_pos_shift1'
#              'position_max', 'position_min', 'std_position', 'ratio_std_pos', 'diff_position'
            ]

col_sel = [
 'std_stock_shift1',
 'std_stock_shift2',
 'ratio_std_stock',
 'ratio_std_shift1',
 'ratio_std_shift2',]

X = train.loc[:,[c for c in train.columns if c not in drop_cols]]
# X_train = train.loc[:,[c for c in train.columns if c not in drop_cols]]
# X = train[ft_imp]
y = train[['sales']]
# y_train = train[['sales']]

In [93]:
from sklearn.model_selection import train_test_split

In [94]:
# X = X.fillna(-1)

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [96]:
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

In [110]:
model_name = 'lgbm'

params = {'max_depth':7,
          'metric':'mae',
          'max_delta_step': 0.2,
          'n_estimators':50000,
          'learning_rate':0.1,
          'colsample_bytree':0.6,
          'objective':'regression',
          'n_jobs':8,
          'seed':42,
          'lambda_l1':0,
          'lambda_l2':0,
#           'max_bin': 14,
#           'bagging_fraction':0.8,
         }

lgb_model = lgb.LGBMRegressor(**params)

In [111]:
train_ids = X_train.index
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [112]:
counter = 1
be = 0
ft_importances = np.zeros(X_train.shape[1])
full_preds = np.zeros(X_test.shape[0])
for train_index, test_index in skf.split(train_ids, y_train):
    print('--- Fold k {}:'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
    
    y_val = np.log1p(y_val)
    y_fit = np.log1p(y_fit)

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=500,
                  early_stopping_rounds=50)

    ft_importances += lgb_model.feature_importances_

    be += np.expm1(lgb_model.best_score_['valid_0']['l1'])
#     be += lgb_model.best_score_['valid_0']['l1']
#     print('Score en el test:',mean_absolute_error(lgb_model.predict(X_test), y_test),'ventas')
    y_preds = np.expm1(lgb_model.predict(X_test))
    full_preds += y_preds
    y_preds = [int(round(x)) for x in y_preds]
    print('Score en el test:',round(mean_absolute_error(y_preds, y_test), 3),'"ventas" de error absoluto medio')
    print('Predicciones:', y_preds[:20])
    print('Ventas reales:', y_test[:20].sales.tolist(),'\n\n')
    
    counter += 1
    
# print('\n\nBEST SCORE MEAN:', be / k,'SALES :)')

full_preds = full_preds/k
full_preds = [int(round(x)) for x in full_preds]

print('IMPORTANCIA DE LAS VARIABLES:\n')
ft_importances = lgb_model.feature_importances_
imp = pd.DataFrame({'feature': X_train.columns, 'importance': ft_importances/k})
df_imp_sort = imp.sort_values('importance', ascending=False)
df_imp_sort

--- Fold k 1:
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's l1: 0.483135
[1000]	valid_0's l1: 0.467951
[1500]	valid_0's l1: 0.462525
Early stopping, best iteration is:
[1890]	valid_0's l1: 0.460034
Score en el test: 35.884 "ventas" de error absoluto medio
Predicciones: [2, 22, 1, 178, 137, 0, 14, 0, 60, 38, 148, 10, 49, 2, 36, 19, 80, 4, 0, 67]
Ventas reales: [0, 96, 0, 222, 95, 0, 14, 0, 81, 28, 192, 56, 46, 0, 12, 17, 24, 10, 0, 104] 


--- Fold k 2:
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's l1: 0.481129
[1000]	valid_0's l1: 0.466039
[1500]	valid_0's l1: 0.460985
[2000]	valid_0's l1: 0.458056
[2500]	valid_0's l1: 0.456144
Early stopping, best iteration is:
[2487]	valid_0's l1: 0.456045
Score en el test: 35.646 "ventas" de error absoluto medio
Predicciones: [1, 39, 1, 128, 154, 0, 25, 0, 40, 47, 175, 10, 42, 1, 37, 21, 62, 5, 0, 67]
Ventas reales: [0, 96, 0, 222, 95, 0, 14, 0, 81, 28, 192, 56, 46, 0, 12, 17, 24, 10, 0

Unnamed: 0,feature,importance
1,std_stock,434.6
37,diff_stock_lead1,403.8
38,diff_stock_lead2,375.4
39,diff_stock_lag1,345.4
11,std_stock_shift1,280.4
64,N9,274.8
17,stock_lead2,269.8
18,std_stock_shift2,266.2
32,std_pos_shift2,251.8
40,diff_stock_lag2,251.4


In [113]:
ft_imp = df_imp_sort.feature.values.tolist()[:10]

# Predicciones por bloque

In [114]:
product_blocks = pd.read_csv('../data/product_blocks.csv')
product_blocks.head(10)

Unnamed: 0,product_id,block_id
0,612967398,0
1,296892108,0
2,139541214,0
3,963923934,0
4,938230141,0
5,172045154,0
6,663552768,0
7,160621689,1
8,948976891,1
9,556017319,1


In [115]:
productos_por_bloque = product_blocks.groupby('block_id').count()['product_id']
productos_por_bloque.name = 'n_products'
productos_por_bloque.head()

block_id
0     7
1     7
2     7
3     6
4    10
Name: n_products, dtype: int64

In [116]:
product_blocks_n = product_blocks.join(productos_por_bloque, on='block_id', how='left')
product_blocks_n.head()

Unnamed: 0,product_id,block_id,n_products
0,612967398,0,7
1,296892108,0,7
2,139541214,0,7
3,963923934,0,7
4,938230141,0,7


In [117]:
predicciones = pd.DataFrame({'product_id': train.iloc[X_test.index.values.tolist()].product_id,
                             'preds': y_preds,
                             'price':train.iloc[X_test.index.values.tolist()].price,
                             'gain': y_preds * train.iloc[X_test.index.values.tolist()].price})

predicciones = predicciones.sort_values('gain', ascending=False)
predicciones.head()

Unnamed: 0,product_id,preds,price,gain
12514,450385036,4882,19.95,97395.9
5300,112446414,2951,25.95,76578.45
31492,450385036,3192,19.95,63680.4
20887,861287590,2198,25.95,57038.1
16889,209848499,2822,19.95,56298.9


In [118]:
predicciones_final = predicciones.join(product_blocks_n.set_index('product_id'),
                                       on='product_id', how='left').reset_index(drop=True)
predicciones_final.head()

Unnamed: 0,product_id,preds,price,gain,block_id,n_products
0,450385036,4882,19.95,97395.9,2233,8
1,112446414,2951,25.95,76578.45,2306,3
2,450385036,3192,19.95,63680.4,2233,8
3,861287590,2198,25.95,57038.1,2037,4
4,209848499,2822,19.95,56298.9,1874,9


In [119]:
group_block_gain = predicciones_final.groupby('block_id').sum().sort_values('gain', ascending=False)['gain']
group_block_gain.name = 'gain_per_block'

In [120]:
preds_final = predicciones_final.join(group_block_gain, on='block_id')
preds_final = preds_final.sort_values('gain_per_block', ascending=False)
preds_final = preds_final.reset_index(drop=True)

In [121]:
preds_final.head()

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block
0,450385036,4882,19.95,97395.9,2233,8,211988.85
1,921904870,62,22.95,1422.9,2233,8,211988.85
2,577440236,2,22.95,45.9,2233,8,211988.85
3,921904870,15,22.95,344.25,2233,8,211988.85
4,825443526,25,25.95,648.75,2233,8,211988.85


In [122]:
preds_final = preds_final.iloc[preds_final.block_id.drop_duplicates().index.values.tolist()]
preds_final = preds_final.sort_values('gain_per_block', ascending=False)

In [123]:
preds_final.head()

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block
0,450385036,4882,19.95,97395.9,2233,8,211988.85
9,313178369,864,59.95,51796.8,918,9,128799.35
16,474344355,19,12.95,246.05,63,7,106233.1
27,861287590,1451,25.95,37653.45,2037,4,101945.5
32,519578166,1400,25.95,36330.0,1104,3,99673.95


In [124]:
# predicciones_final = predicciones_final.iloc[predicciones_final.block_id.drop_duplicates().index.values.tolist()]
# predicciones_final.head()

In [125]:
preds_final['product_cumsum'] = preds_final.n_products.cumsum()
preds_final.head()

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block,product_cumsum
0,450385036,4882,19.95,97395.9,2233,8,211988.85,8
9,313178369,864,59.95,51796.8,918,9,128799.35,17
16,474344355,19,12.95,246.05,63,7,106233.1,24
27,861287590,1451,25.95,37653.45,2037,4,101945.5,28
32,519578166,1400,25.95,36330.0,1104,3,99673.95,31


In [126]:
bet_blocks = preds_final[preds_final.product_cumsum <= 100]
bet_blocks

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block,product_cumsum
0,450385036,4882,19.95,97395.9,2233,8,211988.85,8
9,313178369,864,59.95,51796.8,918,9,128799.35,17
16,474344355,19,12.95,246.05,63,7,106233.1,24
27,861287590,1451,25.95,37653.45,2037,4,101945.5,28
32,519578166,1400,25.95,36330.0,1104,3,99673.95,31
35,112446414,2951,25.95,76578.45,2306,3,92667.45,34
37,245944269,22,25.95,570.9,2510,9,89694.0,43
47,775724247,63,22.95,1445.85,2578,5,86499.4,48
53,638628517,971,19.95,19371.45,2574,10,85753.15,58
64,923532000,0,22.95,0.0,726,10,84589.85,68


### Bloques para la apuesta final:

In [127]:
bet_blocks.block_id

0     2233
9      918
16      63
27    2037
32    1104
35    2306
37    2510
47    2578
53    2574
64     726
74     387
82    2223
94     728
Name: block_id, dtype: int64

In [43]:
from sklearn.linear_model import Lasso

In [44]:
lasso = Lasso()

In [45]:
train_ids = X_train.index
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [46]:
counter = 1
be = 0
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold k {}\n'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
    
    X_fit = X_fit.fillna(-1)
    X_val = X_val.fillna(-1)
    
#     y_val = np.log1p(y_val)
#     y_fit = np.log1p(y_fit)
    
    lasso.fit(X_fit, y_fit)
    
    print('Score en el test:',mean_absolute_error(lasso.predict(X_test.fillna(-1)), y_test),'ventas')
#     print('Score en el test:',mean_absolute_error(np.expm1(lasso.predict(X_test.fillna(-1))), y_test),'ventas')
    
    counter += 1
    
    
# print('\n\nBEST SCORE MEAN:', be / k,'SALES :)')

Fold k 1

Score en el test: 11.170156919880345 ventas
Fold k 2

Score en el test: 11.252111454322224 ventas
Fold k 3



KeyboardInterrupt: 

In [64]:
from sklearn.model_selection import TimeSeriesSplit

In [67]:
tscv = TimeSeriesSplit(n_splits=5)

In [68]:
counter = 1
be = 0
for train_index, test_index in tscv.split(X):
    X_fit, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fit, y_val = y.iloc[train_index], y.iloc[test_index]
    
#     y_val = np.log1p(y_val)
#     y_fit = np.log1p(y_fit)
    
    
    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=1000,
                  early_stopping_rounds=20)


#     be += np.expm1(lgb_model.best_score_['valid_0']['l1'])
    be += lgb_model.best_score_['valid_0']['l1']
    
#     print('Score en el test:',mean_absolute_error(lgb_model.predict(X_test), y_test),'ventas')
#     print('Score en el test:',mean_absolute_error(np.expm1(lgb_model.predict(X_test)), y_test),'ventas')
    
    counter += 1

Training until validation scores don't improve for 20 rounds.
[1000]	valid_0's l1: 14.9035
[2000]	valid_0's l1: 14.148
[3000]	valid_0's l1: 13.7446
Early stopping, best iteration is:
[3494]	valid_0's l1: 13.6114
Training until validation scores don't improve for 20 rounds.
[1000]	valid_0's l1: 12.3938
[2000]	valid_0's l1: 11.7195
[3000]	valid_0's l1: 11.3735
Early stopping, best iteration is:
[3420]	valid_0's l1: 11.2791
Training until validation scores don't improve for 20 rounds.
[1000]	valid_0's l1: 10.2273
Early stopping, best iteration is:
[1032]	valid_0's l1: 10.1833
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[823]	valid_0's l1: 9.79996
Training until validation scores don't improve for 20 rounds.
[1000]	valid_0's l1: 8.96911
Early stopping, best iteration is:
[1033]	valid_0's l1: 8.93757
