# Librerías

In [2]:
import warnings
warnings.filterwarnings("ignore")

import os

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

# Predicciones en csv

In [17]:
# Load the per-model prediction files, one DataFrame per model.
preds_lightgbm, preds_catboost, preds_xgboost = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../predictions/preds_lightgbm.csv',
        '../predictions/preds_catboost.csv',
        '../predictions/preds_xgboost.csv',
    )
)

In [20]:
# Put the three models' predictions side by side, matching rows on the
# 'Unnamed: 0' column that every prediction file carries.
full_preds = (
    preds_lightgbm
    .join(preds_catboost.set_index('Unnamed: 0'), on='Unnamed: 0')
    .join(preds_xgboost.set_index('Unnamed: 0'), on='Unnamed: 0')
)
full_preds.head(10)

Unnamed: 0.1,Unnamed: 0,predicciones_lightgbm,predicciones_catboost,predicciones_xgboost
0,0,38,35,23
1,1,41,46,53
2,2,2,2,1
3,3,13,15,10
4,4,38,41,37
5,5,47,45,48
6,6,46,39,32
7,7,0,0,0
8,8,1,1,1
9,9,30,32,26


In [22]:
# Equal-weight ensemble: arithmetic mean of the three model predictions.
# skipna=False keeps a missing prediction as NaN in the average, exactly
# like adding the three columns with `+` does.
full_preds['media'] = full_preds[
    ['predicciones_lightgbm', 'predicciones_catboost', 'predicciones_xgboost']
].sum(axis=1, skipna=False) / 3
full_preds.head(10)

Unnamed: 0.1,Unnamed: 0,predicciones_lightgbm,predicciones_catboost,predicciones_xgboost,media
0,0,38,35,23,32.0
1,1,41,46,53,46.666667
2,2,2,2,1,1.666667
3,3,13,15,10,12.666667
4,4,38,41,37,38.666667
5,5,47,45,48,46.666667
6,6,46,39,32,39.0
7,7,0,0,0,0.0
8,8,1,1,1,1.0
9,9,30,32,26,29.333333


In [23]:
# Round the averaged prediction to the nearest integer (ties go to the even
# neighbour) and keep the result as a plain Python list of ints.
y_preds = full_preds['media'].round().astype(int).tolist()

In [24]:
# Show the first 20 entries of y_preds.
y_preds[:20]

[32, 47, 2, 13, 39, 47, 39, 0, 1, 29, 7, 35, 1, 0, 52, 116, 92, 32, 28, 93]

# Apuesta de bloque

In [25]:
# Product -> block assignment table (columns: product_id, block_id).
product_blocks = pd.read_csv(
    '../data/product_blocks.csv',
)
product_blocks.head(n=10)

Unnamed: 0,product_id,block_id
0,612967398,0
1,296892108,0
2,139541214,0
3,963923934,0
4,938230141,0
5,172045154,0
6,663552768,0
7,160621689,1
8,948976891,1
9,556017319,1


In [26]:
# Number of (non-null) product ids in each block, as a Series named
# 'n_products' indexed by block_id.
productos_por_bloque = (
    product_blocks
    .groupby('block_id')['product_id']
    .count()
    .rename('n_products')
)
productos_por_bloque.head()

block_id
0     7
1     7
2     7
3     6
4    10
Name: n_products, dtype: int64

In [27]:
# Attach the size of its block to every product row by looking each
# block_id up in the per-block counts.
product_blocks_n = product_blocks.assign(
    n_products=product_blocks['block_id'].map(productos_por_bloque)
)
product_blocks_n.head()

Unnamed: 0,product_id,block_id,n_products
0,612967398,0,7
1,296892108,0,7
2,139541214,0,7
3,963923934,0,7
4,938230141,0,7


In [33]:
# Read only the product_id and price columns from the final test file.
test = pd.read_csv('../data/final_test.csv', usecols=['product_id', 'price'])

In [34]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,product_id,price
0,151926,25.95
1,213413,19.95
2,310130,12.95
3,455200,29.95
4,571044,15.95


In [38]:
# One row per test row: predicted units, unit price and the expected revenue
# (gain = predicted units * price), ordered from highest to lowest gain.
# y_preds is matched to the test rows by position.
predicciones = (
    test[['product_id']]
    .assign(preds=y_preds, price=test['price'])
    .assign(gain=lambda frame: frame['preds'] * frame['price'])
    .sort_values('gain', ascending=False)
)
predicciones.head()

Unnamed: 0,product_id,preds,price,gain
5480,617359148,182,69.95,12730.9
2481,276600836,203,55.95,11357.85
4668,518508466,185,59.95,11090.75
1309,146521312,73,149.0,10877.0
3270,362727629,148,69.95,10352.6


In [39]:
# Compare the shape of the predictions frame with the shape of the test frame.
predicciones.shape, test.shape

((62244, 4), (62244, 2))

In [40]:
# Left-merge the block id and block size onto every prediction row; rows keep
# the gain-sorted order and get a fresh 0..n-1 index.
predicciones_final = predicciones.merge(
    product_blocks_n,
    how='left',
    on='product_id',
)
predicciones_final.head()

Unnamed: 0,product_id,preds,price,gain,block_id,n_products
0,617359148,182,69.95,12730.9,2442,4
1,276600836,203,55.95,11357.85,447,7
2,518508466,185,59.95,11090.75,1166,5
3,146521312,73,149.0,10877.0,165,2
4,362727629,148,69.95,10352.6,879,7


In [62]:
# Total expected gain per block, highest first, as a Series named
# 'gain_per_block' indexed by block_id.
# Only the 'gain' column is aggregated: summing every column (product ids,
# prices, block sizes) just to discard the result is wasted work.
group_block_gain = (
    predicciones_final
    .groupby('block_id')['gain']
    .sum()
    .sort_values(ascending=False)
    .rename('gain_per_block')
)

In [63]:
# Add each row's block total, order rows by that total (best block first)
# and renumber the index 0..n-1.
preds_final = (
    predicciones_final
    .join(group_block_gain, on='block_id')
    .sort_values('gain_per_block', ascending=False)
    .reset_index(drop=True)
)

In [64]:
# Preview the first rows of preds_final.
preds_final.head()

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block
0,38527509,1,3.95,3.95,21,10,22526.54
1,38527509,65,3.95,256.75,21,10,22526.54
2,772329315,87,39.95,3475.65,21,10,22526.54
3,38527509,1,3.95,3.95,21,10,22526.54
4,587806269,0,59.95,0.0,21,10,22526.54


In [65]:
# Keep a single row per block (the first one encountered) and order the
# blocks by their total expected gain.
# drop_duplicates selects rows directly, so the result does not depend on the
# index being 0..n-1. Feeding index *labels* to .iloc only works right after a
# reset_index and picks wrong rows (or raises IndexError) when this cell is
# re-run on its own output.
# NOTE: product_id / preds / price / gain on the surviving row belong to one
# arbitrary product of the block; only block_id, n_products and
# gain_per_block are block-level values.
preds_final = (
    preds_final
    .drop_duplicates(subset='block_id')
    .sort_values('gain_per_block', ascending=False)
)

In [66]:
# Preview the first rows of preds_final.
preds_final.head()

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block
0,38527509,1,3.95,3.95,21,10,22526.54
42,33137015,0,29.95,0.0,530,10,17448.65
91,861906184,95,29.95,2845.25,1121,10,17395.25
133,857429269,2,39.95,79.9,2671,8,16191.15
182,868698701,0,19.95,0.0,442,10,15724.95


In [67]:
# Show the rows of preds_final that belong to block 530.
preds_final.query('block_id == 530')

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block
42,33137015,0,29.95,0.0,530,10,17448.65


In [68]:
# Running total of block sizes in the current row order.
preds_final = preds_final.assign(
    product_cumsum=preds_final['n_products'].cumsum()
)
preds_final.head()

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block,product_cumsum
0,38527509,1,3.95,3.95,21,10,22526.54,10
42,33137015,0,29.95,0.0,530,10,17448.65,20
91,861906184,95,29.95,2845.25,1121,10,17395.25,30
133,857429269,2,39.95,79.9,2671,8,16191.15,38
182,868698701,0,19.95,0.0,442,10,15724.95,48


In [69]:
# Keep the rows whose running product count is at most 100.
bet_blocks = preds_final.loc[preds_final['product_cumsum'].le(100)]
bet_blocks

Unnamed: 0,product_id,preds,price,gain,block_id,n_products,gain_per_block,product_cumsum
0,38527509,1,3.95,3.95,21,10,22526.54,10
42,33137015,0,29.95,0.0,530,10,17448.65,20
91,861906184,95,29.95,2845.25,1121,10,17395.25,30
133,857429269,2,39.95,79.9,2671,8,16191.15,38
182,868698701,0,19.95,0.0,442,10,15724.95,48
224,130314002,0,35.95,0.0,418,9,15717.8,57
273,441411673,0,19.95,0.0,1403,10,15263.15,67
308,336539770,0,29.95,0.0,1677,9,15195.9,76
357,579379740,0,69.95,0.0,1409,7,15132.0,83
385,654226641,0,19.95,0.0,1446,10,14989.25,93


In [72]:
# Persist the selected blocks. The output folder is created first so the
# export also works on a fresh checkout, and index=False is the documented
# way to leave the row index out of the file.
os.makedirs('../bets', exist_ok=True)
bet_blocks[['block_id', 'gain_per_block', 'product_cumsum']].to_csv('../bets/week1_bets.csv', index=False)

### Posibles bloques para la apuesta final:

In [70]:
# Block ids of the rows kept in bet_blocks.
bet_blocks.block_id

0        21
42      530
91     1121
133    2671
182     442
224     418
273    1403
308    1677
357    1409
385    1446
Name: block_id, dtype: int64

In [43]:
from sklearn.linear_model import Lasso

In [44]:
# Lasso regressor built with no arguments, i.e. the library defaults.
lasso = Lasso()

In [45]:
# Cross-validation setup: k shuffled, stratified folds with a fixed seed.
# NOTE(review): X_train is not defined in the visible part of this notebook;
# it presumably comes from an earlier session — confirm before running.
k = 5
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=k)
train_ids = X_train.index

In [46]:
# Fit the Lasso on the training part of every stratified fold and report its
# mean absolute error on the held-out test set (X_test / y_test).
# NOTE(review): X_train, y_train, X_test and y_test are not defined in the
# visible part of this notebook; they presumably come from an earlier
# session — confirm before running.
# A log1p-transformed target was tried here before and is currently disabled.

# Loop-invariant: fill the test set's missing values once, not once per fold.
X_test_filled = X_test.fillna(-1)

be = 0  # running sum of the per-fold test MAE
for counter, (train_index, _) in enumerate(skf.split(train_ids, y_train), start=1):
    print('Fold k {}\n'.format(counter))

    # Only the training side of the split is used; scoring is done on the
    # separate test set, so the validation slice is not materialised.
    X_fit = X_train.iloc[train_index, :].fillna(-1)
    y_fit = y_train.iloc[train_index]

    lasso.fit(X_fit, y_fit)

    # scikit-learn's convention is (y_true, y_pred).
    fold_mae = mean_absolute_error(y_test, lasso.predict(X_test_filled))
    print('Score en el test:', fold_mae, 'ventas')
    be += fold_mae

print('\n\nBEST SCORE MEAN:', be / k, 'SALES :)')

Fold k 1

Score en el test: 11.170156919880345 ventas
Fold k 2

Score en el test: 11.252111454322224 ventas
Fold k 3



KeyboardInterrupt: 

In [64]:
from sklearn.model_selection import TimeSeriesSplit

In [67]:
# Time-series splitter with 5 splits.
# NOTE(review): assumes the rows it will split are in chronological order — confirm.
tscv = TimeSeriesSplit(n_splits=5)

In [68]:
# Train lgb_model on every time-series split with early stopping on that
# split's validation part, adding up the best validation L1 of each fold
# in `be`.
# NOTE(review): X, y and lgb_model are not defined in the visible part of
# this notebook; they presumably come from an earlier session — confirm.
# A log1p-transformed target and a test-set MAE printout were tried here
# before and are currently disabled.
be = 0
counter = 1
for fit_idx, val_idx in tscv.split(X):
    X_fit, y_fit = X.iloc[fit_idx, :], y.iloc[fit_idx]
    X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]

    lgb_model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_val, y_val)],
        verbose=1000,
        early_stopping_rounds=20,
    )

    fold_l1 = lgb_model.best_score_['valid_0']['l1']
    be = be + fold_l1

    counter = counter + 1

Training until validation scores don't improve for 20 rounds.
[1000]	valid_0's l1: 14.9035
[2000]	valid_0's l1: 14.148
[3000]	valid_0's l1: 13.7446
Early stopping, best iteration is:
[3494]	valid_0's l1: 13.6114
Training until validation scores don't improve for 20 rounds.
[1000]	valid_0's l1: 12.3938
[2000]	valid_0's l1: 11.7195
[3000]	valid_0's l1: 11.3735
Early stopping, best iteration is:
[3420]	valid_0's l1: 11.2791
Training until validation scores don't improve for 20 rounds.
[1000]	valid_0's l1: 10.2273
Early stopping, best iteration is:
[1032]	valid_0's l1: 10.1833
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[823]	valid_0's l1: 9.79996
Training until validation scores don't improve for 20 rounds.
[1000]	valid_0's l1: 8.96911
Early stopping, best iteration is:
[1033]	valid_0's l1: 8.93757
