In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the train and test splits from the local `data/` directory
# (paths are relative to the notebook's working directory).
full_train = pd.read_csv('data/train.csv')
full_test = pd.read_csv('data/test.csv')
# Show the first rows of the training frame as the cell output.
full_train.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [4]:
full_test.head()

Unnamed: 0,row_id,date,country,store,product
0,26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
1,26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
2,26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
3,26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
4,26302,2019-01-01,Finland,KaggleRama,Kaggle Hat


In [5]:
from sklearn.preprocessing import LabelEncoder
def pre_le(df, features, fit_df=None):
    """Append label-encoded versions of categorical columns to a frame.

    For every name in ``features`` a fresh ``LabelEncoder`` is fitted on the
    reference frame and applied to the same column of ``df``; the resulting
    integer codes are stored in a new ``le_<feature>`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified; an augmented copy is returned.
    features : iterable of str
        Names of the columns to encode.
    fit_df : pandas.DataFrame, optional
        Frame the encoders are fitted on. When omitted, the notebook-level
        ``full_train`` is used, so existing two-argument calls behave exactly
        as before. Passing it explicitly removes the dependency on that global.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one extra ``le_<feature>`` column per feature.

    Raises
    ------
    ValueError
        Propagated from ``LabelEncoder.transform`` when ``df`` contains a
        label that is absent from the reference frame.
    """
    # Fit on a single reference frame so train and test share the same codes.
    reference = full_train if fit_df is None else fit_df
    out = df.copy()
    for feature in features:
        le = LabelEncoder()
        le.fit(reference[feature])
        out[f'le_{feature}'] = le.transform(df[feature])
    return out

In [6]:
# Label-encode the three categorical columns of both splits.
train, test = (
    pre_le(frame, ['country', 'store', 'product'])
    for frame in (full_train, full_test)
)

In [7]:
def ext_ymd(df):
    """Split the ``date`` column into integer ``year`` / ``month`` / ``day`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``date`` column whose values consist of three
        ``-``-separated integer parts (year, month, day).

    Returns
    -------
    pandas.DataFrame
        New frame holding every original column followed by ``year``,
        ``month`` and ``day`` (dtype ``int64``). ``df`` itself is left
        untouched.
    """
    # `np.int` no longer exists in current NumPy and `DataFrame.applymap` is
    # deprecated in pandas; a vectorised cast performs the same conversion.
    parts = df['date'].str.split('-', expand=True).astype('int64')
    parts.columns = ['year', 'month', 'day']
    # concat builds a new frame, so no explicit copy of `df` is needed.
    return pd.concat([df, parts], axis=1)


In [8]:
# Expand the date string of each split into numeric year / month / day columns.
train, test = ext_ymd(train), ext_ymd(test)

In [9]:
# Promote row_id from a regular column to the index of both frames
# (set_index drops the source column by default).
train, test = train.set_index('row_id'), test.set_index('row_id')

In [10]:
# The raw string columns are replaced by their encoded / expanded versions.
drops = ['date', 'country', 'store', 'product']
train = train.drop(columns=drops)
test = test.drop(columns=drops)

In [11]:
train.head()

Unnamed: 0_level_0,num_sold,le_country,le_store,le_product,year,month,day
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,329,0,0,1,2015,1,1
1,520,0,0,0,2015,1,1
2,146,0,0,2,2015,1,1
3,572,0,1,1,2015,1,1
4,911,0,1,0,2015,1,1


In [12]:
test.head()

Unnamed: 0_level_0,le_country,le_store,le_product,year,month,day
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26298,0,0,1,2019,1,1
26299,0,0,0,2019,1,1
26300,0,0,2,2019,1,1
26301,0,1,1,2019,1,1
26302,0,1,0,2019,1,1


### Baseline: no additional feature engineering

In [13]:
# Separate the target from the features; `train` / `test` themselves stay intact.
y = train['num_sold'].copy()
X = train.drop(columns='num_sold')
test_X = test.copy()

In [14]:
X.head()

Unnamed: 0_level_0,le_country,le_store,le_product,year,month,day
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1,2015,1,1
1,0,0,0,2015,1,1
2,0,0,2,2015,1,1
3,0,1,1,2015,1,1
4,0,1,0,2015,1,1


In [15]:
from xgboost import XGBRegressor
# Baseline: an XGBoost regressor with eta=0.1 and otherwise default settings,
# fitted on the whole training set and used to predict the test set.
xgb = XGBRegressor(eta=0.1).fit(X, y)
pred = xgb.predict(test_X)

In [16]:
# Submission frame: test row ids paired with their predicted sales.
out = full_test[['row_id']].assign(num_sold=pred)

In [17]:
out.head()

Unnamed: 0,row_id,num_sold
0,26298,394.650482
1,26299,632.351257
2,26300,195.868576
3,26301,696.892639
4,26302,1091.807007


In [18]:
# Write the single-model baseline predictions to a CSV without the index column.
out.to_csv('./baseline0.csv', index=False)
# score = 11.32674  (NOTE(review): presumably the competition score obtained with this file — confirm)

In [19]:
from sklearn.model_selection import KFold
# Shared 5-fold splitter with a fixed seed so folds are reproducible across runs.
# NOTE(review): the folds are shuffled, so every fold mixes dates from all
# training years while the test set starts in a later year; validation scores
# may therefore be optimistic — consider a time-based split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def score_xgb_kfold(X, y, test_X, eta=0.1):
    """Train one early-stopped XGBoost regressor per CV fold and predict the test set.

    For every split produced by the notebook-level ``kf`` a regressor is
    fitted on the training part, monitored on the validation part, and then
    used to predict ``test_X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (indexed positionally via ``iloc``).
    y : pandas.Series
        Training target, aligned row-for-row with ``X``.
    test_X : pandas.DataFrame
        Features to predict with each fold's model.
    eta : float, default 0.1
        Learning rate forwarded to ``XGBRegressor``.

    Returns
    -------
    list
        ``[result, scores]`` where ``result`` is a list with one prediction
        array per fold and ``scores`` holds each fold's ``best_score`` on its
        validation part (RMSE in the logged runs).
    """
    result = []
    scores = []
    for train_idx, valid_idx in kf.split(X, y):
        train_X = X.iloc[train_idx]
        valid_X = X.iloc[valid_idx]
        train_y = y.iloc[train_idx]
        valid_y = y.iloc[valid_idx]
        # early_stopping_rounds belongs to the estimator: passing it to fit()
        # has been deprecated since xgboost 1.6 and is rejected by newer releases.
        xgb = XGBRegressor(
            n_estimators=10000,  # upper bound only; early stopping picks the real count
            random_state=42,
            eta=eta,
            early_stopping_rounds=100,
        )
        xgb.fit(
            train_X, train_y,
            eval_set=[(valid_X, valid_y)],
            verbose=50
        )
        result.append(xgb.predict(test_X))
        scores.append(xgb.best_score)

    return [result, scores]

# Run the 5-fold procedure on the current feature set with the default eta (0.1);
# `results` holds one test-prediction array per fold, `scores` the per-fold best validation score.
results, scores = score_xgb_kfold(X, y, test_X)



[0]	validation_0-rmse:426.37817
[50]	validation_0-rmse:62.06910
[100]	validation_0-rmse:56.82235
[150]	validation_0-rmse:53.08333
[200]	validation_0-rmse:50.22179
[250]	validation_0-rmse:48.43843
[300]	validation_0-rmse:47.23341
[350]	validation_0-rmse:45.74092
[400]	validation_0-rmse:44.91613
[450]	validation_0-rmse:44.10075
[500]	validation_0-rmse:43.02688
[550]	validation_0-rmse:42.13851
[600]	validation_0-rmse:41.38398
[650]	validation_0-rmse:40.79472
[700]	validation_0-rmse:40.33329
[750]	validation_0-rmse:39.95870
[800]	validation_0-rmse:39.47558
[850]	validation_0-rmse:39.20258
[900]	validation_0-rmse:38.98410
[950]	validation_0-rmse:38.64457
[1000]	validation_0-rmse:38.43111
[1050]	validation_0-rmse:38.22881
[1100]	validation_0-rmse:37.99885
[1150]	validation_0-rmse:37.80731
[1200]	validation_0-rmse:37.68413
[1250]	validation_0-rmse:37.50163
[1300]	validation_0-rmse:37.33464
[1350]	validation_0-rmse:37.25781
[1400]	validation_0-rmse:37.14535
[1450]	validation_0-rmse:37.00731
[1

In [20]:
scores

[34.861332, 36.634113, 34.986076, 35.864887, 35.853592]

In [21]:
# Arrange the per-fold prediction arrays as columns (one row per test sample).
# Guarded: the cell rebinds `results`, so without the check a second run would
# transpose the frame back and the row-wise mean taken afterwards would be
# silently wrong.
if not isinstance(results, pd.DataFrame):
    results = pd.DataFrame(results).T
results.head()

Unnamed: 0,0,1,2,3,4
0,412.907867,400.901703,408.319305,409.925232,401.973572
1,616.982544,624.344849,618.577087,616.717834,635.024414
2,175.942612,174.825714,173.622498,176.511047,178.411392
3,714.360229,712.612244,602.812561,700.241089,718.233643
4,1045.725708,1105.031616,1040.977051,1050.284302,1040.750244


In [22]:
# Average across the fold columns to get a single prediction per test row.
pred = results.mean(axis=1)
pred.head()

0     406.805536
1     622.329346
2     175.862653
3     689.651953
4    1056.553784
dtype: float64

In [23]:
# Pair each test row id with its fold-averaged prediction (aligned on the index).
out = pd.concat([full_test['row_id'], pred.rename('num_sold')], axis=1)
out.head()

Unnamed: 0,row_id,num_sold
0,26298,406.805536
1,26299,622.329346
2,26300,175.862653
3,26301,689.651953
4,26302,1056.553784


In [24]:
# Write the fold-averaged baseline predictions to a CSV without the index column.
out.to_csv(
    'baseline_kfold.csv',
    index=False
)
# score: 10.45348  (NOTE(review): presumably the competition score obtained with this file — confirm)


## Adding weekday information

In [25]:
import datetime as dt

In [26]:
X.head()

Unnamed: 0_level_0,le_country,le_store,le_product,year,month,day
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1,2015,1,1
1,0,0,0,2015,1,1
2,0,0,2,2015,1,1
3,0,1,1,2015,1,1
4,0,1,0,2015,1,1


In [27]:
# Day of the week (0 = Monday … 6 = Sunday), rebuilt from the split date parts.
X['weekday'] = pd.to_datetime(X[['year', 'month', 'day']]).dt.dayofweek

In [28]:
# Same weekday feature (0 = Monday … 6 = Sunday) for the test features.
test_X['weekday'] = pd.to_datetime(test_X[['year', 'month', 'day']]).dt.dayofweek

In [29]:
# Repeat the k-fold run now that X / test_X carry the extra `weekday` column.
results2, scores2 = score_xgb_kfold(X, y, test_X)

[0]	validation_0-rmse:426.49982
[50]	validation_0-rmse:43.32092
[100]	validation_0-rmse:35.20581
[150]	validation_0-rmse:33.02781
[200]	validation_0-rmse:31.29678
[250]	validation_0-rmse:30.57571
[300]	validation_0-rmse:30.16265
[350]	validation_0-rmse:29.74012
[400]	validation_0-rmse:29.44718
[450]	validation_0-rmse:29.27487
[500]	validation_0-rmse:29.15210
[550]	validation_0-rmse:29.11538
[600]	validation_0-rmse:29.03598
[650]	validation_0-rmse:28.93058
[700]	validation_0-rmse:28.89637
[750]	validation_0-rmse:28.83608
[800]	validation_0-rmse:28.82518
[850]	validation_0-rmse:28.81712
[862]	validation_0-rmse:28.81710
[0]	validation_0-rmse:428.95706
[50]	validation_0-rmse:48.97757
[100]	validation_0-rmse:39.00986
[150]	validation_0-rmse:36.13636
[200]	validation_0-rmse:33.87432
[250]	validation_0-rmse:32.64958
[300]	validation_0-rmse:32.00164
[350]	validation_0-rmse:31.65492
[400]	validation_0-rmse:31.44569
[450]	validation_0-rmse:31.25582
[500]	validation_0-rmse:31.05090
[550]	validati

In [30]:
scores2

[28.812466, 30.540237, 28.11904, 28.521111, 29.662239]

In [31]:
# Arrange the per-fold prediction arrays as columns (one row per test sample).
# Guarded: the cell rebinds `results2`, so without the check a second run would
# transpose the frame back and the row-wise mean taken afterwards would be
# silently wrong.
if not isinstance(results2, pd.DataFrame):
    results2 = pd.DataFrame(results2).T
results2.head()

Unnamed: 0,0,1,2,3,4
0,397.276062,409.089447,413.905182,401.212219,401.337036
1,584.553894,618.694702,639.243225,615.58783,640.405762
2,172.639755,178.387222,194.043961,194.857544,175.887024
3,679.040405,698.67627,613.682068,669.207153,681.485229
4,1019.246155,1060.575073,1042.529053,1031.053589,984.34314


In [32]:
# Average across the fold columns to get a single prediction per test row.
preds = results2.mean(axis=1)
preds.head()

0     404.563989
1     619.697083
2     183.163101
3     668.418225
4    1027.549402
dtype: float64

In [33]:
# Submission frame for the weekday-feature run: test row ids next to the
# fold-averaged predictions.
out = pd.DataFrame({
    'row_id': full_test['row_id'],
    'num_sold': preds
})
out.head()

Unnamed: 0,row_id,num_sold
0,26298,404.563989
1,26299,619.697083
2,26300,183.163101
3,26301,668.418225
4,26302,1027.549402


In [34]:
# Write the weekday-feature predictions to a CSV without the index column.
out.to_csv('with_weekdays20220128.csv', index=False)

In [50]:
X.head()

Unnamed: 0_level_0,le_country,le_store,le_product,year,month,day,weekday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,1,2015,1,1,3
1,0,0,0,2015,1,1,3
2,0,0,2,2015,1,1,3
3,0,1,1,2015,1,1,3
4,0,1,0,2015,1,1,3


In [51]:
test_X.head()

Unnamed: 0_level_0,le_country,le_store,le_product,year,month,day,weekday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
26298,0,0,1,2019,1,1,1
26299,0,0,0,2019,1,1,1
26300,0,0,2,2019,1,1,1
26301,0,1,1,2019,1,1,1
26302,0,1,0,2019,1,1,1


### Convert the date to a serial day of the year (counted from 1 January)

In [53]:
# Ordinal day within the year (1 for 1 January), rebuilt from the split date parts.
X['sday'] = pd.to_datetime(
    X[['year', 'month', 'day']]
).dt.day_of_year

In [54]:
# Same day-of-year feature (1 for 1 January) for the test features.
test_X['sday'] = pd.to_datetime(
    test_X[['year', 'month', 'day']]
).dt.day_of_year

In [64]:
# K-fold run on the features including `weekday` and `sday`, with a smaller
# learning rate (0.03 instead of the default 0.1).
results3, scores3 = score_xgb_kfold(X, y, test_X, eta=0.03)

[0]	validation_0-rmse:458.00244
[50]	validation_0-rmse:118.25147
[100]	validation_0-rmse:55.24551
[150]	validation_0-rmse:43.28917
[200]	validation_0-rmse:38.90112
[250]	validation_0-rmse:36.13833
[300]	validation_0-rmse:34.34121
[350]	validation_0-rmse:33.42573
[400]	validation_0-rmse:32.84281
[450]	validation_0-rmse:32.25363
[500]	validation_0-rmse:31.56873
[550]	validation_0-rmse:31.01458
[600]	validation_0-rmse:30.58668
[650]	validation_0-rmse:30.36816
[700]	validation_0-rmse:30.14821
[750]	validation_0-rmse:29.98804
[800]	validation_0-rmse:29.89901
[850]	validation_0-rmse:29.64291
[900]	validation_0-rmse:29.46349
[950]	validation_0-rmse:29.41439
[1000]	validation_0-rmse:29.30040
[1050]	validation_0-rmse:29.25897
[1100]	validation_0-rmse:29.23611
[1150]	validation_0-rmse:29.21549
[1200]	validation_0-rmse:29.15717
[1250]	validation_0-rmse:29.12448
[1300]	validation_0-rmse:29.12872
[1350]	validation_0-rmse:29.13255
[1361]	validation_0-rmse:29.12110
[0]	validation_0-rmse:459.98288
[50

In [65]:
scores2

[28.812466, 30.540237, 28.11904, 28.521111, 29.662239]

In [66]:
scores3

[29.1187, 30.170406, 27.618389, 27.745291, 29.268229]

In [68]:
results3

[array([ 420.65363,  617.4335 ,  174.45381, ..., 1021.9554 , 1433.4446 ,
         396.70563], dtype=float32),
 array([ 416.36984,  625.9053 ,  179.8385 , ...,  929.1623 , 1438.0675 ,
         389.00778], dtype=float32),
 array([ 406.59613,  625.28625,  181.7905 , ...,  996.2628 , 1446.896  ,
         395.57428], dtype=float32),
 array([ 404.17358,  618.67926,  198.03099, ..., 1011.9286 , 1444.871  ,
         383.09375], dtype=float32),
 array([ 407.74136,  623.9006 ,  178.29012, ..., 1023.7776 , 1520.6176 ,
         383.35718], dtype=float32)]

In [69]:
# Stack the per-fold prediction arrays as columns and average them row-wise,
# giving one prediction per test row. `results3` itself is left as a list.
preds = pd.DataFrame(results3).T.mean(axis=1)

In [71]:
# Submission frame for the weekday + sday run: test row ids next to the
# fold-averaged predictions.
out = pd.DataFrame({
    'row_id': full_test['row_id'],
    'num_sold': preds
})
out.head()

Unnamed: 0,row_id,num_sold
0,26298,411.106909
1,26299,622.240967
2,26300,182.480783
3,26301,678.351819
4,26302,1044.125024


In [72]:
# Write the weekday + sday predictions to a CSV without the index column.
out.to_csv('./20220131.csv', index=False)