In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, StratifiedKFold


##########################################################\
**EDA**\
##########################################################

In [85]:
traindf = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train.csv')

testdf = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test.csv')

In [86]:
print(traindf.head(10))

  warehouse        date  orders holiday_name  holiday  shutdown  \
0  Prague_1  2020-12-05  6895.0          NaN        0         0   
1  Prague_1  2020-12-06  6584.0          NaN        0         0   
2  Prague_1  2020-12-07  7030.0          NaN        0         0   
3  Prague_1  2020-12-08  6550.0          NaN        0         0   
4  Prague_1  2020-12-09  6910.0          NaN        0         0   
5  Prague_1  2020-12-10  7228.0          NaN        0         0   
6  Prague_1  2020-12-11  7790.0          NaN        0         0   
7  Prague_1  2020-12-12  7165.0          NaN        0         0   
8  Prague_1  2020-12-13  6844.0          NaN        0         0   
9  Prague_1  2020-12-14  7010.0          NaN        0         0   

   mini_shutdown  shops_closed  winter_school_holidays  school_holidays  \
0              0             0                       0                0   
1              0             0                       0                0   
2              0             0       

In [87]:
print(testdf.head(10))

  warehouse        date holiday_name  holiday  shops_closed  \
0  Prague_1  2024-03-16          NaN        0             0   
1  Prague_1  2024-03-17          NaN        0             0   
2  Prague_1  2024-03-18          NaN        0             0   
3  Prague_1  2024-03-19          NaN        0             0   
4  Prague_1  2024-03-20          NaN        0             0   
5  Prague_1  2024-03-21          NaN        0             0   
6  Prague_1  2024-03-22          NaN        0             0   
7  Prague_1  2024-03-23          NaN        0             0   
8  Prague_1  2024-03-24          NaN        0             0   
9  Prague_1  2024-03-25          NaN        0             0   

   winter_school_holidays  school_holidays                   id  
0                       0                0  Prague_1_2024-03-16  
1                       0                0  Prague_1_2024-03-17  
2                       0                0  Prague_1_2024-03-18  
3                       0                0

**Adding row for week of the year. Generating those data from day value of the dataset.**

In [88]:
import datetime

def date_to_week(date_str):
    date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    week_number = date.isocalendar()[1]
    return f"{week_number:02d}"


traindf['week_number'] = traindf['date'].apply(date_to_week)
testdf['week_number'] = testdf['date'].apply(date_to_week)

**Updating day with date. Week day may be more important than day.**

In [89]:
traindf['date'] = pd.to_datetime(traindf['date'])
traindf['date'] = traindf['date'].dt.day_name()

testdf['date'] = pd.to_datetime(testdf['date'])
testdf['date'] = testdf['date'].dt.day_name()

In [90]:
warehouse = {'Prague_1': 1,
'Brno_1':2,
'Prague_2':3,
'Prague_3':4,
'Budapest_1':5,
'Munich_1':6,
'Frankfurt_1':7}

traindf['warehouse'] = traindf['warehouse'].map(warehouse)
testdf['warehouse'] = testdf['warehouse'].map(warehouse)

date = {'Wednesday': 1,
'Thursday':2,
'Saturday':3,
'Tuesday':4,
'Friday':5,
'Monday':6,
'Sunday':7}

traindf['date'] = traindf['date'].map(date)
testdf['date'] = testdf['date'].map(date)

In [91]:
holiday_name = {
"International womens day": 1,
"Christmas Eve": 2,
"2nd Christmas Day": 3,
"Good Friday": 4,
"New Years Day": 5,
"Den osvobozeni": 6,
"Easter Monday": 7,
"Den ceske statnosti": 8,
"Labour Day": 9,
"Cyrila a Metodej": 10,
"Jan Hus": 11,
"Den vzniku samostatneho ceskoslovenskeho statu": 12,
"Den boje za svobodu a demokracii": 13,
"Memorial Day of the Republic": 14,
"Independent Hungary Day": 15,
"Day of National Unity": 16,
"Reformation Day": 17,
"National Defense Day": 18,
"Memorial Day for the Victims of the Holocaust": 19,
"Memorial Day for the Martyrs of Arad": 20,
"Memorial Day for the Victims of the Communist Dictatorships": 21,
"All Saints' Day Holiday": 22,
"1848 Revolution Memorial Day (Extra holiday)": 23,
"Peace Festival in Augsburg": 24}

traindf['holiday_name'] = traindf['holiday_name'].map(holiday_name)
traindf.fillna(0, inplace=True)

testdf['holiday_name'] = testdf['holiday_name'].map(holiday_name)
testdf.fillna(0, inplace=True)

In [92]:
traindf = traindf.drop(['id', 'shutdown', 'mini_shutdown', 'blackout', 'mov_change', 
                        'frankfurt_shutdown', 'precipitation', 'snow', 'user_activity_1',
                       'user_activity_2'], axis=1)

In [95]:
train_X = traindf.drop('orders', axis=1)
train_y = traindf['orders']

In [102]:
test_id = testdf['id']
testdf = testdf.drop('id', axis=1)

      warehouse  date  holiday_name  holiday  shops_closed  \
0             1     3           0.0        0             0   
1             1     7           0.0        0             0   
2             1     6           0.0        0             0   
3             1     4           0.0        0             0   
4             1     1           0.0        0             0   
...         ...   ...           ...      ...           ...   
7335          5     7           0.0        0             0   
7336          5     6           0.0        0             0   
7337          5     4           0.0        0             0   
7338          5     1           0.0        0             0   
7339          5     2           0.0        0             0   

      winter_school_holidays  school_holidays week_number  
0                          0                0          49  
1                          0                0          49  
2                          0                0          50  
3              

In [93]:
print(traindf.head(10))
print('############################################################')
print(testdf.head(10))

   warehouse  date  orders  holiday_name  holiday  shops_closed  \
0          1     3  6895.0           0.0        0             0   
1          1     7  6584.0           0.0        0             0   
2          1     6  7030.0           0.0        0             0   
3          1     4  6550.0           0.0        0             0   
4          1     1  6910.0           0.0        0             0   
5          1     2  7228.0           0.0        0             0   
6          1     5  7790.0           0.0        0             0   
7          1     3  7165.0           0.0        0             0   
8          1     7  6844.0           0.0        0             0   
9          1     6  7010.0           0.0        0             0   

   winter_school_holidays  school_holidays week_number  
0                       0                0          49  
1                       0                0          49  
2                       0                0          50  
3                       0         

**#####################################
Defining Stacking function.
#####################################**

In [122]:
def Stacking(model,train_X, test, y, n_fold):
    
    folds = StratifiedKFold(n_splits=n_fold)
    test_pred = np.zeros((test.shape[0], 1))
    train_pred = np.zeros((train_X.shape[0], 1))
    
    for train_indices, val_indices in folds.split(train_X, y.values):
        x_train, x_val = train_X.iloc[train_indices], train_X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

        model.fit(x_train, y_train)

        train_pred[val_indices, 0] = model.predict(x_val)
        
        test_pred[:, 0] += model.predict(test) / n_fold

    return train_pred, test_pred

################################################################\
**Getting level 0 prediction and creating dataset for level 1**\
################################################################

**Decision Tree**

In [123]:
model1 = DecisionTreeRegressor(random_state=1)

train_pred1, test_pred1 =Stacking(model=model1, n_fold=5, train_X=train_X, test=testdf, y=train_y)

train_pred1=pd.DataFrame(train_pred1)
test_pred1=pd.DataFrame(test_pred1)



**KNN**

In [124]:
model2 = KNeighborsRegressor()

train_pred2, test_pred2 =Stacking(model=model1, n_fold=5, train_X=train_X, test=testdf, y=train_y)

train_pred2=pd.DataFrame(train_pred2)
test_pred2=pd.DataFrame(test_pred2)



**SVR**

In [125]:
model3 = SVR()

train_pred3, test_pred3 =Stacking(model=model1, n_fold=5, train_X=train_X, test=testdf, y=train_y)

train_pred3=pd.DataFrame(train_pred3)
test_pred3=pd.DataFrame(test_pred3)



################################################\
**Level 1 of model training and prediction**\
################################################

In [126]:
df = pd.concat([train_pred1, train_pred2, train_pred3], axis=1)
test = pd.concat([test_pred1, test_pred2, test_pred3], axis=1)

df = df.astype(np.float32)

print(df.head(20))
print(test.head(40))

model = RandomForestRegressor()
model.fit(df, train_y)
prediction = model.predict(test)

               0             0             0
0    9804.000000   9804.000000   9804.000000
1    9604.000000   9604.000000   9604.000000
2    9672.000000   9672.000000   9672.000000
3    9476.000000   9476.000000   9476.000000
4    9670.000000   9670.000000   9670.000000
5   10024.000000  10024.000000  10024.000000
6   10727.666992  10727.666992  10727.666992
7   10015.333008  10015.333008  10015.333008
8    9018.000000   9018.000000   9018.000000
9   10784.000000  10784.000000  10784.000000
10  10795.000000  10795.000000  10795.000000
11  10968.000000  10968.000000  10968.000000
12  11870.666992  11870.666992  11870.666992
13  15155.000000  15155.000000  15155.000000
14  18139.000000  18139.000000  18139.000000
15   9396.500000   9396.500000   9396.500000
16   8692.000000   8692.000000   8692.000000
17   8216.500000   8216.500000   8216.500000
18   8748.000000   8748.000000   8748.000000
19   9829.333008   9829.333008   9829.333008
              0            0            0
0   8513.0000

#####################################################\
**Creating submission file**\
#####################################################

In [127]:
submission = pd.DataFrame({
    'id': test_id,
    'orders': prediction
})

# Save
submission.to_csv('submission.csv', index=False)