In [1]:
!pip install -U -q autogluon > /dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.13.0 requires aiohttp<4.0.0,>=3.9.2, but you have aiohttp 3.9.1 which is incompatible.
aiobotocore 2.13.0 requires botocore<1.34.107,>=1.34.70, but you have botocore 1.29.165 which is incompatible.
albumentations 1.4.0 requires scikit-image>=0.21.0, but you have scikit-image 0.20.0 which is incompatible.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.8.1 which is incompatible.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.
ydata-profiling 4.6.4 requires numpy<1.26,>=1.16.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import mutual_info_regression
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [3]:
TIME_LIMIT = 3600 * 10
SEED = 27

In [4]:
train = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train.csv')
test = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test.csv')

test_ids = test['id'].values

train.shape, test.shape

((7340, 18), (397, 8))

In [5]:
missing_cols_in_test = [
    'shutdown',
    'mini_shutdown',
    'blackout',
    'mov_change',
    'frankfurt_shutdown',
    'precipitation',
    'snow',
    'user_activity_1',
    'user_activity_2'
]

train = train.drop(missing_cols_in_test, axis=1, errors='ignore')
train.shape, test.shape

((7340, 9), (397, 8))

In [6]:
train['is_train'] = 1
test['is_train'] = 0

combined = pd.concat([train, test], axis=0).reset_index(drop=True)

# Feature Engineering

In [7]:
def add_tfidf_features(df, ngram_range=(1, 4), max_features=50):
    df['holiday_name'] = df['holiday_name'].fillna('')

    vectorizer = TfidfVectorizer(
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None,
        strip_accents='unicode',
        analyzer='word',
        ngram_range=ngram_range,
        sublinear_tf=True,
        max_features=max_features,
    )
    
    vectorized = vectorizer.fit_transform(df['holiday_name'])
    vectorized = vectorized.toarray()
    tfidf_df = pd.DataFrame(vectorized, columns=[f'tfidf_feat_{i}' for i in range(vectorized.shape[1])])
    return pd.concat([df, tfidf_df], axis=1)

In [8]:
def add_count_features(df, ngram_range=(1, 4), max_features=50):
    df['holiday_name'] = df['holiday_name'].fillna('')

    vectorizer = CountVectorizer(
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None,
        strip_accents='unicode',
        analyzer='word',
        ngram_range=ngram_range,
        max_features=max_features,
    )
    
    vectorized = vectorizer.fit_transform(df['holiday_name'])
    vectorized = vectorized.toarray()
    cnt_df = pd.DataFrame(vectorized, columns=[f'cnt_feat_{i}' for i in range(vectorized.shape[1])])
    return pd.concat([df, cnt_df], axis=1)

In [9]:
def add_geo_features(df):
    country_city_mapping = {
        'Prague': 'Czech Republic',
        'Brno': 'Czech Republic',
        'Budapest': 'Hungary',
        'Munich': 'Germany',
        'Frankfurt': 'Germany'
    }

    df['city'] = df['warehouse'].str.split('_').str[0]
    df['country'] = df['city'].map(country_city_mapping)

    return pd.get_dummies(df, columns=['city', 'country'])

In [10]:
def add_time_features(df):    
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['sin_month'] = np.sin(2 * np.pi * df['month'] / 12)
    df['cos_month'] = np.cos(2 * np.pi * df['month'] / 12)
    df['day'] = df['date'].dt.day
    df['sin_day'] = np.sin(2 * np.pi * df['day'] / 30)
    df['cos_day'] = np.cos(2 * np.pi * df['day'] / 30)
    df['day_of_week'] = df['date'].dt.dayofweek
    df['sin_day_of_week'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['cos_day_of_week'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_year'] = df['date'].dt.dayofyear
    df['sin_day_of_year'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['cos_day_of_year'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    df['week'] = df['date'].dt.isocalendar().week.astype(int)
    df['quarter'] = df['date'].dt.quarter
    df['season'] = (df['month'] % 12 + 3) // 3
    df['season'] = df['season'].astype(int)
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['is_friday'] = df['day_of_week'].isin([4]).astype(int)
    
    df['day_before_holiday'] = df['holiday'].shift().fillna(0)
    df['day_after_holiday'] = df['holiday'].shift(-1).fillna(0)
    df['day_before_school_holiday'] = df['school_holidays'].shift().fillna(0)
    df['day_after_school_holiday'] = df['school_holidays'].shift(-1).fillna(0)
    df['day_before_winter_school_holiday'] = df['winter_school_holidays'].shift().fillna(0)
    df['day_after_winter_school_holiday'] = df['winter_school_holidays'].shift(-1).fillna(0)
    df['holiday_and_shops_closed'] = df['holiday'] * df['shops_closed']

    return df

In [11]:
def convert_cat_features(df):
    return pd.get_dummies(df, columns=['warehouse', 'holiday_name'])

In [12]:
def drop_useless_features(train_df, test_df):
    X_mi = train.drop(['orders', 'id'], axis=1)
    y_mi = train['orders']
    
    mutual_info = mutual_info_regression(X_mi, y_mi, random_state=SEED)
    mutual_info = pd.Series(mutual_info)
    mutual_info.index = X_mi.columns
    mutual_info = pd.DataFrame(mutual_info.sort_values(ascending=False), columns=['Feature_MI'])
    
    useless_cols = []
    for col in mutual_info.index:
        if mutual_info.loc[col, 'Feature_MI'] <= 0:
            useless_cols.append(col)
            
    train_df = train_df.drop(useless_cols, axis=1, errors='ignore')
    test_df = test_df.drop(useless_cols, axis=1, errors='ignore')
    
    return train_df, test_df, useless_cols

In [13]:
def separate_train_test(df):
    train = df[df['is_train'] == 1].drop('is_train', axis=1)
    test = df[df['is_train'] == 0].drop(['is_train', 'orders'], axis=1)

    train = train.set_index('date').sort_index()
    test = test.set_index('date').sort_index()
    
    return train, test

In [14]:
combined = add_tfidf_features(combined)
combined = add_count_features(combined)
combined = add_geo_features(combined)
combined = add_time_features(combined)
combined = convert_cat_features(combined)
train, test = separate_train_test(combined)
train, test, useless_cols = drop_useless_features(train, test)

In [15]:
tfidf_cols = [col for col in train.columns if 'tfidf_feat' in col]
cnt_cols = [col for col in train.columns if 'cnt_feat' in col]

used_tfidf_features = len(tfidf_cols) - len(set(useless_cols) & set(tfidf_cols))
used_cnt_features = len(cnt_cols) - len(set(useless_cols) & set(cnt_cols))

print(f"Used TFIDF features: {used_tfidf_features}/50")
print(f"Used Count features: {used_cnt_features}/50")
print("\nTotal features:", train.shape[1])

Used TFIDF features: 35/50
Used Count features: 28/50

Total features: 112


# Processing Data for AutoGluon

In [16]:
train['item_id'] = train['id'].apply(lambda x: x.split('_')[0] + '_' + x.split('_')[1])
test['item_id'] = test['id'].apply(lambda x: x.split('_')[0] + '_' + x.split('_')[1])

train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

train = train.rename(columns={'orders': 'target'})
test['target'] = 0

train = train.reset_index(drop=False)
test = test.reset_index(drop=False)

In [17]:
train_data = TimeSeriesDataFrame.from_data_frame(train, id_column='item_id', timestamp_column='date')
test_data = TimeSeriesDataFrame.from_data_frame(test, id_column='item_id', timestamp_column='date')

train_data['is_train'] = True
test_data['is_train'] = False

combined = pd.concat([train_data, test_data])

combined = combined.convert_frequency(freq='D')
combined = combined.sort_index().fill_missing_values()

train_data = combined[combined.is_train == True]
test_data = combined[combined.is_train == False]

train_data = train_data.drop(columns=['is_train'])
test_data = test_data.drop(columns=['is_train'])

In [18]:
covariate_features = [col for col in train_data.columns if col not in ['target', 'item_id', 'date']]

# Training

In [19]:
quantile_levels = [0.1, 0.2, 0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.8, 0.9]

In [20]:
predictor = TimeSeriesPredictor(
    prediction_length=61,
    quantile_levels=quantile_levels,
    target='target',
    eval_metric='MAPE',
    known_covariates_names=covariate_features,
    freq='D'
)

predictor.fit(
    train_data,
    presets='best_quality',
    time_limit=TIME_LIMIT
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240628_052414"
Beginning AutoGluon training... Time limit = 36000s
AutoGluon will save models to 'AutogluonModels/ag-20240628_052414'
AutoGluon Version:  1.1.1
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Dec 19 13:14:11 UTC 2023
CPU Count:          4
GPU Count:          1
Memory Avail:       29.91 GB / 31.36 GB (95.4%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Setting presets to: best_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MAPE,
 'freq': 'D',
 'hyperparameters': 'default',
 'known_covariates_names': ['holiday',
                            'shops_closed',
                            'winter_school_holidays',
                            'school_holidays',
                            'tfidf_feat_0',
                            'tfidf_feat_1',
                            'tfidf_feat_2',
                            't

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/806M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

	-0.0754       = Validation score (-MAPE)
	24.47   s     = Training runtime
	6.26    s     = Validation (prediction) runtime
Training timeseries model TemporalFusionTransformer. Training for up to 11748.1s of the 35844.2s of remaining time.
	/opt/conda/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so: undefined symbol: _ZNK5torch8autograd4Node4nameB5cxx11Ev
Training timeseries model DeepAR. Training for up to 17621.8s of the 35843.5s of remaining time.
	/opt/conda/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so: undefined symbol: _ZNK5torch8autograd4Node4nameB5cxx11Ev
Training timeseries model PatchTST. Training for up to 35243.2s of the 35843.2s of remaining time.
	/opt/conda/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so: undefined symbol: _ZNK5torch8autograd4Node4nameB5cxx11Ev
Fitting simple weighted ensemble.
	Ensemble weights: {'AutoETS': 0.38, 'Chronos[base]': 0.09, 'DirectTabular': 0.06, 'SeasonalNaive': 0.46}
	-0.0652       = Validation sc

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x78320127ddb0>

In [21]:
predictor.leaderboard()

Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-0.065167,11.036434,2.582356,9
1,SeasonalNaive,-0.06971,0.04389,2.105283,1
2,AutoETS,-0.071409,4.199524,15.114639,6
3,Chronos[base],-0.075448,6.263953,24.472558,8
4,DynamicOptimizedTheta,-0.082622,2.546939,21.987485,5
5,AutoARIMA,-0.096447,6.978267,19.752988,7
6,CrostonSBA,-0.108665,0.042747,9.878721,3
7,DirectTabular,-0.139654,0.529067,16.296611,2
8,NPTS,-0.208032,2.428771,2.406044,4


# Making Predictions

In [22]:
predictions = predictor.predict(
    train_data,
    known_covariates=test_data[covariate_features]
)

Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [23]:
predictions.to_csv("raw_predictions.csv")
predictions

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.55,0.6,0.65,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Brno_1,2024-03-16,9339.347783,8551.028302,8813.924952,9012.610246,9183.703511,9339.347783,9418.959992,9500.262968,9579.286769,9660.814407,9857.391443,10115.544280
Brno_1,2024-03-17,8331.649303,7525.168180,7798.224817,7997.262223,8170.361165,8331.649303,8413.259684,8492.937404,8579.963629,8672.888453,8876.669501,9152.888600
Brno_1,2024-03-18,8207.552839,7369.465163,7648.462691,7858.219982,8034.537198,8207.552839,8289.466785,8373.189238,8458.571529,8551.614756,8756.101116,9050.384122
Brno_1,2024-03-19,8710.317280,7833.438957,8132.675479,8349.425188,8534.879705,8710.317280,8792.629284,8878.375667,8967.984728,9059.086388,9275.308944,9598.264287
Brno_1,2024-03-20,8507.278807,7616.227176,7922.547597,8141.328736,8328.959527,8507.278807,8594.437779,8685.598122,8777.806660,8871.647720,9100.443424,9414.670079
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Prague_3,2024-05-11,5573.370113,4108.194050,4624.815538,4979.811026,5289.781567,5573.370113,5717.261734,5865.088839,6016.478104,6173.704291,6540.895018,7039.223604
Prague_3,2024-05-12,5323.101740,3849.025807,4356.572931,4721.743894,5036.997512,5323.101740,5464.182806,5611.916002,5765.765956,5922.427042,6282.855399,6780.239798
Prague_3,2024-05-13,5100.811338,3641.779818,4147.287100,4505.638450,4815.963567,5100.811338,5243.801778,5391.079229,5542.454019,5701.743103,6059.755750,6559.165432
Prague_3,2024-05-14,5067.952040,3617.171457,4112.176235,4470.487648,4781.985765,5067.952040,5209.974209,5352.563285,5499.940413,5659.318797,6029.825481,6527.201571


In [24]:
predictions = predictions.reset_index(drop=False)
predictions['timestamp'] = predictions['timestamp'].astype('str')
predictions['id'] = predictions['item_id'] + '_' + predictions['timestamp']

In [25]:
q = 0
def get_prediction(row):
    return predictions[predictions['id'] == row['id']][str(q)].values[0]

for _q in quantile_levels:
    q = _q
    sub = pd.DataFrame({'id': test_ids, 'orders': 0})
    sub['orders'] = sub.apply(get_prediction, axis=1)
    sub.to_csv(f'submission_{q}.csv', index=False)
    sub.head()