In [1]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
SOURCE_MAPPING = 'store-sales-time-series-forecasting:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F29781%2F2887556%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241003%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241003T093405Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2e0affe0ef700b87c4d7ff232330959776a92caebab553f8b07d9e5108e9d98407c78512057cf4eba66371069610c36550015367fcaa761b1241c34a4748c88804f6ba81a0af846dbc68a16390fca11a0ade9c2b828625d6e00cd579100db9eec72b7411200612fba31feaa314a5a239ee85f86e5ad9d6423eee2ee30dfb272022cf819afabfb96e66c798b1afa9fdaac354d47875e7d9cbdb0303ee15af7d60aa48421235a07dd5cb6715245941ce1ad0401c9231dd1bcbe11f5be74b6817557e13139fd91b36adbad2497716ca55e18c94c4a2932db0b567e6e93dd8cf45c3c42d7d3e81dee4f605fe1737f2d20f16edf76b2b91b69f3bf96812dcf714794f'

INPUT_PATH = '/kaggle/input'
WORKING_PATH = '/kaggle/working'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree(INPUT_PATH, ignore_errors=True)
os.makedirs(INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(WORKING_PATH, 0o777, exist_ok=True)

try:
    os.symlink(INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
    pass
try:
    os.symlink(WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
    pass

for mapping in SOURCE_MAPPING.split(','):
    dir_name, url_encoded = mapping.split(':')
    download_url = unquote(url_encoded)
    filename = urlparse(download_url).path
    dest_path = os.path.join(INPUT_PATH, dir_name)
    try:
        with urlopen(download_url) as response, NamedTemporaryFile() as temp_file:
            total_length = response.headers['content-length']
            print(f'Downloading {dir_name}, {total_length} bytes compressed')
            downloaded = 0
            data_chunk = response.read(CHUNK_SIZE)
            while len(data_chunk) > 0:
                downloaded += len(data_chunk)
                temp_file.write(data_chunk)
                progress = int(50 * downloaded / int(total_length))
                sys.stdout.write(f"\r[{'=' * progress}{' ' * (50-progress)}] {downloaded} bytes downloaded")
                sys.stdout.flush()
                data_chunk = response.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
                with ZipFile(temp_file) as zip_file:
                    zip_file.extractall(dest_path)
            else:
                with tarfile.open(temp_file.name) as tar_file:
                    tar_file.extractall(dest_path)
            print(f'\nDownloaded and uncompressed: {dir_name}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {dest_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {dest_path}')
        continue

print('Data source import complete.')


Downloading store-sales-time-series-forecasting, 22416355 bytes compressed
Downloaded and uncompressed: store-sales-time-series-forecasting
Data source import complete.


In [2]:
import numpy as np
import pandas as pd

import os
for path, _, files in os.walk('/kaggle/input'):
    for file in files:
        print(os.path.join(path, file))


/kaggle/input/store-sales-time-series-forecasting/transactions.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv


In [3]:
use_data_since_2017 = True

In [4]:
!pip install catboost
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
test_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
stores_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
oil_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')
transactions_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')
sample_submission_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv')

train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

if use_data_since_2017:
    train_df = train_df[train_df['date'] >= '2017-01-01']
    test_df = test_df[test_df['date'] >= '2017-01-01']

train_df['holiday'] = train_df['date'].isin(holidays_df['date'])
test_df['holiday'] = test_df['date'] == pd.to_datetime('2017-08-24')

for df in [train_df, test_df]:
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday

train_df.drop(columns=['date'], inplace=True)
test_df.drop(columns=['date'], inplace=True)

cat_cols = train_df.select_dtypes(include=['object']).columns
train_df = pd.get_dummies(train_df, columns=cat_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=cat_cols, drop_first=True)

train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

print(train_df.head())
print(test_df.head())


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


  train_df['holiday'] = train_df['date'].isin(holidays_df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['holiday'] = train_df['date'].isin(holidays_df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['date'].dt.year


              id  store_nbr  sales  onpromotion  holiday  year  month  day  \
2596374  2596374          1    0.0            0     True  2017      1    1   
2596375  2596375          1    0.0            0     True  2017      1    1   
2596376  2596376          1    0.0            0     True  2017      1    1   
2596377  2596377          1    0.0            0     True  2017      1    1   
2596378  2596378          1    0.0            0     True  2017      1    1   

         weekday  family_BABY CARE  ...  family_MAGAZINES  family_MEATS  \
2596374        6             False  ...             False         False   
2596375        6              True  ...             False         False   
2596376        6             False  ...             False         False   
2596377        6             False  ...             False         False   
2596378        6             False  ...             False         False   

         family_PERSONAL CARE  family_PET SUPPLIES  \
2596374                 Fa

In [5]:
X = train_df.drop(columns=['sales'])
y = train_df['sales']

y_log = np.log1p(y)

X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.2, random_state=42)

X_train.columns = X_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_val.columns = X_val.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
test_df.columns = test_df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

X_train, test_df = X_train.align(test_df, join='left', axis=1, fill_value=0)
X_val, test_df = X_val.align(test_df, join='left', axis=1, fill_value=0)

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_val, y_val)


In [6]:
iterations = 100
learning_rate = 0.1
train_loop_count = 100
retrain_loop_count = 10


In [7]:
catboost_model = CatBoostRegressor(
    iterations=iterations,
    learning_rate=learning_rate,
    depth=8,
    random_seed=42,
    loss_function='RMSE',
    verbose=100
)


In [10]:
catboost_model.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=50,
    verbose=50,
    use_best_model=True,
    init_model=None
)

for _ in range(min(train_loop_count, 10)):
    catboost_model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=50,
        verbose=50,
        use_best_model=True,
        init_model=catboost_model
    )


0:	learn: 2.4564269	test: 2.4615474	best: 2.4615474 (0)	total: 52.5ms	remaining: 4m 22s
50:	learn: 1.0037894	test: 1.0113837	best: 1.0113837 (50)	total: 2.55s	remaining: 4m 7s
100:	learn: 0.8292170	test: 0.8353883	best: 0.8353883 (100)	total: 5.02s	remaining: 4m 3s
150:	learn: 0.7470728	test: 0.7520339	best: 0.7520339 (150)	total: 8.52s	remaining: 4m 33s
200:	learn: 0.6911606	test: 0.6951191	best: 0.6951191 (200)	total: 11.7s	remaining: 4m 40s
250:	learn: 0.6513529	test: 0.6544782	best: 0.6544782 (250)	total: 14.2s	remaining: 4m 29s
300:	learn: 0.6197252	test: 0.6224639	best: 0.6224639 (300)	total: 16.7s	remaining: 4m 21s
350:	learn: 0.5971624	test: 0.5997525	best: 0.5997525 (350)	total: 19.2s	remaining: 4m 14s
400:	learn: 0.5786458	test: 0.5816913	best: 0.5816913 (400)	total: 23.5s	remaining: 4m 29s
450:	learn: 0.5631369	test: 0.5661167	best: 0.5661167 (450)	total: 26s	remaining: 4m 22s
500:	learn: 0.5496231	test: 0.5528963	best: 0.5528963 (500)	total: 28.5s	remaining: 4m 15s
550:	lea

KeyboardInterrupt: 

In [11]:
from catboost import Pool

august_pool = Pool(data=X_train, label=y_train)

for _ in range(retrain_loop_count):
    catboost_model.fit(
        august_pool,
        verbose=50,
        init_model=catboost_model
    )


0:	learn: 0.3816726	total: 50ms	remaining: 4m 9s
50:	learn: 0.3813423	total: 2.52s	remaining: 4m 4s
100:	learn: 0.3810366	total: 6.75s	remaining: 5m 27s
150:	learn: 0.3807043	total: 9.24s	remaining: 4m 56s
200:	learn: 0.3804033	total: 11.7s	remaining: 4m 39s
250:	learn: 0.3800850	total: 14.1s	remaining: 4m 27s
300:	learn: 0.3796941	total: 17.2s	remaining: 4m 28s
350:	learn: 0.3793108	total: 20.8s	remaining: 4m 35s
400:	learn: 0.3789043	total: 23.3s	remaining: 4m 27s
450:	learn: 0.3785009	total: 25.8s	remaining: 4m 20s
500:	learn: 0.3781529	total: 28.2s	remaining: 4m 13s
550:	learn: 0.3777938	total: 32.3s	remaining: 4m 20s
600:	learn: 0.3774264	total: 34.9s	remaining: 4m 15s
650:	learn: 0.3770920	total: 37.4s	remaining: 4m 9s
700:	learn: 0.3767219	total: 39.9s	remaining: 4m 4s
750:	learn: 0.3763723	total: 42.4s	remaining: 3m 59s
800:	learn: 0.3760415	total: 46.6s	remaining: 4m 4s
850:	learn: 0.3757069	total: 49s	remaining: 3m 59s
900:	learn: 0.3753646	total: 51.5s	remaining: 3m 54s
950:

KeyboardInterrupt: 

In [12]:
from sklearn.metrics import mean_squared_error

# Generate predictions on the validation set
y_val_pred = catboost_model.predict(X_val)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f'RMSE: {rmse}')


RMSE: 0.3799323732978071


In [13]:
test_predictions_log = catboost_model.predict(test_df)

test_df['sales'] = np.expm1(test_predictions_log)

test_df['sales'] = np.where(test_df['sales'] < 0, 0, test_df['sales'])


In [14]:
submission = test_df[['id', 'sales']]

submission.to_csv('submission.csv', index=False)

print(submission)


            id        sales
0      3000888     4.589866
1      3000889     0.000000
2      3000890     5.253364
3      3000891  2318.820519
4      3000892     0.140984
...        ...          ...
28507  3029395   341.786658
28508  3029396    94.114781
28509  3029397  1144.522938
28510  3029398    76.616239
28511  3029399    16.108102

[28512 rows x 2 columns]


In [15]:
from google.colab import files
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>