In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [11]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from lightgbm import early_stopping, log_evaluation

# Load the datasets
store = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv")
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')
transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')

# Preprocess the data
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
oil['date'] = pd.to_datetime(oil['date'])
holidays['date'] = pd.to_datetime(holidays['date'])
transactions['date'] = pd.to_datetime(transactions['date'])

# Merge datasets on date and store information
train = train.merge(store, on='store_nbr', how='left', suffixes=('', '_store'))
train = train.merge(oil, on='date', how='left', suffixes=('', '_oil'))
train = train.merge(holidays, on='date', how='left', suffixes=('', '_holiday'))
train = train.merge(transactions, on=['date', 'store_nbr'], how='left', suffixes=('', '_transaction'))

test = test.merge(store, on='store_nbr', how='left', suffixes=('', '_store'))
test = test.merge(oil, on='date', how='left', suffixes=('', '_oil'))
test = test.merge(holidays, on='date', how='left', suffixes=('', '_holiday'))
test = test.merge(transactions, on=['date', 'store_nbr'], how='left', suffixes=('', '_transaction'))

# Combine train and test for consistent feature engineering
test['sales'] = None  # Add placeholder for sales in test
combined = pd.concat([train, test])

# Fill missing oil values by interpolation
combined['dcoilwtico'].interpolate(method='linear', inplace=True)

# Date-related features
combined['day_of_week'] = combined['date'].dt.dayofweek
combined['month'] = combined['date'].dt.month
combined['day'] = combined['date'].dt.day
combined['year'] = combined['date'].dt.year
combined['is_weekend'] = combined['day_of_week'].isin([5, 6]).astype(int)

# Rolling and lag features on sales
combined['sales_lag_7'] = combined.groupby('store_nbr')['sales'].shift(7)
combined['sales_lag_30'] = combined.groupby('store_nbr')['sales'].shift(30)
combined['sales_rolling_mean_7'] = combined.groupby('store_nbr')['sales'].shift(1).rolling(7).mean()
combined['sales_rolling_std_7'] = combined.groupby('store_nbr')['sales'].shift(1).rolling(7).std()
combined['sales_rolling_mean_30'] = combined.groupby('store_nbr')['sales'].shift(1).rolling(30).mean()

# Split combined data back into train and test
train = combined[combined['sales'].notnull()]
test = combined[combined['sales'].isnull()]

# Encode categorical columns
label_encoder = LabelEncoder()
train['family'] = label_encoder.fit_transform(train['family'])
test['family'] = label_encoder.transform(test['family'])

# f['sales_lag_{lag}'] = train.groupby('store_nbr')['sales'].shift(lag)

# Rolling features
train['sales_rolling_mean_7'] = train.groupby('store_nbr')['sales'].transform(lambda x: x.rolling(7).mean())
train['sales_rolling_std_7'] = train.groupby('store_nbr')['sales'].transform(lambda x: x.rolling(7).std())

# Drop NA values created by lag features
train.dropna(inplace=True)


# Define features and target
features = [
    'store_nbr', 'family', 'dcoilwtico', 'transactions',
    'day_of_week', 'month', 'day', 'year', 'is_weekend',
    'sales_lag_7', 'sales_lag_30', 'sales_rolling_mean_7', 'sales_rolling_std_7', 'sales_rolling_mean_30'
]
X = train[features]
y = train['sales']

# Split the data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

# LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'subsample': 0.8,
    'subsample_freq': 1,
    'seed': 42
}


  combined = pd.concat([train, test])
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['dcoilwtico'].interpolate(method='linear', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['family'] = label_encoder.fit_transform(train['family'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.

In [12]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_valid],
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(100)
    ]
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1931
[LightGBM] [Info] Number of data points in the train set: 367224, number of used features: 14
[LightGBM] [Info] Start training from score 406.060877
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 283.313	valid_1's rmse: 403.345
[200]	training's rmse: 239.165	valid_1's rmse: 384.252
[300]	training's rmse: 213.138	valid_1's rmse: 383.105
Early stopping, best iteration is:
[228]	training's rmse: 230.386	valid_1's rmse: 382.625


In [13]:
# Validate the model
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
rmse = mean_squared_error(y_valid, y_pred, squared=False)
print(f'Validation RMSE: {rmse}')

# Make predictions on the test data
test['sales'] = model.predict(test[features], num_iteration=model.best_iteration)

# Prepare the submission file
submission = test[['id', 'sales']]
submission.to_csv('submission.csv', index=False)

Validation RMSE: 382.62521790480656


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['sales'] = model.predict(test[features], num_iteration=model.best_iteration)


In [14]:
print(test[['id', 'sales']].head())

        id       sales
0  3000888 -146.139084
1  3000889    2.940402
2  3000890    0.738095
3  3000891  249.501257
4  3000892  -30.982587


In [16]:
# Assuming y_pred contains your predictions
y_pred = model.predict(test[features])
# Clip predictions to be non-negative
y_pred = np.maximum(0, y_pred)

# Prepare the submission file
submission = test[['id']].copy()
submission['sales'] = y_pred
submission.to_csv('/kaggle/working/submission.csv', index=False)


In [17]:
print(y_pred)

[  0.           2.9404018    0.73809474 ... 171.65995453   1.56831474
   1.56831474]
