<a href="https://www.kaggle.com/code/priteshhhh/store-sales?scriptVersionId=199092382" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **1. Import Libraries**

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning models and evaluation
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')


# **2. Load and Explore the Data**

In [2]:
# Load datasets
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv', parse_dates=['date'])
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'])
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv', parse_dates=['date'])
holidays_events = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv', parse_dates=['date'])
transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv', parse_dates=['date'])


Preview Data

In [3]:
# Preview train data
print("Train Data Preview:")
display(train.head())

# Preview test data
print("Test Data Preview:")
display(test.head())

# Preview stores data
print("Stores Data Preview:")
display(stores.head())


Train Data Preview:


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


Test Data Preview:


Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


Stores Data Preview:


Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


# **3. Data Preprocessing**

In [4]:
# Fill missing oil prices using forward fill
oil['dcoilwtico'].fillna(method='ffill', inplace=True)

# Fill any remaining missing oil prices with backward fill
oil['dcoilwtico'].fillna(method='bfill', inplace=True)


In [5]:
# Merge train and test with stores
train = pd.merge(train, stores, on='store_nbr', how='left')
test = pd.merge(test, stores, on='store_nbr', how='left')

# Merge with oil prices
train = pd.merge(train, oil, on='date', how='left')
test = pd.merge(test, oil, on='date', how='left')

# Merge with holidays/events
train = pd.merge(train, holidays_events, on='date', how='left', suffixes=('', '_holiday'))
test = pd.merge(test, holidays_events, on='date', how='left', suffixes=('', '_holiday'))

# Merge with transactions (only for train data)
train = pd.merge(train, transactions, on=['date', 'store_nbr'], how='left')


In [6]:
# Fill missing 'onpromotion' values with 0
train['onpromotion'].fillna(0, inplace=True)
test['onpromotion'].fillna(0, inplace=True)

# Fill missing 'transactions' with 0 in train data
train['transactions'].fillna(0, inplace=True)

# Fill missing 'dcoilwtico' values with the mean
train['dcoilwtico'].fillna(train['dcoilwtico'].mean(), inplace=True)
test['dcoilwtico'].fillna(test['dcoilwtico'].mean(), inplace=True)


In [7]:
# Ensure correct data types
train['onpromotion'] = train['onpromotion'].astype(int)
test['onpromotion'] = test['onpromotion'].astype(int)

train['store_nbr'] = train['store_nbr'].astype('category')
test['store_nbr'] = test['store_nbr'].astype('category')

train['family'] = train['family'].astype('category')
test['family'] = test['family'].astype('category')


# **4. Feature Engineering**

In [8]:
def create_date_features(df):
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype('int')
    df['is_weekend'] = df['day_of_week'].isin([5,6]).astype(int)
    df['quarter'] = df['date'].dt.quarter
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    return df

train = create_date_features(train)
test = create_date_features(test)


In [9]:
from sklearn.preprocessing import LabelEncoder

# Combine train and test to ensure consistent encoding
combined = pd.concat([train, test], axis=0, sort=False)

# List of categorical columns to encode
cat_cols = ['family', 'city', 'state', 'type', 'cluster', 'locale', 'locale_name', 'description', 'transferred', 'type_holiday']

# Initialize LabelEncoders for each categorical column
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))
    label_encoders[col] = le

# Split back into train and test
train = combined[combined['sales'].notnull()]
test = combined[combined['sales'].isnull()]


In [10]:
# Sort data for creating lag features
train.sort_values(by=['store_nbr', 'family', 'date'], inplace=True)

# Create lag features for sales
for lag in [1, 3, 7, 14, 28]:
    train[f'lag_{lag}'] = train.groupby(['store_nbr', 'family'])['sales'].shift(lag)

# Create rolling mean features using transform
for window in [7, 14, 28]:
    train[f'rolling_mean_{window}'] = train.groupby(['store_nbr', 'family'])['sales'].shift(1).transform(lambda x: x.rolling(window).mean())

# Create exponentially weighted mean using transform
train['ewm'] = train.groupby(['store_nbr', 'family'])['sales'].shift(1).transform(lambda x: x.ewm(span=7).mean())

# Fill missing values in lag and rolling features with 0
lag_cols = [col for col in train.columns if 'lag_' in col or 'rolling_mean_' in col or col == 'ewm']
train[lag_cols] = train[lag_cols].fillna(0)


In [11]:
# Check the first few rows
train[['date', 'store_nbr', 'family', 'sales', 'ewm']].head(10)


Unnamed: 0,date,store_nbr,family,sales,ewm
0,2013-01-01,1,0,0.0,0.0
1782,2013-01-02,1,0,2.0,0.0
3564,2013-01-03,1,0,3.0,1.142857
5346,2013-01-04,1,0,3.0,1.945946
7128,2013-01-05,1,0,5.0,2.331429
8910,2013-01-06,1,0,2.0,3.206146
10692,2013-01-07,1,0,0.0,2.839323
12474,2013-01-08,1,0,2.0,2.020145
14256,2013-01-09,1,0,2.0,2.014549
16038,2013-01-10,1,0,2.0,2.010616


# **5. Prepare Data for Modeling**

In [12]:
# Define the target variable
target = 'sales'

# List of all feature columns
features = [
    # Basic features
    'store_nbr', 'family', 'onpromotion', 'transactions', 'dcoilwtico',
    # Store information
    'city', 'state', 'type', 'cluster',
    # Date features
    'year', 'month', 'day', 'day_of_week', 'week_of_year',
    'is_weekend', 'quarter', 'is_month_start', 'is_month_end',
    # Holiday and event information
    'type_holiday', 'locale', 'locale_name', 'description', 'transferred',
    # Lag and rolling features
] + lag_cols


In [13]:
# Check for missing values in features
missing_values = train[features].isnull().sum()
print("Missing values in features:")
print(missing_values[missing_values > 0])


Missing values in features:
Series([], dtype: int64)


# **6. Time Series Cross-Validation**

In [14]:
from sklearn.model_selection import TimeSeriesSplit

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)


# **7. Define RMSLE Scorer**

In [15]:
from sklearn.metrics import make_scorer

# Define RMSLE as the evaluation metric
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

rmsle_scorer = make_scorer(rmsle, greater_is_better=False)


# **8. Hyperparameter Tuning with LightGBM**

In [16]:
# Parameter grid for LightGBM
param_grid = {
    'num_leaves': [31, 61],
    'learning_rate': [0.01, 0.05],
    'n_estimators': [1000, 2000],
    'max_depth': [-1, 10],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 0.1],
    'min_child_weight': [0.001, 0.01]
}


In [17]:
import lightgbm as lgb

lgb_model = lgb.LGBMRegressor(objective='regression', random_state=42)


In [18]:
# Check for negative sales values
negative_sales = train[train['sales'] < 0]
print(f"Number of negative sales records: {len(negative_sales)}")

# Display negative sales records (optional)
display(negative_sales.head())


Number of negative sales records: 0


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,...,is_month_end,lag_1,lag_3,lag_7,lag_14,lag_28,rolling_mean_7,rolling_mean_14,rolling_mean_28,ewm


In [19]:
# Set negative sales to zero
train.loc[train['sales'] < 0, 'sales'] = 0


In [20]:
# Confirm no negative sales remain
assert (train['sales'] >= 0).all(), "There are still negative sales values."


In [21]:
# Check for negative sales values in the training data
negative_sales = train[train['sales'] < 0]
print(f"Number of negative sales records: {len(negative_sales)}")


Number of negative sales records: 0


In [22]:
# Remove records with negative sales
train = train[train['sales'] >= 0].copy()


In [23]:
# Confirm no negative sales remain
assert (train['sales'] >= 0).all(), "There are still negative sales values."


In [24]:
# List of lag feature columns
lag_cols = [col for col in train.columns if 'lag_' in col or 'rolling_mean_' in col or col == 'ewm']

# Check for negative values in lag features
negative_lag_values = train[lag_cols].lt(0).any()
print("Negative values in lag features:")
print(negative_lag_values)


Negative values in lag features:
lag_1              False
lag_3              False
lag_7              False
lag_14             False
lag_28             False
rolling_mean_7     False
rolling_mean_14    False
rolling_mean_28    False
ewm                False
dtype: bool


In [25]:
# Set negative lag feature values to zero
train[lag_cols] = train[lag_cols].clip(lower=0)


In [26]:
# Confirm no negative values in lag features
assert (train[lag_cols] >= 0).all().all(), "There are still negative values in lag features."


In [27]:
from sklearn.metrics import mean_squared_log_error, make_scorer

def rmsle(y_true, y_pred):
    # Ensure no negative values
    y_true = np.maximum(0, y_true)
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Update the scorer
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)


In [28]:
# Simplified parameter grid
param_grid = {
    'num_leaves': [31],
    'learning_rate': [0.01],
    'n_estimators': [500],
    'max_depth': [-1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_alpha': [0],
    'reg_lambda': [0],
    'min_child_weight': [0.001]
}


In [29]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=tscv,
    scoring=rmsle_scorer,
    verbose=1,
    n_jobs=-1
)


In [30]:
# Fit grid search
grid_search.fit(train[features], train[target])


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.236625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3421
[LightGBM] [Info] Number of data points in the train set: 3054348, number of used features: 32
[LightGBM] [Info] Start training from score 359.020892
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.488213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3343
[LightGBM] [Info] Number of data points in the train set: 1018116, number of used features: 32
[LightGBM] [Info] Start training from score 354.809181
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.301266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3367
[LightGBM] [Info] Number of data points in the train 

In [31]:
# Simplified parameter grid
param_grid = {
    'num_leaves': [31, 61],  # Reduced options
    'learning_rate': [0.01],
    'n_estimators': [500],  # Reduced number of estimators
    'max_depth': [5, 10],   # Limited depth to prevent overfitting
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_alpha': [0.0, 0.1],
    'reg_lambda': [0.0, 0.1],
    'min_child_weight': [0.001]
}


In [32]:
# Sample 10% of the data for tuning
train_sample = train.sample(frac=0.1, random_state=42)


In [33]:
from sklearn.model_selection import RandomizedSearchCV

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_grid,
    n_iter=50,  # Reduced for computational efficiency
    cv=tscv,
    scoring=rmsle_scorer,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit random search
random_search.fit(train[features], train[target])


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.254332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3352
[LightGBM] [Info] Number of data points in the train set: 509058, number of used features: 32
[LightGBM] [Info] Start training from score 460.562241
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.070269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3374
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 32
[LightGBM] [Info] Start training from score 291.144221
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.553410 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3343
[LightGBM] [Info] Number of data points in the train

KeyboardInterrupt: 

In [37]:
from sklearn.model_selection import cross_val_score

# Define the model with initial parameters
lgb_model = lgb.LGBMRegressor(objective='regression', random_state=42)

# Define Time Series Split
tscv = TimeSeriesSplit(n_splits=3)

# Function to perform cross-validation with early stopping
def cv_with_early_stopping(params):
    model = lgb.LGBMRegressor(**params, objective='regression', random_state=42)
    cv_scores = []
    for train_index, val_index in tscv.split(train_sample):
        X_train_cv, X_val_cv = train_sample.iloc[train_index][features], train_sample.iloc[val_index][features]
        y_train_cv, y_val_cv = train_sample.iloc[train_index][target], train_sample.iloc[val_index][target]
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)],
            eval_metric='rmse',
            early_stopping_rounds=50,
            verbose=False
        )
        y_pred = model.predict(X_val_cv, num_iteration=model.best_iteration_)
        score = rmsle(y_val_cv, y_pred)
        cv_scores.append(score)
    return np.mean(cv_scores)


In [38]:
import itertools

# List to store results
results = []

# Iterate over all combinations
for params in ParameterGrid(param_grid):
    score = cv_with_early_stopping(params)
    results.append((params, score))
    print(f"Tested params: {params}, RMSLE: {score:.5f}")


TypeError: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [39]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
import seaborn as sns

# Define RMSLE function
def rmsle(y_true, y_pred):
    # Ensure predictions are non-negative
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Define the parameter grid
param_grid = {
    'num_leaves': [31, 61],  # Reduced options
    'learning_rate': [0.01],
    'n_estimators': [500],  # Reduced number of estimators
    'max_depth': [5, 10],   # Limited depth to prevent overfitting
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_alpha': [0.0, 0.1],
    'reg_lambda': [0.0, 0.1],
    'min_child_weight': [0.001]
}

# Sample 10% of the data for tuning
train_sample = train.sample(frac=0.1, random_state=42)

# Define your features and target
# Replace this with your actual features list
features = ['store_nbr', 'family', 'onpromotion', 'day', 'month', 'year', 'lag_1', 'lag_3', 'lag_7', 'lag_14', 'lag_28', 'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_28', 'ewm', 'day_sin', 'day_cos']
target = 'sales'

# Define Time Series Split
tscv = TimeSeriesSplit(n_splits=3)

# Define function to perform cross-validation with early stopping
def cv_with_early_stopping(params):
    model = lgb.LGBMRegressor(**params, objective='regression', random_state=42)
    cv_scores = []
    for train_index, val_index in tscv.split(train_sample):
        X_train_cv = train_sample.iloc[train_index][features]
        X_val_cv = train_sample.iloc[val_index][features]
        y_train_cv = train_sample.iloc[train_index][target]
        y_val_cv = train_sample.iloc[val_index][target]
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)],
            eval_metric='rmse',
            early_stopping_rounds=50,
            verbose=False
        )
        y_pred = model.predict(X_val_cv, num_iteration=model.best_iteration_)
        score = rmsle(y_val_cv, y_pred)
        cv_scores.append(score)
    return np.mean(cv_scores)

# Import ParameterGrid
from sklearn.model_selection import ParameterGrid

# List to store results
results = []

# Iterate over all combinations
for params in ParameterGrid(param_grid):
    score = cv_with_early_stopping(params)
    results.append((params, score))
    print(f"Tested params: {params}, RMSLE: {score:.5f}")

# Find the best parameters
best_params, best_score = min(results, key=lambda x: x[1])
print(f"Best Parameters: {best_params}")
print(f"Best RMSLE: {best_score:.5f}")

# Now proceed to train the model on the full training data with best_params
# Split data into training and validation sets
validation_cutoff_date = train['date'].max() - pd.DateOffset(months=1)
train_data = train[train['date'] <= validation_cutoff_date]
valid_data = train[train['date'] > validation_cutoff_date]

X_train = train_data[features]
y_train = train_data[target]
X_valid = valid_data[features]
y_valid = valid_data[target]

# Initialize model with best parameters
best_model = lgb.LGBMRegressor(
    **best_params,
    objective='regression',
    random_state=42
)

# Train the model with early stopping
best_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    early_stopping_rounds=50,
    verbose=50
)

# Predict on validation set
y_pred_valid = best_model.predict(X_valid, num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
y_pred_valid = np.maximum(0, y_pred_valid)

# Calculate RMSLE
validation_rmsle = rmsle(y_valid, y_pred_valid)
print(f"Validation RMSLE: {validation_rmsle:.5f}")

# Get feature importances
importances = pd.DataFrame({
    'Feature': features,
    'Importance': best_model.feature_importances_
})

# Sort and plot top features
importances.sort_values(by='Importance', ascending=False, inplace=True)

plt.figure(figsize=(12, 8))
sns.barplot(data=importances.head(20), x='Importance', y='Feature')
plt.title('Top 20 Feature Importances')
plt.show()

# Predict on test data
test['sales'] = best_model.predict(test[features], num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
test['sales'] = np.maximum(0, test['sales'])

# Prepare submission file
submission = test[['id', 'sales']].copy()
submission.to_csv('submission.csv', index=False)

# Preview submission
print("Submission file preview:")
display(submission.head())


TypeError: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [40]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
import seaborn as sns

# Define RMSLE function
def rmsle(y_true, y_pred):
    # Ensure predictions are non-negative
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Ensure date column is in datetime format
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

# Create time-based features
train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

test['day'] = test['date'].dt.day
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year

# Create sine and cosine transformations of the 'day' feature
train['day_sin'] = np.sin(2 * np.pi * train['day'] / 31)
train['day_cos'] = np.cos(2 * np.pi * train['day'] / 31)

test['day_sin'] = np.sin(2 * np.pi * test['day'] / 31)
test['day_cos'] = np.cos(2 * np.pi * test['day'] / 31)

# Create lag features
# Combine train and test data to create lag features
combined = pd.concat([train, test], sort=False)

# Sort combined data
combined.sort_values(by=['store_nbr', 'family', 'date'], inplace=True)

# Create lag features for combined data
for lag in [1, 3, 7, 14, 28]:
    combined[f'lag_{lag}'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(lag)

# Create rolling mean features
for window in [7, 14, 28]:
    combined[f'rolling_mean_{window}'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(window).mean()

# Create exponentially weighted mean
combined['ewm'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(1).ewm(span=7).mean().reset_index(level=[0,1], drop=True)

# Split combined data back into train and test
train = combined[combined['date'] < test['date'].min()]
test = combined[combined['date'] >= test['date'].min()]

# Fill missing values in lag features
lag_cols = [col for col in train.columns if 'lag_' in col or 'rolling_mean_' in col or col == 'ewm']
train[lag_cols] = train[lag_cols].fillna(0)
test[lag_cols] = test[lag_cols].fillna(0)

# Encode categorical features
categorical_cols = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster']

from sklearn.preprocessing import LabelEncoder

for col in categorical_cols:
    le = LabelEncoder()
    combined_col = pd.concat([train[col], test[col]], axis=0)
    le.fit(combined_col)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Define your features and target
features = ['store_nbr', 'family', 'onpromotion', 'day', 'month', 'year', 'day_sin', 'day_cos'] + lag_cols
target = 'sales'

# Sample 10% of the data for tuning
train_sample = train.sample(frac=0.1, random_state=42)

# Define Time Series Split
tscv = TimeSeriesSplit(n_splits=3)

# Define function to perform cross-validation with early stopping
def cv_with_early_stopping(params):
    model = lgb.LGBMRegressor(**params, objective='regression', random_state=42)
    cv_scores = []
    for train_index, val_index in tscv.split(train_sample):
        X_train_cv = train_sample.iloc[train_index][features]
        X_val_cv = train_sample.iloc[val_index][features]
        y_train_cv = train_sample.iloc[train_index][target]
        y_val_cv = train_sample.iloc[val_index][target]
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)],
            eval_metric='rmse',
            early_stopping_rounds=50,
            verbose=False
        )
        y_pred = model.predict(X_val_cv, num_iteration=model.best_iteration_)
        score = rmsle(y_val_cv, y_pred)
        cv_scores.append(score)
    return np.mean(cv_scores)

# Define the parameter grid
param_grid = {
    'num_leaves': [31, 61],  # Reduced options
    'learning_rate': [0.01],
    'n_estimators': [500],  # Reduced number of estimators
    'max_depth': [5, 10],   # Limited depth to prevent overfitting
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_alpha': [0.0, 0.1],
    'reg_lambda': [0.0, 0.1],
    'min_child_weight': [0.001]
}

# Import ParameterGrid
from sklearn.model_selection import ParameterGrid

# List to store results
results = []

# Iterate over all combinations
for params in ParameterGrid(param_grid):
    score = cv_with_early_stopping(params)
    results.append((params, score))
    print(f"Tested params: {params}, RMSLE: {score:.5f}")

# Find the best parameters
best_params, best_score = min(results, key=lambda x: x[1])
print(f"Best Parameters: {best_params}")
print(f"Best RMSLE: {best_score:.5f}")

# Now proceed to train the model on the full training data with best_params
# Split data into training and validation sets
validation_cutoff_date = train['date'].max() - pd.DateOffset(months=1)
train_data = train[train['date'] <= validation_cutoff_date]
valid_data = train[train['date'] > validation_cutoff_date]

X_train = train_data[features]
y_train = train_data[target]
X_valid = valid_data[features]
y_valid = valid_data[target]

# Initialize model with best parameters
best_model = lgb.LGBMRegressor(
    **best_params,
    objective='regression',
    random_state=42
)

# Train the model with early stopping
best_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    early_stopping_rounds=50,
    verbose=50
)

# Predict on validation set
y_pred_valid = best_model.predict(X_valid, num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
y_pred_valid = np.maximum(0, y_pred_valid)

# Calculate RMSLE
validation_rmsle = rmsle(y_valid, y_pred_valid)
print(f"Validation RMSLE: {validation_rmsle:.5f}")

# Get feature importances
importances = pd.DataFrame({
    'Feature': features,
    'Importance': best_model.feature_importances_
})

# Sort and plot top features
importances.sort_values(by='Importance', ascending=False, inplace=True)

plt.figure(figsize=(12, 8))
sns.barplot(data=importances.head(20), x='Importance', y='Feature')
plt.title('Top 20 Feature Importances')
plt.show()

# Predict on test data
test['sales'] = best_model.predict(test[features], num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
test['sales'] = np.maximum(0, test['sales'])

# Prepare submission file
submission = test[['id', 'sales']].copy()
submission.to_csv('submission.csv', index=False)

# Preview submission
print("Submission file preview:")
display(submission.head())


IndexError: Too many levels: Index has only 1 level, not 2

In [42]:
# Ensure 'sales' column exists in test data
test['sales'] = np.nan  # Placeholder since 'sales' is unknown in test data

# Combine train and test data to create lag features
combined = pd.concat([train, test], sort=False)

# Sort combined data
combined.sort_values(by=['store_nbr', 'family', 'date'], inplace=True)

# Create lag features for combined data
for lag in [1, 3, 7, 14, 28]:
    combined[f'lag_{lag}'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(lag)

# Create rolling mean features
for window in [7, 14, 28]:
    combined[f'rolling_mean_{window}'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(window).mean()

# Create exponentially weighted mean
combined['ewm'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(1).ewm(span=7).mean()

# Fill missing values in lag features
lag_cols = [col for col in combined.columns if 'lag_' in col or 'rolling_mean_' in col or col == 'ewm']
combined[lag_cols] = combined[lag_cols].fillna(0)

# Split combined data back into train and test
train = combined[combined['date'] < test['date'].min()]
test = combined[combined['date'] >= test['date'].min()]

# Drop the placeholder 'sales' column from test data
test.drop(columns=['sales'], inplace=True)


In [43]:
features = ['store_nbr', 'family', 'onpromotion', 'day', 'month', 'year', 'day_sin', 'day_cos'] + lag_cols


In [44]:
# Calculate RMSLE
validation_rmsle = rmsle(y_valid, y_pred_valid)
print(f"Validation RMSLE: {validation_rmsle:.5f}")


NameError: name 'y_valid' is not defined

In [45]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error

# Define RMSLE function
def rmsle(y_true, y_pred):
    # Ensure predictions are non-negative
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Define your features and target
features = ['store_nbr', 'family', 'onpromotion', 'day', 'month', 'year', 'day_sin', 'day_cos'] + lag_cols
target = 'sales'

# Split data into training and validation sets
validation_cutoff_date = train['date'].max() - pd.DateOffset(months=1)
train_data = train[train['date'] <= validation_cutoff_date].copy()
valid_data = train[train['date'] > validation_cutoff_date].copy()

X_train = train_data[features]
y_train = train_data[target]
X_valid = valid_data[features]
y_valid = valid_data[target]

# Ensure all features are available in X_train and X_valid
print("Features in X_train:", X_train.columns.tolist())
print("Features in X_valid:", X_valid.columns.tolist())

# Initialize model with best parameters
best_model = lgb.LGBMRegressor(
    **best_params,
    objective='regression',
    random_state=42
)

# Train the model with early stopping
best_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    early_stopping_rounds=50,
    verbose=50
)

# Predict on validation set
y_pred_valid = best_model.predict(X_valid, num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
y_pred_valid = np.maximum(0, y_pred_valid)

# Calculate RMSLE
validation_rmsle = rmsle(y_valid, y_pred_valid)
print(f"Validation RMSLE: {validation_rmsle:.5f}")


Features in X_train: ['store_nbr', 'family', 'onpromotion', 'day', 'month', 'year', 'day_sin', 'day_cos', 'lag_1', 'lag_3', 'lag_7', 'lag_14', 'lag_28', 'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_28', 'ewm']
Features in X_valid: ['store_nbr', 'family', 'onpromotion', 'day', 'month', 'year', 'day_sin', 'day_cos', 'lag_1', 'lag_3', 'lag_7', 'lag_14', 'lag_28', 'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_28', 'ewm']


NameError: name 'best_params' is not defined

In [46]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
import seaborn as sns

# Define RMSLE function
def rmsle(y_true, y_pred):
    # Ensure predictions are non-negative
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Ensure date column is in datetime format
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

# Create time-based features
train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

test['day'] = test['date'].dt.day
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year

# Create sine and cosine transformations of the 'day' feature
train['day_sin'] = np.sin(2 * np.pi * train['day'] / 31)
train['day_cos'] = np.cos(2 * np.pi * train['day'] / 31)

test['day_sin'] = np.sin(2 * np.pi * test['day'] / 31)
test['day_cos'] = np.cos(2 * np.pi * test['day'] / 31)

# Ensure 'sales' column exists in test data
test['sales'] = np.nan  # Placeholder since 'sales' is unknown in test data

# Combine train and test data to create lag features
combined = pd.concat([train, test], sort=False)

# Sort combined data
combined.sort_values(by=['store_nbr', 'family', 'date'], inplace=True)

# Create lag features for combined data
for lag in [1, 3, 7, 14, 28]:
    combined[f'lag_{lag}'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(lag)

# Create rolling mean features
for window in [7, 14, 28]:
    combined[f'rolling_mean_{window}'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(window).mean()

# Create exponentially weighted mean
combined['ewm'] = combined.groupby(['store_nbr', 'family'])['sales'].shift(1).ewm(span=7).mean()

# Fill missing values in lag features
lag_cols = [col for col in combined.columns if 'lag_' in col or 'rolling_mean_' in col or col == 'ewm']
combined[lag_cols] = combined[lag_cols].fillna(0)

# Split combined data back into train and test
train = combined[combined['date'] < test['date'].min()].copy()
test = combined[combined['date'] >= test['date'].min()].copy()

# Drop the placeholder 'sales' column from test data
test.drop(columns=['sales'], inplace=True)

# Encode categorical features
categorical_cols = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster']

from sklearn.preprocessing import LabelEncoder

for col in categorical_cols:
    le = LabelEncoder()
    combined_col = pd.concat([train[col], test[col]], axis=0)
    le.fit(combined_col)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Define your features and target
features = ['store_nbr', 'family', 'onpromotion', 'day', 'month', 'year', 'day_sin', 'day_cos'] + lag_cols
target = 'sales'

# Sample 10% of the data for tuning
train_sample = train.sample(frac=0.1, random_state=42)

# Define Time Series Split
tscv = TimeSeriesSplit(n_splits=3)

# Define function to perform cross-validation with early stopping
def cv_with_early_stopping(params):
    model = lgb.LGBMRegressor(**params, objective='regression', random_state=42)
    cv_scores = []
    for train_index, val_index in tscv.split(train_sample):
        X_train_cv = train_sample.iloc[train_index][features]
        X_val_cv = train_sample.iloc[val_index][features]
        y_train_cv = train_sample.iloc[train_index][target]
        y_val_cv = train_sample.iloc[val_index][target]
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)],
            eval_metric='rmse',
            early_stopping_rounds=50,
            verbose=False
        )
        y_pred = model.predict(X_val_cv, num_iteration=model.best_iteration_)
        score = rmsle(y_val_cv, y_pred)
        cv_scores.append(score)
    return np.mean(cv_scores)

# Define the parameter grid
param_grid = {
    'num_leaves': [31, 61],  # Reduced options
    'learning_rate': [0.01],
    'n_estimators': [500],  # Reduced number of estimators
    'max_depth': [5, 10],   # Limited depth to prevent overfitting
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_alpha': [0.0, 0.1],
    'reg_lambda': [0.0, 0.1],
    'min_child_weight': [0.001]
}

# List to store results
results = []

# Import ParameterGrid
from sklearn.model_selection import ParameterGrid

# Iterate over all combinations
for params in ParameterGrid(param_grid):
    score = cv_with_early_stopping(params)
    results.append((params, score))
    print(f"Tested params: {params}, RMSLE: {score:.5f}")

# Find the best parameters
best_params, best_score = min(results, key=lambda x: x[1])
print(f"Best Parameters: {best_params}")
print(f"Best RMSLE: {best_score:.5f}")

# Now proceed to train the model on the full training data with best_params
# Split data into training and validation sets
validation_cutoff_date = train['date'].max() - pd.DateOffset(months=1)
train_data = train[train['date'] <= validation_cutoff_date].copy()
valid_data = train[train['date'] > validation_cutoff_date].copy()

X_train = train_data[features]
y_train = train_data[target]
X_valid = valid_data[features]
y_valid = valid_data[target]

# Ensure all features are available in X_train and X_valid
print("Features in X_train:", X_train.columns.tolist())
print("Features in X_valid:", X_valid.columns.tolist())

# Initialize model with best parameters
best_model = lgb.LGBMRegressor(
    **best_params,
    objective='regression',
    random_state=42
)

# Train the model with early stopping
best_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    early_stopping_rounds=50,
    verbose=50
)

# Predict on validation set
y_pred_valid = best_model.predict(X_valid, num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
y_pred_valid = np.maximum(0, y_pred_valid)

# Calculate RMSLE
validation_rmsle = rmsle(y_valid, y_pred_valid)
print(f"Validation RMSLE: {validation_rmsle:.5f}")

# Get feature importances
importances = pd.DataFrame({
    'Feature': features,
    'Importance': best_model.feature_importances_
})

# Sort and plot top features
importances.sort_values(by='Importance', ascending=False, inplace=True)

plt.figure(figsize=(12, 8))
sns.barplot(data=importances.head(20), x='Importance', y='Feature')
plt.title('Top 20 Feature Importances')
plt.show()

# Predict on test data
test['sales'] = best_model.predict(test[features], num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
test['sales'] = np.maximum(0, test['sales'])

# Prepare submission file
submission = test[['id', 'sales']].copy()
submission['id'] = submission['id'].astype(int)
submission.to_csv('submission.csv', index=False)

# Preview submission
print("Submission file preview:")
display(submission.head())


TypeError: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
import lightgbm
print('LightGBM version:', lightgbm.__version__)


In [47]:
pip install --upgrade lightgbm


Note: you may need to restart the kernel to use updated packages.


In [48]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import lightgbm.callback

from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
import seaborn as sns

# Define RMSLE function
def rmsle(y_true, y_pred):
    # Ensure predictions are non-negative
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# [Include all your data preprocessing code here]

# Define your features and target
features = ['store_nbr', 'family', 'onpromotion', 'day', 'month', 'year', 'day_sin', 'day_cos'] + lag_cols
target = 'sales'

# Sample 10% of the data for tuning
train_sample = train.sample(frac=0.1, random_state=42)

# Define Time Series Split
tscv = TimeSeriesSplit(n_splits=3)

# Modify the cv_with_early_stopping function
def cv_with_early_stopping(params):
    model = lgb.LGBMRegressor(**params, objective='regression', random_state=42)
    cv_scores = []
    for train_index, val_index in tscv.split(train_sample):
        X_train_cv = train_sample.iloc[train_index][features]
        X_val_cv = train_sample.iloc[val_index][features]
        y_train_cv = train_sample.iloc[train_index][target]
        y_val_cv = train_sample.iloc[val_index][target]
        
        # Remove 'verbose' parameter
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)],
            eval_metric='rmse',
            callbacks=[lgb.callback.early_stopping(stopping_rounds=50)]
        )
        y_pred = model.predict(X_val_cv, num_iteration=model.best_iteration_)
        score = rmsle(y_val_cv, y_pred)
        cv_scores.append(score)
    return np.mean(cv_scores)

# Define the parameter grid
param_grid = {
    'num_leaves': [31, 61],
    'learning_rate': [0.01],
    'n_estimators': [500],
    'max_depth': [5, 10],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_alpha': [0.0, 0.1],
    'reg_lambda': [0.0, 0.1],
    'min_child_weight': [0.001]
}

# List to store results
results = []

# Import ParameterGrid
from sklearn.model_selection import ParameterGrid

# Iterate over all combinations
for params in ParameterGrid(param_grid):
    score = cv_with_early_stopping(params)
    results.append((params, score))
    print(f"Tested params: {params}, RMSLE: {score:.5f}")

# Find the best parameters
best_params, best_score = min(results, key=lambda x: x[1])
print(f"Best Parameters: {best_params}")
print(f"Best RMSLE: {best_score:.5f}")

# Proceed with model training using best_params

# Split data into training and validation sets
validation_cutoff_date = train['date'].max() - pd.DateOffset(months=1)
train_data = train[train['date'] <= validation_cutoff_date].copy()
valid_data = train[train['date'] > validation_cutoff_date].copy()

X_train = train_data[features]
y_train = train_data[target]
X_valid = valid_data[features]
y_valid = valid_data[target]

# Ensure all features are available in X_train and X_valid
print("Features in X_train:", X_train.columns.tolist())
print("Features in X_valid:", X_valid.columns.tolist())

# Initialize model with best parameters
best_model = lgb.LGBMRegressor(
    **best_params,
    objective='regression',
    random_state=42
)

# Train the model without 'verbose' parameter
best_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    callbacks=[
        lgb.callback.early_stopping(stopping_rounds=50),
        lgb.callback.log_evaluation(period=50)
    ]
)

# Predict on validation set
y_pred_valid = best_model.predict(X_valid, num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
y_pred_valid = np.maximum(0, y_pred_valid)

# Calculate RMSLE
validation_rmsle = rmsle(y_valid, y_pred_valid)
print(f"Validation RMSLE: {validation_rmsle:.5f}")

# Proceed with the rest of your code for feature importance, test predictions, and submission preparation.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2643
[LightGBM] [Info] Number of data points in the train set: 76361, number of used features: 17
[LightGBM] [Info] Start training from score 357.487168
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmse: 273.184	valid_0's l2: 74629.6
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2681
[LightGBM] [Info] Number of data points in the train set: 152719, number of used features: 17
[LightGBM] [Info] Start training from score 356.776598
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmse: 271.81	valid_0's l2

In [50]:
# Predict on test data
test['sales'] = best_model.predict(test[features], num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
test['sales'] = np.maximum(0, test['sales'])


In [51]:
# Prepare submission file
submission = test[['id', 'sales']].copy()

# Ensure 'id' is of integer type if necessary
submission['id'] = submission['id'].astype(int)

# Save submission file
submission.to_csv('submission.csv', index=False)


In [52]:
# Preview submission
print("Submission file preview:")
display(submission.head())

# Check the shape of the submission file
print(f"Submission file shape: {submission.shape}")


Submission file preview:


Unnamed: 0,id,sales
0,3000888,7.902342
1782,3002670,7.902342
3564,3004452,7.902342
5346,3006234,7.902342
7128,3008016,7.902342


Submission file shape: (28512, 2)


In [53]:
import joblib

# Save the trained model to a file
joblib.dump(best_model, 'best_model.pkl')


['best_model.pkl']

In [54]:
import os

# List all files in the working directory
print("Files in /kaggle/working/:")
for file in os.listdir('/kaggle/working/'):
    print(file)


Files in /kaggle/working/:
best_model.pkl
submission.csv
.virtual_documents


In [55]:
# Make predictions on the test set
test['sales'] = best_model.predict(test[features], num_iteration=best_model.best_iteration_)

# Ensure no negative predictions
test['sales'] = np.maximum(0, test['sales'])

# Prepare the submission DataFrame
submission = test[['id', 'sales']].copy()

# Ensure 'id' is of integer type if required
submission['id'] = submission['id'].astype(int)

# Save the submission file to the working directory
submission.to_csv('submission.csv', index=False)

# Optional: Display the first few rows of the submission file
print("Submission file preview:")
display(submission.head())

# Check the shape of the submission file
print(f"Submission file shape: {submission.shape}")


Submission file preview:


Unnamed: 0,id,sales
0,3000888,7.902342
1782,3002670,7.902342
3564,3004452,7.902342
5346,3006234,7.902342
7128,3008016,7.902342


Submission file shape: (28512, 2)


In [56]:
import os

# List all files in the working directory
print("Files in /kaggle/working/:")
for file in os.listdir('/kaggle/working/'):
    print(file)


Files in /kaggle/working/:
best_model.pkl
submission.csv
.virtual_documents


In [57]:
import joblib

# Save the trained model to a file
joblib.dump(best_model, 'best_model.pkl')


['best_model.pkl']

In [58]:
import json

with open('best_params.json', 'w') as f:
    json.dump(best_params, f)


In [59]:
import os

# List all files in the working directory
print("Files in /kaggle/working/:")
for file in os.listdir('/kaggle/working/'):
    print(file)


Files in /kaggle/working/:
best_model.pkl
best_params.json
submission.csv
.virtual_documents


In [60]:
import joblib

# Load the model from the file
best_model = joblib.load('best_model.pkl')


In [61]:
import json

with open('best_params.json', 'r') as f:
    best_params = json.load(f)
