In [None]:
# Colab setup: mount Google Drive, then put the project folder on the import
# path so packages stored there (the `scripts` package imported further down)
# can be found.
import sys

from google.colab import drive

drive.mount('/content/drive')

FOLDERNAME='ML_final_project'
sys.path.append(f'/content/drive/My Drive/{FOLDERNAME}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import warnings

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

from scripts.data_processor import DataPreprocessor

In [None]:
def wmae_score(y_true, y_pred, weights):
    """Weighted mean absolute error: ``sum(w * |y_true - y_pred|) / sum(w)``.

    The three inputs are compared element by element in the order given.
    They are converted to NumPy arrays first, so pandas index labels are
    ignored; without this, subtracting two Series with different indexes
    would align on labels instead of comparing row i with row i.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.
    weights : array-like
        Per-observation weights, same shape as ``y_true``.

    Returns
    -------
    float
        The weighted mean absolute error.

    Raises
    ------
    ValueError
        If the inputs do not share one shape, or if the weights sum to zero
        (which also covers empty input).
    """
    y_true=np.asarray(y_true, dtype=float)
    y_pred=np.asarray(y_pred, dtype=float)
    weights=np.asarray(weights, dtype=float)

    if not (y_true.shape == y_pred.shape == weights.shape):
        raise ValueError(
            "y_true, y_pred and weights must have the same shape, got "
            f"{y_true.shape}, {y_pred.shape} and {weights.shape}"
        )

    total_weight=np.sum(weights)
    if total_weight == 0:
        raise ValueError("weights must sum to a non-zero value")

    return np.sum(weights * np.abs(y_true - y_pred)) / total_weight

In [None]:
# Project root on the mounted Drive and the folder holding the input CSVs.
FOLDERNAME='/content/drive/MyDrive/ML_final_project'
DATAPATH=f'{FOLDERNAME}/data/'

# Read the four input tables; the names on the left follow the order of the
# file names on the right.
train_df, test_df, features_df, stores_df=[
    pd.read_csv(DATAPATH+csv_name)
    for csv_name in ('train.csv', 'test.csv', 'features.csv', 'stores.csv')
]

# Report the shape of every table as a quick sanity check.
print(f"Data loaded:")
print(f"Train: {train_df.shape}")
print(f"Test: {test_df.shape}")
print(f"Features: {features_df.shape}")
print(f"Stores: {stores_df.shape}")

Data loaded:
Train: (421570, 5)
Test: (115064, 4)
Features: (8190, 12)
Stores: (45, 3)


In [None]:
# Parse the Date column of each table into pandas datetimes.
train_df['Date']=pd.to_datetime(train_df['Date'])
test_df['Date']=pd.to_datetime(test_df['Date'])
features_df['Date']=pd.to_datetime(features_df['Date'])

# Row count before filtering (not referenced again in the visible cells).
initial_rows=len(train_df)

# Keep only rows with non-negative sales and renumber the rows from 0.
train_df=train_df.loc[train_df['Weekly_Sales'].ge(0)].reset_index(drop=True)

In [None]:
def time_series_split_walmart(train_data, n_splits=5):
    """Build expanding-window train/validation folds from the ``Date`` column.

    The distinct dates are sorted and cut into ``n_splits + 1`` consecutive
    chunks of ``len(dates) // (n_splits + 1)`` dates each. Fold ``i`` trains
    on the first ``i + 1`` chunks and validates on the chunk that follows.
    Dates left over after the last chunk (when the number of distinct dates
    is not a multiple of ``n_splits + 1``) are not used by any fold.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame with a ``Date`` column. It is neither modified nor re-sorted.
    n_splits : int, default 5
        Number of folds to produce.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        One ``(train_idx, val_idx)`` pair per fold. Both arrays hold integer
        row *positions* into ``train_data`` in ascending order, so they are
        meant for ``.iloc`` / NumPy indexing and do not depend on the
        frame's index labels.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1, or if there are fewer than
        ``n_splits + 1`` distinct dates (every fold would be empty).
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    date_column=train_data['Date']
    dates=np.sort(date_column.unique())

    split_size=len(dates)//(n_splits+1)
    if split_size == 0:
        raise ValueError(
            f"need at least {n_splits+1} distinct dates for n_splits={n_splits}, "
            f"got {len(dates)}"
        )

    splits=[]
    for i in range(n_splits):
        train_end_idx=(i+1) * split_size
        val_end_idx=train_end_idx+split_size

        # Boolean row masks -> positions. Membership is tested on the frame
        # as passed in, so positions refer to the caller's row order.
        in_train=date_column.isin(dates[:train_end_idx]).to_numpy()
        in_val=date_column.isin(dates[train_end_idx:val_end_idx]).to_numpy()

        splits.append((np.flatnonzero(in_train), np.flatnonzero(in_val)))

    return splits

In [None]:
def run_cross_validation(train_data):
    """Run date-based cross-validation of the LightGBM model and print metrics.

    For each fold produced by ``time_series_split_walmart`` (5 folds), a fresh
    ``DataPreprocessor`` is fitted on the training rows, a
    ``lgb.LGBMRegressor`` is trained with early stopping on the validation
    rows, and MAE, RMSE, R2 and the weighted MAE are computed on the
    validation predictions clipped at zero. The weighted MAE uses weight 5
    for rows where ``IsHoliday`` is True and 1 where it is False.

    Reads the module-level ``stores_df`` and ``features_df``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame containing at least ``Date``, ``IsHoliday`` and
        ``Weekly_Sales``. It is not modified.

    Returns
    -------
    tuple (list of dict, float)
        ``cv_scores`` with one dict per fold (keys ``fold``, ``mae``,
        ``rmse``, ``r2``, ``wmae``) and the mean weighted MAE over the folds.
    """

    lgb_params={
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'num_leaves': 63,
        'learning_rate': 0.03,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'max_depth': 8,
        'min_child_samples': 20,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'verbose': -1,
        'random_state': 42,
        'n_estimators': 2000
    }

    # The fold indices are consumed positionally below (.iloc and NumPy
    # indexing). Working on a copy with a fresh 0..n-1 index makes labels and
    # positions coincide whatever index the caller's frame carries.
    train_data=train_data.reset_index(drop=True)

    feature_cols=[col for col in train_data.columns if col != 'Weekly_Sales']
    X=train_data[feature_cols]
    y=train_data['Weekly_Sales']

    # Holiday weeks count five times as much as regular weeks in WMAE.
    weights=train_data['IsHoliday'].map({True: 5, False: 1}).values

    splits=time_series_split_walmart(train_data, n_splits=5)
    cv_scores=[]
    wmae_scores=[]

    for fold, (train_idx, val_idx) in enumerate(splits):
        X_train_fold=X.iloc[train_idx]
        y_train_fold=y.iloc[train_idx]
        y_val_fold=y.iloc[val_idx]
        weights_val=weights[val_idx]

        # The validation rows are handed to the preprocessor *with* the target
        # column; presumably it is needed to derive lag/rolling features --
        # TODO confirm against DataPreprocessor.transform.
        val_data_with_target=train_data.iloc[val_idx]

        preprocessor=DataPreprocessor(
            stores_df=stores_df,
            features_df=features_df,
            lag_features=[1, 2, 4, 8, 12],
            rolling_windows=[4, 8, 12]
        )

        # NOTE(review): this assumes transform() returns exactly one row per
        # input row, in the same order, so the processed frames stay aligned
        # with y_train_fold / y_val_fold -- verify in DataPreprocessor.
        X_train_processed=preprocessor.fit(X_train_fold, y_train_fold).transform(X_train_fold)

        X_val_processed=preprocessor.transform(val_data_with_target)

        # Make sure the target is never offered to the model as a feature.
        if 'Weekly_Sales' in X_val_processed.columns:
            X_val_processed=X_val_processed.drop('Weekly_Sales', axis=1)

        model=lgb.LGBMRegressor(**lgb_params)
        # Stop once the validation metric has not improved for 100 rounds;
        # log_evaluation(0) is passed to silence per-iteration logging.
        model.fit(X_train_processed, y_train_fold,
                 eval_set=[(X_val_processed, y_val_fold)],
                 callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])

        y_pred=model.predict(X_val_processed)

        # Predictions below zero are floored at zero before scoring.
        y_pred=np.maximum(y_pred, 0)

        mae=mean_absolute_error(y_val_fold, y_pred)
        rmse=np.sqrt(mean_squared_error(y_val_fold, y_pred))
        r2=r2_score(y_val_fold, y_pred)
        wmae=wmae_score(y_val_fold, y_pred, weights_val)

        cv_scores.append({'fold': fold, 'mae': mae, 'rmse': rmse, 'r2': r2, 'wmae': wmae})
        wmae_scores.append(wmae)

        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}, WMAE: {wmae:.2f}")

    avg_mae=np.mean([score['mae'] for score in cv_scores])
    avg_rmse=np.mean([score['rmse'] for score in cv_scores])
    avg_r2=np.mean([score['r2'] for score in cv_scores])
    avg_wmae=np.mean(wmae_scores)

    print(f"CV results:")
    print(f"average MAE: {avg_mae:.2f}")
    print(f"average RMSE: {avg_rmse:.2f}")
    print(f"average R2: {avg_r2:.4f}")
    print(f"average WMAE: {avg_wmae:.2f}")

    return cv_scores, avg_wmae

In [None]:
def train_model(train_data):
    """Fit the preprocessing + LightGBM pipeline on every row of ``train_data``.

    The pipeline has two steps, ``'preprocessor'`` (a ``DataPreprocessor``
    built from the module-level ``stores_df`` and ``features_df``) and
    ``'model'`` (an ``lgb.LGBMRegressor``). After fitting, the function
    predicts on the same training rows, floors the predictions at zero and
    prints MAE, RMSE and R2 of that in-sample fit.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame including the ``Weekly_Sales`` target column.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.
    """
    final_params={
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'num_leaves': 63,
        'learning_rate': 0.03,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'max_depth': 8,
        'min_child_samples': 20,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'verbose': -1,
        'random_state': 42,
        'n_estimators': 2000
    }

    # Build the two steps separately, then chain them.
    feature_builder=DataPreprocessor(
        stores_df=stores_df,
        features_df=features_df,
        lag_features=[1, 2, 4, 8, 12],
        rolling_windows=[4, 8, 12]
    )
    regressor=lgb.LGBMRegressor(**final_params)
    pipeline=Pipeline(steps=[
        ('preprocessor', feature_builder),
        ('model', regressor),
    ])

    # Everything except the target is passed to the pipeline as input.
    input_columns=[name for name in train_data.columns if name != 'Weekly_Sales']
    X_train=train_data[input_columns]
    y_train=train_data['Weekly_Sales']

    pipeline.fit(X_train, y_train)

    # In-sample predictions, floored at zero.
    train_pred=np.maximum(pipeline.predict(X_train), 0)

    mae=mean_absolute_error(y_train, train_pred)
    rmse=np.sqrt(mean_squared_error(y_train, train_pred))
    r2=r2_score(y_train, train_pred)

    print(f"model training metrics:")
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

    return pipeline

In [None]:
# Separate target and inputs; in this cell they are only used to report shapes,
# since both functions below receive the full training frame.
y_train=train_df['Weekly_Sales']
X_train=train_df.drop(columns=['Weekly_Sales'])

print(f"Training data shape: {X_train.shape}")
print(f"Target shape: {y_train.shape}")

# Date-based cross-validation first, then the final fit on all training rows.
cv_scores, avg_wmae=run_cross_validation(train_df)

my_pipeline=train_model(train_df)

Training data shape: (420285, 4)
Target shape: (420285,)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's l1: 15098.8
  MAE: 15098.83, RMSE: 23202.30, R2: 0.0001, WMAE: 15665.83
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 15135.9
  MAE: 15135.90, RMSE: 22604.59, R2: -0.0006, WMAE: 14980.71
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's l1: 15008.9
  MAE: 15008.94, RMSE: 21887.28, R2: -0.0003, WMAE: 15055.80
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 15772.4
  MAE: 15772.36, RMSE: 24701.07, R2: -0.0010, WMAE: 16078.85
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 15199.8
  MAE: 15199.76, RMSE: 22066.79, R2: -0.0002, WMAE: 15265.65
CV results:
average MAE:

In [None]:
# Predict the test rows with the fitted pipeline and floor the result at zero.
test_predictions=np.maximum(my_pipeline.predict(test_df), 0)

In [None]:
# Assemble the submission: one row per test row, keyed by Store, Dept and Date.
test_submission=test_df[['Store', 'Dept', 'Date']].copy()
test_submission['Weekly_Sales']=test_predictions

# The Id column has the form "<Store>_<Dept>_<YYYY-MM-DD>".
test_submission['Date_str']=pd.to_datetime(test_submission['Date']).dt.strftime('%Y-%m-%d')
test_submission['Id']=(test_submission['Store'].astype(str)+'_' +
                       test_submission['Dept'].astype(str)+'_' +
                       test_submission['Date_str'])

submission=test_submission[['Id', 'Weekly_Sales']].copy()

submission_path=f"{FOLDERNAME}/submissions/lightgbm_submission.csv"
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"submission saved to: {submission_path}")

submission saved to: /content/drive/MyDrive/ML_final_project/lightgbm_submission.csv


In [None]:
# Persist the fitted pipeline (preprocessor + model) next to the submission.
model_path=f"{FOLDERNAME}/submissions/lightgbm_pipeline.pkl"
# Create the target folder first so the dump cannot fail on a missing directory.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(my_pipeline, model_path)
print(f"pipeline saved to: {model_path}")

pipeline saved to: /content/drive/MyDrive/ML_final_project/lightgbm_pipeline.pkl


In [None]:
# Final sanity report on the submission table: size, columns, completeness
# and sign of the predictions.
print(f"submission shape: {submission.shape}")
print(f"required columns: {submission.columns.tolist()}")
print(f"no missing values: {not submission.isnull().values.any()}")
print(f"no negative predictions: {submission['Weekly_Sales'].ge(0).all()}")

submission shape: (115064, 2)
required columns: ['Id', 'Weekly_Sales']
no missing values: True
no negative predictions: True
