In [2]:
# --- Environment setup: imports, plotting style, Google Drive mount ---

# Standard library
import gc
import os

# Third-party
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Colab-only helper for accessing Google Drive
from google.colab import drive

# Side effects are grouped after the imports: set the global matplotlib
# style, then mount Drive at the path the data location is rooted in.
plt.style.use('seaborn-v0_8')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Location of the preprocessed dataset on the mounted Google Drive.
# Adjacent string literals are concatenated into one path.
DATA_PATH = (
    "/content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/"
    "Rizvans Works/Saved Data/processed_data.parquet"
)

In [4]:
# Read the preprocessed parquet file into the notebook-wide frame `df`.
df = pd.read_parquet(path=DATA_PATH)

In [5]:
# Print a summary of the loaded frame (index range, column names and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 31 columns):
 #   Column              Dtype         
---  ------              -----         
 0   date                datetime64[ns]
 1   wm_yr_wk            int16         
 2   weekday             category      
 3   wday                int8          
 4   month               int8          
 5   year                int16         
 6   d                   category      
 7   event_name_1        category      
 8   event_type_1        category      
 9   event_name_2        category      
 10  event_type_2        category      
 11  snap_CA             int8          
 12  snap_TX             int8          
 13  snap_WI             int8          
 14  id                  category      
 15  item_id             category      
 16  dept_id             category      
 17  cat_id              category      
 18  store_id            category      
 19  state_id            category      
 20  

In [6]:
# Print the first five rows of `df` rendered as a Markdown table.
print(df.head(n=5).to_markdown())

|    | date                |   wm_yr_wk | weekday   |   wday |   month |   year | d   | event_name_1   | event_type_1   | event_name_2   | event_type_2   |   snap_CA |   snap_TX |   snap_WI | id                            | item_id       | dept_id   | cat_id   | store_id   | state_id   |   sales_count |   sell_price |   day_of_week |   week_of_year |   is_weekend |   sales_lag_28 |   sales_lag_30 |   sales_lag_120 |   sales_lag_365 |   price_change |   price_vs_month_avg |
|---:|:--------------------|-----------:|:----------|-------:|--------:|-------:|:----|:---------------|:---------------|:---------------|:---------------|----------:|----------:|----------:|:------------------------------|:--------------|:----------|:---------|:-----------|:-----------|--------------:|-------------:|--------------:|---------------:|-------------:|---------------:|---------------:|----------------:|----------------:|---------------:|---------------------:|
|  0 | 2011-01-29 00:00:00 |      11101 | Sa

In [12]:
# LightGBM baseline trained on a single time-based hold-out split
# (no cross-validation).
#
# Prerequisite: `df` must already be loaded by the data-loading cell.
# Only the imports this cell itself uses are repeated here; the plotting
# setup and the Drive mount are done once in the first cell.
import gc

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

TRAIN_START = '2014-01-01'   # subsample: keep rows dated on/after this day
VAL_DAYS = 28                # hold out the most recent 28 calendar days
NUM_BOOST_ROUND = 1500       # upper bound on boosting rounds
EARLY_STOPPING_ROUNDS = 50   # stop after this many rounds without improvement


# -----------------------------------------------------
# 1. Define Features and Target
# -----------------------------------------------------
# The per-day identifier 'd' is deliberately NOT a feature: every hold-out
# day carries a 'd' value that never occurs in the training rows, so splits
# on it can only memorise individual training days. Calendar information is
# already provided by wday / month / year / week_of_year.
features = [
    'wday', 'month', 'year', 'event_name_1', 'event_type_1',
    'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'sell_price',
    'day_of_week', 'week_of_year', 'is_weekend', 'sales_lag_28',
    'sales_lag_30', 'sales_lag_120', 'sales_lag_365', 'price_change',
    'price_vs_month_avg'
]

cat_feats = [
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
]

target = 'sales_count'


# -----------------------------------------------------
# 2. Subsample the Data
# Only rows from TRAIN_START onwards are used to keep a single run
# manageable. Columns are selected directly from `df`, so no intermediate
# full-width copy of the subsample is created.
# -----------------------------------------------------
print("Subsampling data for model training...")
in_window = df['date'] >= TRAIN_START
dates = df.loc[in_window, 'date']   # kept only to build the time split
X = df.loc[in_window, features]
y = df.loc[in_window, target]
del in_window
gc.collect()

print(f"Shape of data for model training: {X.shape}")


# -----------------------------------------------------
# 3. Perform a single Train/Validation Split
# The last VAL_DAYS calendar days form the validation set. The split is
# made on the date itself, so it does not depend on the row order of `df`.
# -----------------------------------------------------
val_start = dates.max() - pd.Timedelta(days=VAL_DAYS - 1)
is_val = (dates >= val_start).to_numpy()
del dates

if not is_val.any() or is_val.all():
    raise ValueError(
        f"Cannot hold out the last {VAL_DAYS} days: the subsample starting at "
        f"{TRAIN_START} leaves an empty training or validation set."
    )

X_train, X_val = X[~is_val], X[is_val]
y_train, y_val = y[~is_val], y[is_val]

# `n_items` / `val_size` are kept as notebook globals (later cells may read
# them) and double as a sanity check on the split.
n_items = len(df['id'].cat.categories)
val_size = int(is_val.sum())
if val_size != VAL_DAYS * n_items:
    print(f"Warning: validation has {val_size} rows, expected "
          f"{VAL_DAYS * n_items} ({VAL_DAYS} days x {n_items} series).")

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}")


# -----------------------------------------------------
# 4. Train the LightGBM Model
# -----------------------------------------------------
print("\nStarting LightGBM model training on a single split...")

# free_raw_data=False keeps the pandas frames attached to the Datasets.
# The validation set is tied to the training set via `reference`.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_feats,
                         free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_feats,
                       reference=train_data, free_raw_data=False)

# Define model parameters. The number of boosting rounds is passed to
# lgb.train() as `num_boost_round` instead of the sklearn-style
# `n_estimators` key inside params.
params = {
    'objective': 'tweedie',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'seed': 42,
    'n_jobs': -1,
    'num_leaves': 128,
    'max_bin': 127,
    'boosting_type': 'gbdt',
    'verbose': -1,
}

# Train the model; early stopping monitors RMSE on the validation set.
model = lgb.train(
    params,
    train_data,
    num_boost_round=NUM_BOOST_ROUND,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)],
)

# -----------------------------------------------------
# 5. Report Validation Score
# -----------------------------------------------------
# Make predictions and calculate RMSE for this single run
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))

print(f"\nFinished single model training.")
print(f"Validation RMSE for the hold-out set: {rmse:.4f}")

# Clean up memory
del X_train, X_val, y_train, y_val, train_data, val_data, preds, is_val
gc.collect()



Starting Time Series Cross-Validation...
--- FOLD 1 ---
Train size: 56619930, Validation size: 853720
Fold 1 RMSE: 2.3497320608391883

--- FOLD 2 ---
Train size: 57473650, Validation size: 853720


KeyboardInterrupt: 

In [1]:
# Ridge regression baseline on the post-2014 subsample.
#
# Prerequisite: `df` must already be loaded by the data-loading cell; on a
# fresh kernel this cell raises NameError until that cell has been run.
import gc # For memory management

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit # Not used directly in this version

# Define feature columns (X) and target column (y)
target = 'sales_count'
categorical_features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                        'weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
numeric_features = ['wday', 'month', 'year',
                    'snap_CA', 'snap_TX', 'snap_WI',
                    'sell_price', 'day_of_week', 'week_of_year', 'is_weekend', 'sales_lag_28',
                    'sales_lag_30', 'sales_lag_120', 'sales_lag_365', 'price_change',
                    'price_vs_month_avg']

# Combine all features
features = categorical_features + numeric_features

# --- Subsample the Data ---
# To make Ridge Regression feasible on a large dataset, use data from 2014
# onwards, similar to the LGBM model.
# Rows with missing values are dropped on the local subsample only; the
# shared `df` is NOT modified, so this cell can be re-run and other cells
# keep seeing the same data. Filtering first and dropping NaNs afterwards
# selects the same rows as dropping NaNs on the full frame first.
# NOTE(review): dropna() looks at every column. If the event_* columns hold
# NaN on non-event days, this removes most rows -- confirm they were filled
# during preprocessing.
print("Subsampling data for Ridge Regression model training...")
df_ridge = df[df['date'] >= '2014-01-01'].dropna()

# The 'date' column is still present here; it is not selected as a feature.
print(f"Shape of subsampled data for Ridge Regression: {df_ridge.shape}")

X = df_ridge[features]
y = df_ridge[target]

# Clean up df_ridge to save memory
del df_ridge
gc.collect()


# Create a preprocessor
# - Categorical features: one-hot encoded into a SPARSE matrix. A dense
#   encoding of item_id and the other categoricals over this many rows does
#   not fit in memory; Ridge accepts sparse input directly.
# - Numeric features: standardised, so the L2 penalty treats columns on very
#   different scales (e.g. 'year' vs. 'is_weekend') comparably.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_features),
        ('num', StandardScaler(), numeric_features)
    ],
    remainder='drop'  # Drop any columns not specified
)

# Create the model pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', Ridge(alpha=1.0, random_state=42))])

# --- Train on the entire (subsampled) dataset ---
print("Training Ridge Regression model on the subsampled dataset...")
print("Fitting pipeline...")
model.fit(X, y)

# Predict on the training data itself (this is a training-set score, not a
# hold-out estimate).
print("Evaluating model on training data...")
# Sales counts cannot be negative, so negative predictions are clipped to 0.
train_preds = np.clip(model.predict(X), 0, None)

# Calculate RMSE on the training data
rmse = np.sqrt(mean_squared_error(y, train_preds))
print(f"Training RMSE: {rmse:.4f}")

print("\n--- Training Complete ---")


NameError: name 'df' is not defined