In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import joblib

In [3]:
# Function to load datasets
def load_data():
    sales_train_df = pd.read_csv('/Users/bananavodka/Projects/at2_mla/at2_mla/data/raw/sales_train.csv')
    calendar_df = pd.read_csv('/Users/bananavodka/Projects/at2_mla/at2_mla/data/raw/calendar.csv')
    calendar_events_df = pd.read_csv('/Users/bananavodka/Projects/at2_mla/at2_mla/data/raw/calendar_events.csv')
    sales_test_df = pd.read_csv('/Users/bananavodka/Projects/at2_mla/at2_mla/data/raw/sales_test.csv')
    items_weekly_sales_df = pd.read_csv('/Users/bananavodka/Projects/at2_mla/at2_mla/data/raw/items_weekly_sell_prices.csv')
    
    print("Datasets loaded successfully.")
    return sales_train_df, calendar_df, calendar_events_df, sales_test_df, items_weekly_sales_df

In [4]:
# Function to preprocess and merge datasets
def preprocess_data(sales_train_df, calendar_df, calendar_events_df, items_weekly_sales_df):
    # Melt sales data
    sales_train_melted = pd.melt(
        sales_train_df,
        id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
        var_name='d',
        value_name='sales'
    )
    
    # Merge with calendar data
    sales_train_merged = pd.merge(sales_train_melted, calendar_df, how='left', on='d')

    # Merge with event data
    sales_train_with_events = pd.merge(sales_train_merged, calendar_events_df, how='left', on='date')

    # Merge with item prices
    sales_train_with_prices = pd.merge(sales_train_with_events, items_weekly_sales_df, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])

    # Fill missing event names and types
    sales_train_with_prices['event_name'].fillna('No Event', inplace=True)
    sales_train_with_prices['event_type'].fillna('None', inplace=True)

    # Forward-fill missing prices
    sales_train_with_prices['sell_price'].fillna(method='ffill', inplace=True)
    sales_train_with_prices['sell_price'].fillna(
        sales_train_with_prices.groupby('item_id')['sell_price'].transform('mean'), 
        inplace=True
    )

    print("Data preprocessed successfully.")
    return sales_train_with_prices

In [5]:
# Function to create features
def create_features(sales_train_with_prices):
    # Convert 'date' to datetime
    sales_train_with_prices['date'] = pd.to_datetime(sales_train_with_prices['date'])

    # Create date-based features
    sales_train_with_prices['day'] = sales_train_with_prices['date'].dt.day
    sales_train_with_prices['month'] = sales_train_with_prices['date'].dt.month
    sales_train_with_prices['weekday'] = sales_train_with_prices['date'].dt.weekday

    # Label encoding
    encoder_store = LabelEncoder()
    encoder_item = LabelEncoder()
    sales_train_with_prices['store_id'] = encoder_store.fit_transform(sales_train_with_prices['store_id'])
    sales_train_with_prices['item_id'] = encoder_item.fit_transform(sales_train_with_prices['item_id'])

    # Initialize label encoders for event names and types
    le_event_name = LabelEncoder()
    le_event_type = LabelEncoder()
    sales_train_with_prices['event_name_encoded'] = le_event_name.fit_transform(sales_train_with_prices['event_name'])
    sales_train_with_prices['event_type_encoded'] = le_event_type.fit_transform(sales_train_with_prices['event_type'])

    print("Features created successfully.")
    return sales_train_with_prices, encoder_store, encoder_item, le_event_name, le_event_type

In [6]:

# Function to define features and split data
def define_and_split_features(sales_train_with_prices):
    features = ['store_id', 'item_id', 'day', 'month', 'weekday', 'sell_price', 'event_name_encoded', 'event_type_encoded']
    X = sales_train_with_prices[features]
    y = sales_train_with_prices['sales']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    print("Data split into training and testing sets.")
    return X_train, X_test, y_train, y_test

In [7]:
# Function to train Ridge model
def train_ridge(X_train, y_train):
    ridge_model = Ridge()
    ridge_model.fit(X_train, y_train)
    print("Ridge model trained successfully.")
    return ridge_model

In [8]:
# Function to evaluate model
def evaluate_model(model, X_test, y_test, model_name="Model"):
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"Test RMSE with {model_name}: {rmse}")
    return rmse

In [9]:
# Function to train XGBoost model
def train_xgboost(X_train, y_train):
    xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    print("XGBoost model trained successfully.")
    return xgb_model

In [10]:
# Main function to execute the entire pipeline
def main():
    # Load data
    sales_train_df, calendar_df, calendar_events_df, sales_test_df, items_weekly_sales_df = load_data()

    # Preprocess data
    sales_train_with_prices = preprocess_data(sales_train_df, calendar_df, calendar_events_df, items_weekly_sales_df)

    # Create features
    sales_train_with_prices, encoder_store, encoder_item, le_event_name, le_event_type = create_features(sales_train_with_prices)

    # Define and split features
    X_train, X_test, y_train, y_test = define_and_split_features(sales_train_with_prices)

    # Train Ridge model
    ridge_model = train_ridge(X_train, y_train)

    # Evaluate Ridge model
    evaluate_model(ridge_model, X_test, y_test, model_name="Ridge Regression")
    
    # Train XGBoost model
    X_train_subset = X_train.sample(frac=0.10, random_state=42)
    y_train_subset = y_train[X_train_subset.index]
    xgb_model = train_xgboost(X_train_subset, y_train_subset)

    # Evaluate XGBoost model
    evaluate_model(xgb_model, X_test, y_test, model_name="XGBoost")

if __name__ == "__main__":
    main()

Datasets loaded successfully.
Data preprocessed successfully.
Features created successfully.
Data split into training and testing sets.
Ridge model trained successfully.
Test RMSE with Ridge Regression: 3.5442486407875036
XGBoost model trained successfully.
Test RMSE with XGBoost: 3.0138805175029355
