In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
def load_data(file_path):
    """Read the sales CSV into a DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using ``read_csv`` defaults (no date parsing).
    """
    sales_frame = pd.read_csv(file_path)
    return sales_frame

In [None]:
def preprocess_data(df):
    """Clean the sales frame and derive calendar, lag and rolling-mean features.

    The input frame is copied first, so the caller's object is never modified.

    Steps, in order:
      1. Parse ``Date`` with ``pd.to_datetime``.
      2. Forward-fill missing values down the whole frame.
      3. Add ``Year``, ``Month``, ``Day`` and ``DayOfWeek`` derived from ``Date``.
      4. Add ``Sales_Lag_1`` .. ``Sales_Lag_4``: ``Weekly_Sales`` shifted by
         1-4 rows within each (``Store``, ``Dept``) group.
      5. Add ``Sales_Rolling_Mean_7`` and ``Sales_Rolling_Mean_28``: trailing
         per-group means of ``Weekly_Sales`` over up to 7 / 28 rows, current
         row included (``min_periods=1``).
      6. Drop every row that still holds a NaN in any column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Store``, ``Dept`` and
        ``Weekly_Sales``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns added and NaN rows removed.
    """
    # Copy up front: the column assignments below would otherwise write
    # straight into the frame the caller passed in.
    df = df.copy()

    # Convert date to datetime
    df['Date'] = pd.to_datetime(df['Date'])

    # Handle missing values. DataFrame.ffill() is the supported spelling of the
    # deprecated fillna(method='ffill').
    # NOTE(review): the fill runs down the whole frame, so a gap at the start of
    # one Store/Dept group is filled from the previous group's last row.
    df = df.ffill()

    # Calendar features
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek

    # Grouping is identical for every lag / window, so build it once.
    # NOTE(review): shift() and rolling() follow row order; this assumes rows
    # are already in chronological order within each Store/Dept -- confirm.
    sales_by_group = df.groupby(['Store', 'Dept'])['Weekly_Sales']

    # Lag features: NaN for the first `lag` rows of each group.
    for lag in [1, 2, 3, 4]:
        df[f'Sales_Lag_{lag}'] = sales_by_group.shift(lag)

    # Rolling mean features (window counted in rows, current row included).
    for window in [7, 28]:
        df[f'Sales_Rolling_Mean_{window}'] = sales_by_group.transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())

    # Drop rows with NaN values after feature engineering (any column).
    df = df.dropna()

    return df

In [None]:
def normalize_data(df, scaler=None):
    """Scale the sales and engineered sales columns with a Min-Max scaler.

    The columns scaled are ``Weekly_Sales``, ``Sales_Lag_1`` .. ``Sales_Lag_4``,
    ``Sales_Rolling_Mean_7`` and ``Sales_Rolling_Mean_28``; every other column
    is passed through unchanged. The input frame is copied, so the caller's
    object keeps its unscaled values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns listed above.
    scaler : object, optional
        An already-fitted scaler exposing ``transform``. When given, it is
        applied as-is (no refitting), which lets a scaler fitted on the
        training split be reused on validation / test data. When omitted, a
        new ``MinMaxScaler`` is created and fitted on ``df``.

    Returns
    -------
    tuple
        ``(scaled_df, scaler)`` -- the scaled copy of ``df`` and the scaler
        that produced it.
    """
    numerical_columns = ['Weekly_Sales', 'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4',
                         'Sales_Rolling_Mean_7', 'Sales_Rolling_Mean_28']

    # Copy so the assignment below neither mutates the caller's frame nor
    # writes into a frame that is itself a slice of another one.
    scaled = df.copy()

    if scaler is None:
        scaler = MinMaxScaler()
        scaled[numerical_columns] = scaler.fit_transform(scaled[numerical_columns])
    else:
        # Caller supplied a fitted scaler: apply it without refitting.
        scaled[numerical_columns] = scaler.transform(scaled[numerical_columns])

    return scaled, scaler

In [None]:

def split_data(df, test_size=0.15, val_size=0.15):
    """Split the frame into train, validation and test sets without shuffling.

    Rows keep their existing order. The test set is cut from the end of
    ``df``; the validation set is then cut from the end of what remains.

    NOTE(review): the result is only a chronological split if ``df`` is
    already ordered by date -- confirm against the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    test_size : float or int, optional
        Passed to ``train_test_split`` for the first cut (default ``0.15``,
        i.e. 15% of all rows).
    val_size : float or int, optional
        Passed to ``train_test_split`` for the second cut, so a float is a
        fraction of the rows left after removing the test set, not of the
        whole frame (default ``0.15``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``.
    """
    train_val, test = train_test_split(df, test_size=test_size, shuffle=False)
    train, val = train_test_split(train_val, test_size=val_size, shuffle=False)
    return train, val, test


In [None]:
# Run the pipeline front to back: read the raw CSV, engineer features, then
# scale. normalize_data hands back both the scaled frame and its scaler.
# NOTE(review): the scaler is fitted before the split below, so it sees
# validation/test rows -- confirm this is intended.
df, scaler = (
    load_data('data/walmart_sales.csv')
    .pipe(preprocess_data)
    .pipe(normalize_data)
)

# Carve the scaled frame into train / validation / test partitions.
train, val, test = split_data(df)

# Persist each partition as CSV, leaving the index out of the file.
for _partition, _csv_path in (
    (train, 'data/train.csv'),
    (val, 'data/val.csv'),
    (test, 'data/test.csv'),
):
    _partition.to_csv(_csv_path, index=False)

print("Data preprocessing completed.")