In [1]:
# LSTM Forecasting Pipeline (Final Version with CSV-safe Preprocessing)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from sklearn.model_selection import train_test_split
import os


2025-07-23 22:21:10.747732: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-23 22:21:10.747824: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-23 22:21:10.847766: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-23 22:21:11.041200: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# -------------------------
# 1. Load Data
# -------------------------
train_path = 'train.csv'
test_path = 'test.csv'

# Fail fast and name exactly which input file is missing, together with the
# directory the relative paths are resolved against.
missing_paths = [path for path in (train_path, test_path) if not os.path.exists(path)]
if missing_paths:
    raise FileNotFoundError(
        "Check if the train and test CSV paths are correct. "
        f"Missing: {', '.join(missing_paths)} (working directory: {os.getcwd()})"
    )

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


In [3]:

# -------------------------
# 2. Preprocessing Pipeline
# -------------------------
def preprocess(df, is_train=True):
    """Add calendar features and, optionally, one-hot encode store and item.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column that ``pd.to_datetime`` can parse. When
        ``is_train`` is true it must also contain 'store' and 'item' columns.
    is_train : bool, default True
        When true, 'store' and 'item' are replaced by ``store_*`` / ``item_*``
        dummy columns appended at the end of the frame. When false, those
        columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'date' converted to datetime and the added columns
        'year', 'month', 'day', 'dayofweek' and 'is_weekend'. The input frame
        is not modified.
    """
    out = df.copy()
    out['date'] = pd.to_datetime(out['date'])

    parts = out['date'].dt
    out['year'] = parts.year
    out['month'] = parts.month
    out['day'] = parts.day
    out['dayofweek'] = parts.dayofweek
    # dayofweek is 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
    # Vectorised comparison instead of a per-row Python lambda.
    out['is_weekend'] = (out['dayofweek'] >= 5).astype('int64')

    if is_train:
        store_dummies = pd.get_dummies(out['store'], prefix='store')
        item_dummies = pd.get_dummies(out['item'], prefix='item')
        # Drop the raw id columns without mutating in place, then append the
        # dummies so the column order is: original columns, store_*, item_*.
        out = pd.concat(
            [out.drop(columns=['store', 'item']), store_dummies, item_dummies],
            axis=1,
        )
    return out

# Both frames take the encoding path (is_train=True) so the test frame also
# gets its store_* / item_* dummy columns.
train_df = preprocess(train_df, is_train=True)
test_df = preprocess(test_df, is_train=True)

# Feature columns present in train but absent from test.
train_cols = set(train_df.columns) - {'sales'}
test_missing = sorted(train_cols.difference(test_df.columns))

# Put test on train's feature layout in one step: same column order as train
# (minus the target), columns missing from test filled with 0, and any column
# that exists only in test dropped.
test_df = test_df.reindex(
    columns=[name for name in train_df.columns if name != 'sales'],
    fill_value=0,
)

In [None]:


# -------------------------
# 3. Prepare Sequences
# -------------------------
lookback = 30

scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

train_features = train_df.drop(columns=['date', 'sales'])
train_target = train_df['sales'].values.reshape(-1, 1)

# One sample needs `lookback` history rows plus the row being predicted.
# Without this check a too-short frame yields an empty array that only fails
# later, when the model reads X_train.shape[1].
if len(train_features) <= lookback:
    raise ValueError(
        f"Need more than {lookback} training rows to build sequences, "
        f"got {len(train_features)}."
    )

scaled_features = scaler_x.fit_transform(train_features)
scaled_target = scaler_y.fit_transform(train_target)

# Sample k is the feature rows [k, k + lookback) and its label is the target
# at row k + lookback. The last feature row never starts or ends a window,
# hence the [:-1]. sliding_window_view puts the window axis last, so it is
# swapped to get (samples, lookback, features).
# NOTE(review): windows run over consecutive rows of the frame as loaded. If
# the frame stacks several store/item series, windows near a series boundary
# mix rows from two series - confirm the row ordering or window per series.
feature_windows = np.lib.stride_tricks.sliding_window_view(
    scaled_features[:-1], lookback, axis=0
)
X_train = np.ascontiguousarray(feature_windows.transpose(0, 2, 1))
y_train = scaled_target[lookback:].copy()

In [None]:


# -------------------------
# 4. Build & Train LSTM Model
# -------------------------
# Seed Python's random module, NumPy and TensorFlow together so weight
# initialisation and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Single LSTM layer over (lookback, n_features) windows, followed by a linear
# output unit that predicts the scaled sales value.
# NOTE(review): a non-default LSTM activation ('relu') is kept as in the
# original; confirm it is intentional.
model = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')

# validation_split=0.2 holds out the trailing 20% of the samples; the returned
# history keeps the per-epoch train/validation loss for later inspection.
history = model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2, verbose=1)

# -------------------------
# 5. Predict on Test Data
# -------------------------
# Scale test features with the scaler that was fitted on the training features.
scaled_test = scaler_x.transform(test_df.drop(columns=['date']))

# Each prediction needs `lookback` preceding test rows; fail clearly instead
# of handing an empty array to model.predict.
if len(scaled_test) <= lookback:
    raise ValueError(
        f"Need more than {lookback} test rows to build sequences, "
        f"got {len(scaled_test)}."
    )

# Window k is the test rows [k, k + lookback) and predicts row k + lookback.
# sliding_window_view puts the window axis last, so it is swapped to get
# (samples, lookback, features).
test_windows = np.lib.stride_tricks.sliding_window_view(
    scaled_test[:-1], lookback, axis=0
)
X_test = np.ascontiguousarray(test_windows.transpose(0, 2, 1))

pred_scaled = model.predict(X_test)
pred_sales = scaler_y.inverse_transform(pred_scaled).flatten()

# Fill predictions into test_df. The first `lookback` rows have no complete
# history window and therefore receive no prediction.
test_result = test_df.iloc[lookback:].copy()
test_result['sales_predicted'] = pred_sales

# -------------------------
# 6. Visualization
# -------------------------
# Sales over time for every row of the training frame, ordered by date
plt.figure(figsize=(15, 4))
sample_df = train_df[['date', 'sales']].copy()
sample_df['date'] = pd.to_datetime(sample_df['date'])
sample_df = sample_df.sort_values('date')
# Take y from the same sorted frame as x so each sales value stays paired with
# its own date (an array in the original row order would not be).
plt.plot(sample_df['date'], sample_df['sales'], label='Actual Sales')
plt.title("Sales Over Time")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Validation Plot
# Chronological (unshuffled) split: the trailing 20% of the samples is the
# validation set. This is the same tail (up to rounding of the split index)
# that validation_split=0.2 kept out of the weight updates when the model was
# trained, so the model is evaluated as-is. Fitting it again here would change
# the model after the test predictions were produced, and the plot would then
# describe a different model than the one behind pred_sales.
X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)
y_val_pred = model.predict(X_val)

plt.figure(figsize=(10, 4))
# Convert both series back to sales units before plotting.
plt.plot(scaler_y.inverse_transform(y_val[-100:]), label='Actual')
plt.plot(scaler_y.inverse_transform(y_val_pred[-100:]), label='Predicted')
plt.title("Validation: Actual vs Predicted Sales (last 100 points)")
plt.xlabel("Time Step")
plt.ylabel("Sales")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Feature Correlation
# Pairwise correlation between the calendar features and the sales target,
# drawn on an explicitly created axes.
numeric_cols = ['year', 'month', 'day', 'dayofweek', 'is_weekend', 'sales']
corr = train_df.loc[:, numeric_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title("Feature Correlation Heatmap")
corr_fig.tight_layout()
plt.show()

# Prediction Distribution
# Histogram (50 bins) with a KDE overlay of the predicted test-set sales,
# drawn on an explicitly created axes.
hist_fig, hist_ax = plt.subplots(figsize=(6, 4))
sns.histplot(pred_sales, bins=50, kde=True, ax=hist_ax)
hist_ax.set_title("Predicted Sales Distribution (Test Set)")
hist_ax.set_xlabel("Predicted Sales")
hist_fig.tight_layout()
plt.show()

# Snapshot for a few Store-Item combinations (if dummies exist)
# Collect only the one-hot columns. The 'store_id' / 'item_id' label columns
# written further down share the same prefixes, so they are excluded here;
# otherwise re-running this cell would pass them to idxmax as if they were
# dummies.
derived_cols = {'store_id', 'item_id'}
store_cols = [col for col in test_result.columns
              if col.startswith('store_') and col not in derived_cols]
item_cols = [col for col in test_result.columns
             if col.startswith('item_') and col not in derived_cols]

if store_cols and item_cols:
    plt.figure(figsize=(12, 6))
    # Recover each row's id from the name of its hottest dummy column,
    # keeping everything after the prefix.
    test_result['store_id'] = test_result[store_cols].idxmax(axis=1).map(
        lambda name: name.split('_', 1)[1]
    )
    test_result['item_id'] = test_result[item_cols].idxmax(axis=1).map(
        lambda name: name.split('_', 1)[1]
    )
    # Mean prediction for the first four (store, item) pairs in order of first
    # appearance; one groupby replaces a boolean mask per pair.
    pair_means = (
        test_result
        .groupby(['store_id', 'item_id'], sort=False)['sales_predicted']
        .mean()
        .head(4)
    )
    # One plt.bar call per pair so each bar takes the next colour in the cycle.
    for (store_id, item_id), avg_pred in pair_means.items():
        plt.bar(f"Store {store_id}, Item {item_id}", avg_pred)
    plt.title("Predicted Sales for Selected Store-Item Pairs (Test)")
    plt.ylabel("Avg Predicted Sales")
    plt.tight_layout()
    plt.show()