In [None]:
import uuid

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Set random seeds for reproducibility.
# Seeding NumPy alone leaves TensorFlow unseeded, so weight initialisation and
# dropout would still differ between runs. set_random_seed seeds Python's
# `random`, NumPy and TensorFlow in one call.
tf.keras.utils.set_random_seed(42)

# Load the dataset from file path
df = pd.read_csv('/content/Walmart.csv')

# --- Exploratory Data Analysis (EDA) ---
# 1. Display basic info and summary statistics
print("Dataset Info:")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
print("\nSummary Statistics:")
print(df.describe())

# 2. Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert Date to datetime and extract temporal features
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
# Every shift/rolling feature below assumes rows are chronological within each
# store, so establish that order before deriving them.
df = df.sort_values(by=['Store', 'Date']).reset_index(drop=True)
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Is_Holiday_Week'] = df['Holiday_Flag'].astype(int)


# 4. Add Holiday_Season and lagged holiday flags
def _holiday_season(flags):
    """Return a 0/1 Series marking the weeks around holidays for ONE store.

    A row is a "trigger" when it, the previous row, or the next row has
    Holiday_Flag == 1. Each trigger at position i marks positions i-2 .. i+1.
    The start is clamped at 0: an unclamped negative start turns the slice
    into an empty one, which silently skipped the first rows.
    """
    is_holiday = flags.to_numpy() == 1
    trigger = is_holiday.copy()
    trigger[1:] |= is_holiday[:-1]   # previous row is a holiday
    trigger[:-1] |= is_holiday[1:]   # next row is a holiday
    season = np.zeros(len(flags), dtype=int)
    for i in np.flatnonzero(trigger):
        season[max(i - 2, 0):i + 2] = 1
    return pd.Series(season, index=flags.index)


# Computed per store so a holiday at the start of one store's history cannot
# flag the last rows of the preceding store.
df['Holiday_Season'] = df.groupby('Store')['Holiday_Flag'].transform(_holiday_season)
for lag in range(1, 4):
    df[f'Lag_Holiday_{lag}'] = df.groupby('Store')['Holiday_Flag'].shift(lag).fillna(0)

# 5. Add Weekly_Sales_Diff, Rolling_Mean_26, and Rolling_Std_26
# All three are built from the previous week's sales (shift(1)) so the current
# week's target never leaks into its own features.
prior_sales = df.groupby('Store')['Weekly_Sales'].shift(1)
prior_by_store = prior_sales.groupby(df['Store'])
df['Weekly_Sales_Diff'] = prior_by_store.diff().fillna(0)
# The rolling windows are evaluated inside each store. Rolling over the
# ungrouped shifted series let a store's first 25 windows include the tail of
# the previous store's sales.
df['Rolling_Mean_26'] = prior_by_store.transform(
    lambda s: s.rolling(window=26, min_periods=1).mean()
)
df['Rolling_Std_26'] = prior_by_store.transform(
    lambda s: s.rolling(window=26, min_periods=1).std()
).fillna(0)

# --- Exploratory Data Analysis (EDA) Visualizations ---
# Visualization 1: Correlation Heatmap
# Pairwise correlations between the target and the raw/engineered numeric columns.
heatmap_columns = [
    'Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI',
    'Unemployment', 'Month', 'Year', 'Holiday_Season',
    'Rolling_Mean_26', 'Rolling_Std_26',
]
correlation_matrix = df[heatmap_columns].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Features')
plt.savefig('/content/correlation_heatmap.png')
plt.close()

# Visualization 2: Weekly Sales Trend per Store
# Only the first five store ids (in order of appearance) are drawn to keep the chart readable.
stores_to_plot = df['Store'].unique()[:5]
plt.figure(figsize=(12, 6))
for store in stores_to_plot:
    store_data = df.loc[df['Store'] == store]
    plt.plot(store_data['Date'], store_data['Weekly_Sales'], label=f'Store {store}')
plt.xlabel('Date')
plt.ylabel('Weekly Sales')
plt.title('Weekly Sales Trend by Store')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.savefig('/content/weekly_sales_trend.png')
plt.close()

# --- Data Preprocessing ---
# Sort by Store and Date
df = df.sort_values(by=['Store', 'Date'])

# Create lagged features (previous 1 to 8 weeks' sales)
for lag in range(1, 9):
    df[f'Lag_{lag}'] = df.groupby('Store')['Weekly_Sales'].shift(lag)
# Rows without a full lag history contain NaN and are dropped. The index is
# reset so row labels are 0..n-1 again.
df = df.dropna().reset_index(drop=True)

# Normalize Store feature
df['Store_Norm'] = MinMaxScaler().fit_transform(df[['Store']])

# Select features and target.
# The column order defines the feature order seen by the model; keep it stable.
numeric_features = (
    ['Store_Norm', 'Holiday_Flag', 'Is_Holiday_Week', 'Holiday_Season']
    + [f'Lag_Holiday_{lag}' for lag in range(1, 4)]
    + ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Month', 'Year',
       'Weekly_Sales_Diff', 'Rolling_Mean_26', 'Rolling_Std_26']
    + [f'Lag_{lag}' for lag in range(1, 9)]
)
target = 'Weekly_Sales'

# Prepare features and target
X = df[numeric_features].reset_index(drop=True)
y = df[[target]].reset_index(drop=True)

# Normalize features and target with SEPARATE scalers.
# Refitting one scaler on y discarded the feature min/max, so new feature rows
# could no longer be scaled consistently (e.g. at inference time).
# `scaler` stays the scaler fitted on the target, which is what it ended up
# being before, so downstream inverse transforms are unaffected.
# NOTE(review): both scalers are fit on all rows, before any train/test split.
feature_scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = feature_scaler.fit_transform(X)
scaler = MinMaxScaler(feature_range=(0, 1))
y_scaled = scaler.fit_transform(y)
target_scaler = scaler  # explicit alias for readability

# Create sequences for LSTM
def create_sequences(X, y, time_steps=52, store_ids=None):
    """Build per-store sliding windows for sequence models.

    For every store, each run of ``time_steps`` consecutive rows of ``X``
    becomes one input sample, and the ``y`` row immediately after that run
    becomes its target. Windows never span two stores.

    Args:
        X: 2-D array of features, one row per observation.
        y: Array of targets aligned row-for-row with ``X``.
        time_steps: Number of consecutive rows in each input window.
        store_ids: Optional 1-D array-like giving the store of each row.
            When omitted, the module-level ``df['Store']`` column is used,
            which keeps existing three-argument calls working.

    Returns:
        Tuple ``(Xs, ys)`` of NumPy arrays: the stacked windows and their
        targets. Stores are visited in ascending id order.

    Raises:
        ValueError: If ``X``, ``y`` and ``store_ids`` differ in length.
    """
    if store_ids is None:
        # Backward-compatible default: fall back to the global frame.
        store_ids = df['Store'].to_numpy()
    store_ids = np.asarray(store_ids)
    if not (len(store_ids) == len(X) == len(y)):
        raise ValueError(
            "X, y and store_ids must have the same number of rows; got "
            f"{len(X)}, {len(y)} and {len(store_ids)}"
        )

    Xs, ys = [], []
    for store in np.unique(store_ids):
        # Positional row numbers, so the result does not depend on any
        # DataFrame index labels matching array positions.
        rows = np.flatnonzero(store_ids == store)
        X_store = X[rows]
        y_store = y[rows]
        for start in range(len(X_store) - time_steps):
            end = start + time_steps
            Xs.append(X_store[start:end])
            ys.append(y_store[end])
    return np.array(Xs), np.array(ys)

# Window length: each sample covers this many consecutive rows of one store.
time_steps = 52
X_seq, y_seq = create_sequences(X_scaled, y_scaled, time_steps)

# Split into train and test sets (80-20)
# NOTE(review): the split is positional. The sequences come back grouped by
# store, so the test portion is the tail of that ordering, not the most
# recent weeks of every store. Confirm this is intended.
n_sequences = len(X_seq)
train_size = int(0.8 * n_sequences)
test_size = n_sequences - train_size

X_train, X_test = X_seq[:train_size], X_seq[train_size:]
y_train, y_test = y_seq[:train_size], y_seq[train_size:]

print(f"\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")

# --- Build and Train LSTM Model ---
# Architecture: one bidirectional LSTM followed by two stacked LSTMs, each with
# batch normalisation and dropout, then a small dense head with one output.
model = Sequential([
    # Declare the input shape with an explicit Input layer. Passing
    # `input_shape` to the LSTM wrapped in Bidirectional triggers a Keras
    # warning and leaves the model unbuilt when summary() is called.
    tf.keras.Input(shape=(time_steps, X_train.shape[2])),
    Bidirectional(LSTM(64, activation='tanh', return_sequences=True)),
    BatchNormalization(),
    Dropout(0.35),
    LSTM(32, activation='tanh', return_sequences=True),
    BatchNormalization(),
    Dropout(0.35),
    LSTM(32, activation='tanh'),
    BatchNormalization(),
    Dropout(0.35),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer=Adam(learning_rate=0.0005), loss='mse')
model.summary()

# Early stopping monitors the TRAINING loss because fit() receives no validation data.
# NOTE(review): training loss cannot detect overfitting. Consider holding out
# validation data and monitoring 'val_loss' instead.
early_stopping = EarlyStopping(monitor='loss', patience=15, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# --- Evaluate the Model ---
# Predict on test set
y_pred = model.predict(X_test)

# Map scaled values back to sales units.
# `scaler` was last fitted on the single target column, so it is applied to
# the one-column arrays directly. The earlier zero-padding to the feature
# count only worked because NumPy broadcast the scaler's one-element
# parameters across every padded column.
y_test_inv = scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1))
y_pred_inv = scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1))

# Calculate test metrics (last column == the only column == Weekly_Sales)
actual_sales = y_test_inv[:, -1]
predicted_sales = y_pred_inv[:, -1]
mae = mean_absolute_error(actual_sales, predicted_sales)
rmse = np.sqrt(mean_squared_error(actual_sales, predicted_sales))
r2 = r2_score(actual_sales, predicted_sales)

print("\nTest Set Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")
print(f"R-squared (R²): {r2:.4f}")

# Save metrics to file.
# The text contains a non-ASCII character ("²"), so the encoding is set
# explicitly instead of relying on the platform default.
with open('/content/evaluation_metrics.txt', 'w', encoding='utf-8') as f:
    f.write(f"Test Set - Mean Absolute Error (MAE): ${mae:.2f}\n")
    f.write(f"Test Set - Root Mean Squared Error (RMSE): ${rmse:.2f}\n")
    f.write(f"Test Set - R-squared (R²): {r2:.4f}\n")

# Plot training history
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.title('Model Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.savefig('/content/training_loss.png')
plt.close()

# Plot predictions vs actuals (Test set, first 100 and full)
# Slicing with [:100] yields fewer points when the test set is smaller.
plt.figure(figsize=(12, 6))
plt.plot(y_test_inv[:100, -1], label='Actual Sales (First 100)')
plt.plot(y_pred_inv[:100, -1], label='Predicted Sales (First 100)')
plt.title('Actual vs Predicted Weekly Sales (First 100 Test Samples)')
plt.xlabel('Sample')
plt.ylabel('Weekly Sales')
plt.legend()
plt.savefig('/content/predictions_vs_actuals_100.png')
plt.close()

plt.figure(figsize=(12, 6))
plt.plot(y_test_inv[:, -1], label='Actual Sales (Full)')
plt.plot(y_pred_inv[:, -1], label='Predicted Sales (Full)')
plt.title('Actual vs Predicted Weekly Sales (Full Test Set)')
plt.xlabel('Sample')
plt.ylabel('Weekly Sales')
plt.legend()
plt.savefig('/content/predictions_vs_actuals_full.png')
plt.close()

# Residual plot
residuals = y_test_inv[:100, -1] - y_pred_inv[:100, -1]
plt.figure(figsize=(12, 6))
# The x positions follow the actual number of residuals. A fixed range(100)
# makes scatter() raise when the test set has fewer than 100 samples.
plt.scatter(range(len(residuals)), residuals, c='blue', label='Residuals')
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residuals of Predictions (First 100 Test Samples)')
plt.xlabel('Sample')
plt.ylabel('Residual (Actual - Predicted)')
plt.legend()
plt.savefig('/content/residuals.png')
plt.close()

# Save the model
# NOTE(review): '.h5' selects the HDF5 format. Confirm it is still the
# desired on-disk format for the installed Keras version.
model.save('/content/walmart_lstm_model_optimized.h5')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB
None

Summary Statistics:
             Store  Weekly_Sales  Holiday_Flag  Temperature   Fuel_Price  \
count  6435.000000  6.435000e+03   6435.000000  6435.000000  6435.000000   
mean     23.000000  1.046965e+06      0.069930    60.663782     3.358607   
std      12.988182  5.643666e+05      0.255049    18.444933     0.459020   
min       1.000000  2.099862e+05      0.000000    -2.060

  super().__init__(**kwargs)


Epoch 1/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 0.8737
Epoch 2/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - loss: 0.2820
Epoch 3/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - loss: 0.1782
Epoch 4/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.1168
Epoch 5/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.0889
Epoch 6/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - loss: 0.0765
Epoch 7/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - loss: 0.0676
Epoch 8/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.0555
Epoch 9/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 0.0489
Epoch 10/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - lo

