📄 Step 1: Load and preprocess the dataset


In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [35]:
# Load data
df = pd.read_csv(r'C:\Users\admin\Downloads\archive (4)\sales_data.csv')
print(df.head())


         Date Store ID Product ID     Category Region  Inventory Level  \
0  2022-01-01     S001      P0001  Electronics  North              195   
1  2022-01-01     S001      P0002     Clothing  North              117   
2  2022-01-01     S001      P0003     Clothing  North              247   
3  2022-01-01     S001      P0004  Electronics  North              139   
4  2022-01-01     S001      P0005    Groceries  North              152   

   Units Sold  Units Ordered  Price  Discount Weather Condition  Promotion  \
0         102            252  72.72         5             Snowy          0   
1         117            249  80.16        15             Snowy          1   
2         114            612  62.94        10             Snowy          1   
3          45            102  87.63        10             Snowy          0   
4          65            271  54.41         0             Snowy          0   

   Competitor Pricing Seasonality  Epidemic  Demand  
0               85.73      Winte

In [36]:
df

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Price,Discount,Weather Condition,Promotion,Competitor Pricing,Seasonality,Epidemic,Demand
0,2022-01-01,S001,P0001,Electronics,North,195,102,252,72.72,5,Snowy,0,85.73,Winter,0,115
1,2022-01-01,S001,P0002,Clothing,North,117,117,249,80.16,15,Snowy,1,92.02,Winter,0,229
2,2022-01-01,S001,P0003,Clothing,North,247,114,612,62.94,10,Snowy,1,60.08,Winter,0,157
3,2022-01-01,S001,P0004,Electronics,North,139,45,102,87.63,10,Snowy,0,85.19,Winter,0,52
4,2022-01-01,S001,P0005,Groceries,North,152,65,271,54.41,0,Snowy,0,51.63,Winter,0,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75995,2024-01-30,S005,P0016,Toys,North,233,63,0,29.80,5,Snowy,0,32.23,Winter,0,64
75996,2024-01-30,S005,P0017,Toys,North,137,115,141,42.92,5,Snowy,0,40.73,Winter,0,137
75997,2024-01-30,S005,P0018,Clothing,North,197,44,0,17.81,10,Snowy,0,19.41,Winter,0,68
75998,2024-01-30,S005,P0019,Furniture,North,125,58,0,151.72,0,Snowy,0,143.71,Winter,0,84


✅ Step 2: Basic Data Overview


In [37]:
import pandas as pd

df = pd.read_csv(r"C:\Users\admin\Downloads\archive (4)\sales_data.csv", parse_dates=['Date'])

# Basic Info
print(df.shape)
print(df.columns)
print(df.dtypes)
print(df.head())
print(df.describe())
print(df.isnull().sum())


(76000, 16)
Index(['Date', 'Store ID', 'Product ID', 'Category', 'Region',
       'Inventory Level', 'Units Sold', 'Units Ordered', 'Price', 'Discount',
       'Weather Condition', 'Promotion', 'Competitor Pricing', 'Seasonality',
       'Epidemic', 'Demand'],
      dtype='object')
Date                  datetime64[ns]
Store ID                      object
Product ID                    object
Category                      object
Region                        object
Inventory Level                int64
Units Sold                     int64
Units Ordered                  int64
Price                        float64
Discount                       int64
Weather Condition             object
Promotion                      int64
Competitor Pricing           float64
Seasonality                   object
Epidemic                       int64
Demand                         int64
dtype: object
        Date Store ID Product ID     Category Region  Inventory Level  \
0 2022-01-01     S001      P0001  Elec

📌 Step 3: Feature Engineering 

In [38]:
import numpy as np

# Sort by date for each Store-Product combo
df = df.sort_values(by=['Store ID', 'Product ID', 'Date'])

# Time-based features
df['day_of_week'] = df['Date'].dt.dayofweek
df['week_of_year'] = df['Date'].dt.isocalendar().week
df['month'] = df['Date'].dt.month
df['year'] = df['Date'].dt.year
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Lag Features (previous values of target)
for lag in [1, 7, 28]:
    df[f'units_sold_lag_{lag}'] = df.groupby(['Store ID', 'Product ID'])['Units Sold'].shift(lag)

# Rolling window statistics
for window in [7, 28]:
    df[f'units_sold_roll_mean_{window}'] = df.groupby(['Store ID', 'Product ID'])['Units Sold'].shift(1).rolling(window).mean()
    df[f'units_sold_roll_std_{window}'] = df.groupby(['Store ID', 'Product ID'])['Units Sold'].shift(1).rolling(window).std()

# Clean missing rows caused by lag/rolling
df = df.dropna().reset_index(drop=True)

print("✅ Feature engineering done. Final shape:", df.shape)


✅ Feature engineering done. Final shape: (73200, 28)


In [39]:
df

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Price,Discount,Weather Condition,Promotion,Competitor Pricing,Seasonality,Epidemic,Demand,day_of_week,week_of_year,month,year,is_weekend,units_sold_lag_1,units_sold_lag_7,units_sold_lag_28,units_sold_roll_mean_7,units_sold_roll_std_7,units_sold_roll_mean_28,units_sold_roll_std_28
0,2022-01-29,S001,P0001,Electronics,North,263,69,0,65.00,5,Sunny,0,77.11,Winter,0,81,5,4,1,2022,1,201.0,87.0,102.0,103.000000,57.317827,90.035714,38.311087
1,2022-01-30,S001,P0001,Electronics,North,537,73,0,71.59,0,Cloudy,0,69.69,Winter,0,79,6,4,1,2022,1,69.0,109.0,71.0,100.428571,58.545871,88.857143,38.436782
2,2022-01-31,S001,P0001,Electronics,North,464,103,0,72.10,10,Cloudy,0,70.23,Winter,0,113,0,5,1,2022,0,73.0,29.0,142.0,95.285714,59.244449,88.928571,38.404213
3,2022-02-01,S001,P0001,Electronics,North,361,102,0,70.53,5,Snowy,0,66.56,Winter,0,90,1,5,2,2022,0,103.0,55.0,42.0,105.857143,51.547482,87.535714,37.092958
4,2022-02-02,S001,P0001,Electronics,North,259,94,215,60.16,20,Cloudy,1,54.96,Winter,0,94,2,5,2,2022,0,102.0,147.0,129.0,112.571429,46.647105,89.678571,36.084319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73195,2024-01-26,S005,P0020,Toys,North,99,99,133,34.99,0,Cloudy,0,37.50,Winter,0,134,4,4,1,2024,0,65.0,55.0,8.0,83.714286,26.405266,62.178571,34.551147
73196,2024-01-27,S005,P0020,Toys,North,133,28,0,22.55,10,Snowy,0,26.95,Winter,0,38,5,4,1,2024,1,99.0,108.0,22.0,90.000000,23.508864,65.428571,33.531001
73197,2024-01-28,S005,P0020,Toys,North,105,83,122,30.87,15,Cloudy,1,28.08,Winter,0,130,6,4,1,2024,1,28.0,60.0,20.0,78.571429,31.415798,65.642857,33.261271
73198,2024-01-29,S005,P0020,Toys,North,144,112,94,31.95,5,Cloudy,0,31.33,Winter,0,105,0,5,1,2024,0,83.0,121.0,14.0,81.857143,30.333857,67.892857,32.172382


In [40]:
pd.set_option('display.max_columns', None)  # Show all columns
print(df.columns)

Index(['Date', 'Store ID', 'Product ID', 'Category', 'Region',
       'Inventory Level', 'Units Sold', 'Units Ordered', 'Price', 'Discount',
       'Weather Condition', 'Promotion', 'Competitor Pricing', 'Seasonality',
       'Epidemic', 'Demand', 'day_of_week', 'week_of_year', 'month', 'year',
       'is_weekend', 'units_sold_lag_1', 'units_sold_lag_7',
       'units_sold_lag_28', 'units_sold_roll_mean_7', 'units_sold_roll_std_7',
       'units_sold_roll_mean_28', 'units_sold_roll_std_28'],
      dtype='object')


✅ Step 3: Encoding, Scaling & Reshaping


✅ 1. Label Encoding (for categorical features)

In [41]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Store ID', 'Product ID', 'Category']  # adjust if more exist
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [42]:
from sklearn.preprocessing import LabelEncoder

# Columns to encode
categorical_cols = ['Category', 'Region', 'Weather Condition',
                    'Promotion', 'Competitor Pricing', 'Seasonality',
                    'Epidemic', 'Demand']

# Apply Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # store encoders for inverse_transform later if needed

print("✅ All categorical columns encoded.")


✅ All categorical columns encoded.


✅ 2. Feature & Target Separation

In [43]:
# Drop columns not used as features
features = df.drop(columns=['Date', 'Units Sold'])  # Units Sold is the target
target = df['Units Sold']

✅ 3. Scaling the features

In [44]:
from sklearn.preprocessing import MinMaxScaler

scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(features)

scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(target.values.reshape(-1, 1))


✅ 4. Reshape for Deep Learning (Supervised Framing for Multi-step Forecasting)


We’ll prepare the data for 7-day and 28-day forecasting using sliding windows.

In [45]:
import numpy as np

def create_sequences(X, y, window_size=30, forecast_horizon=7):
    Xs, ys = [], []
    for i in range(len(X) - window_size - forecast_horizon + 1):
        Xs.append(X[i: i + window_size])
        ys.append(y[i + window_size: i + window_size + forecast_horizon])
    return np.array(Xs), np.array(ys)

# Create sequences
window_size = 30
forecast_horizon = 7  # can change to 28 for long-range forecasting
X_seq, y_seq = create_sequences(X_scaled, y_scaled, window_size, forecast_horizon)

print("✅ Reshaped:")
print("X shape:", X_seq.shape)  # (samples, time steps, features)
print("y shape:", y_seq.shape)  # (samples, forecast_horizon)


✅ Reshaped:
X shape: (73164, 30, 26)
y shape: (73164, 7, 1)


🔁 Step 1: Train-Test Split for Time Series


We'll split chronologically since it's time-series data. We'll use 2022–2023 for training, and early 2024 for testing.



In [46]:
import pandas as pd
# Ensure Date column is datetime
df['Date'] = pd.to_datetime(df['Date'])

# Sort chronologically
df = df.sort_values('Date')

# Split: Train = data before 2024, Test = 2024 data
train_df = df[df['Date'] < '2024-01-01']
test_df  = df[df['Date'] >= '2024-01-01']

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")


Train shape: (70200, 28), Test shape: (3000, 28)


 Step 2: Define Features and Target


In [47]:
target_col = 'Units Sold'
drop_cols = ['Date', target_col]

X_train = train_df.drop(columns=drop_cols)
y_train = train_df[target_col]

X_test = test_df.drop(columns=drop_cols)
y_test = test_df[target_col]


📏 Step 3: Scaling


We'll scale both features and target. LSTMs work better when data is normalized

In [48]:
from sklearn.preprocessing import MinMaxScaler

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

🔄 Step 4: Reshaping for LSTM

In [49]:
def create_sequences(X, y, window_size):
    Xs, ys = [], []
    for i in range(len(X) - window_size):
        Xs.append(X[i:i+window_size])
        ys.append(y[i+window_size])  # next value
    return np.array(Xs), np.array(ys)

WINDOW_SIZE = 7  # past 7 days
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, WINDOW_SIZE)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, WINDOW_SIZE)

print(f"Train X: {X_train_seq.shape}, Train y: {y_train_seq.shape}")
print(f"Test X: {X_test_seq.shape}, Test y: {y_test_seq.shape}")


Train X: (70193, 7, 26), Train y: (70193, 1)
Test X: (2993, 7, 26), Test y: (2993, 1)


 Step 1: Define the LSTM Model


In [60]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense

timesteps = 7   # input: past 7 days
features = X_train_seq.shape[2]
target_steps = 7  # predict next 7 days

# Encoder
encoder_inputs = Input(shape=(timesteps, features))
encoder = LSTM(128, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = RepeatVector(target_steps)(state_h)
decoder_lstm = LSTM(128, return_sequences=True)
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = TimeDistributed(Dense(1))
outputs = decoder_dense(decoder_outputs)

# Build model
seq2seq_model = Model(encoder_inputs, outputs)
seq2seq_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
seq2seq_model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 7, 26)]      0           []                               
                                                                                                  
 lstm_6 (LSTM)                  [(None, 128),        79360       ['input_1[0][0]']                
                                 (None, 128),                                                     
                                 (None, 128)]                                                     
                                                                                                  
 repeat_vector (RepeatVector)   (None, 7, 128)       0           ['lstm_6[0][1]']                 
                                                                                              

🚆 Step 2: Train the Model

In [61]:
history = model.fit(
    X_train_seq, y_train_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    shuffle=False,
    verbose=1
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


📈 Step 3: Make Predictions

In [62]:
y_pred_scaled = model.predict(X_test_seq)

# Rescale predictions and ground truth back to original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))
y_true = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1))




🧮 Step 4: Evaluation Metrics


In [63]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

y_pred_day1 = y_pred[:, 0]
y_true_day1 = y_true[:, 0]

mae = mean_absolute_error(y_true_day1, y_pred_day1)
rmse = mean_squared_error(y_true_day1, y_pred_day1, squared=False)

# Handle division by zero
mask = y_true_day1 != 0
mape = np.mean(np.abs((y_true_day1[mask] - y_pred_day1[mask]) / y_true_day1[mask])) * 100

print(f"✅ MAE: {mae:.2f}")
print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ MAPE: {mape:.2f} %")


✅ MAE: 31.73
✅ RMSE: 40.40
✅ MAPE: 66.77 %


In [64]:
df.columns

Index(['Date', 'Store ID', 'Product ID', 'Category', 'Region',
       'Inventory Level', 'Units Sold', 'Units Ordered', 'Price', 'Discount',
       'Weather Condition', 'Promotion', 'Competitor Pricing', 'Seasonality',
       'Epidemic', 'Demand', 'day_of_week', 'week_of_year', 'month', 'year',
       'is_weekend', 'units_sold_lag_1', 'units_sold_lag_7',
       'units_sold_lag_28', 'units_sold_roll_mean_7', 'units_sold_roll_std_7',
       'units_sold_roll_mean_28', 'units_sold_roll_std_28'],
      dtype='object')