In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
import pickle



In [2]:
# %%
# Load the raw Tesco price table; `date` is parsed with the YYYYMMDD format.
df = pd.read_csv('Updated_Common_Data_Tesco.csv')
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

# The LSTM windows built later are slices of consecutive rows per product, so
# the rows have to be in chronological order. A stable (merge) sort leaves an
# already-ordered file untouched and keeps the file order of same-day rows.
df = df.sort_values('date', kind='mergesort')

# Expand the date into numeric calendar features the model can consume.
df = df.assign(
    year=df['date'].dt.year,
    month=df['date'].dt.month,
    day=df['date'].dt.day,
    weekday=df['date'].dt.dayofweek,  # Monday=0 ... Sunday=6
)

# Drop the raw date (now represented by the columns above) together with
# `own_brand`, which is not used as a feature.
# NOTE(review): the preview below lists an `Unnamed: 0` column - if that is a
# saved row index rather than a real feature, consider reading the CSV with
# index_col=0 so it is not fed to the model. Confirm against the CSV.
df = df.drop(columns=['date', 'own_brand'])



In [3]:
# Count rows that are exact copies (across all columns) of an earlier row.
df.duplicated().sum()

0

In [3]:

# %%
# Preview the first rows of the processed frame as a quick sanity check.
df.head()


Unnamed: 0,supermarket,unit_price,unit,names,category,ASDA_price,Morrisons_price,Sains_price,year,month,day,weekday
0,Tesco,11.00125,kg,Birds Eye,frozen,10.163125,9.567213,9.89,2024,1,29,0
1,Tesco,11.00125,kg,Birds Eye,frozen,10.163125,9.567213,9.89,2024,1,29,0
2,Tesco,4.2,kg,Heinz Oxtail Soup,food_cupboard,4.2,4.2,4.3,2024,1,29,0
3,Tesco,10.5,l,Lea & Perrins Worcestershire Sauce,food_cupboard,11.3,11.3,10.15,2024,1,29,0
4,Tesco,3.9,kg,Heinz Tomato Ketchup,food_cupboard,4.8,5.666667,5.425,2024,1,29,0


In [4]:

# Replace every categorical column with integer codes. One encoder is kept
# per column (keyed by column name) so the mapping can be reused later.
categorical_columns = ['supermarket', 'names', 'unit', 'category']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Function to create sequences for LSTM
def create_sequences(group, sequence_length):
    """Turn one group's rows into sliding windows for next-step prediction.

    Each sample is `sequence_length` consecutive rows of every column except
    `unit_price`; its target is the `unit_price` of the row that immediately
    follows the window. Rows are used in the order they appear in `group`.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single series; must contain a `unit_price` column.
    sequence_length : int
        Number of consecutive rows per window.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, sequence_length, n_features)
    y : numpy.ndarray, shape (n_windows,)
        `n_windows` is `len(group) - sequence_length`, floored at 0. When no
        window fits, correctly shaped empty arrays are returned so that the
        results of several groups can still be concatenated.
    """
    # Convert once up front; slicing NumPy arrays avoids rebuilding a
    # DataFrame (and re-dropping the target column) on every window.
    features = group.drop(columns='unit_price').to_numpy()
    targets = group['unit_price'].to_numpy()

    n_windows = len(group) - sequence_length
    if n_windows <= 0:
        # Keep the 3-D / 1-D layout even when empty: a bare np.array([]) has
        # shape (0,) and cannot be concatenated with real 3-D windows.
        empty_X = np.empty((0, sequence_length, features.shape[1]), dtype=features.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    X = np.stack([features[start:start + sequence_length] for start in range(n_windows)])
    # The target of window `start` is row `start + sequence_length`.
    y = targets[sequence_length:].copy()
    return X, y

sequence_length = 10  # number of consecutive rows fed to the LSTM per sample
X_list, y_list = [], []

# Build windows one product at a time so no window spans two products.
for _, group in df.groupby('names'):
    X_group, y_group = create_sequences(group, sequence_length)
    # A product with no more than `sequence_length` rows yields no windows.
    # Its empty result may not share the 3-D layout of the others and would
    # make np.concatenate fail, so it is left out.
    if len(y_group) == 0:
        continue
    X_list.append(X_group)
    y_list.append(y_group)

if not X_list:
    raise ValueError(
        f"No product has more than {sequence_length} rows; cannot build any sequences."
    )

# Stack all products and cast to the float32 dtype used for training.
X = np.concatenate(X_list).astype(np.float32)
y = np.concatenate(y_list).astype(np.float32)

In [5]:


# Hold out 20% of the windows for testing (fixed seed for a repeatable split)
# and make sure all four arrays are float32.
# NOTE(review): windows of one product overlap each other, so a random split
# can place near-identical windows in both train and test - confirm that this
# is acceptable, or switch to a chronological split.
train_X, test_X, train_y, test_y = (
    part.astype(np.float32)
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

# Custom RMSE loss function
def rmse(y_true, y_pred):
    """Root-mean-square error between targets and predictions.

    The mean is taken over every element of the squared difference, so the
    result is a single scalar tensor.
    """
    residual = y_pred - y_true
    mean_squared_error = K.mean(K.square(residual))
    return K.sqrt(mean_squared_error)

# Seed NumPy and TensorFlow so that random draws (e.g. weight initialisation)
# repeat from one "Restart & Run All" to the next. Some GPU kernels may still
# introduce small run-to-run differences.
np.random.seed(42)
tf.random.set_seed(42)

# Each sample has shape (timesteps, features), taken from the training array.
n_timesteps, n_features = train_X.shape[1], train_X.shape[2]

# Four stacked LSTM layers, each followed by dropout, then a small dense head.
# Every LSTM except the last returns the full sequence so the next LSTM gets
# one vector per timestep; the last one returns only its final output.
model = Sequential([
    LSTM(50, activation='relu', return_sequences=True,
         input_shape=(n_timesteps, n_features)),
    Dropout(0.2),
    LSTM(50, activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(50, activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(50, activation='relu'),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1),  # single regression output: the predicted unit price
])

# Optimise the custom RMSE loss directly.
model.compile(optimizer='adam', loss=rmse)

# Train, holding back 10% of the training windows for validation.
model.fit(train_X, train_y, epochs=100, batch_size=64, validation_split=0.1)

# Evaluate on the held-out test windows; the reported loss is the RMSE.
loss = model.evaluate(test_X, test_y)
print(f"Test RMSE: {loss}")



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [6]:

# Locations of the persisted artefacts.
MODEL_PATH = 'Tesco_lstm_model1.h5'
ENCODERS_PATH = 'Tesco_label_encoders.pkl'

# Write the trained network to disk and read it straight back.
# compile=False skips restoring the custom `rmse` loss, so the reloaded model
# can be used for prediction only until it is compiled again.
model.save(MODEL_PATH)
model = load_model(MODEL_PATH, compile=False)

# Round-trip the fitted label encoders so the same category-to-integer
# mapping is available wherever the model is used.
with open(ENCODERS_PATH, 'wb') as encoder_file:
    pickle.dump(label_encoders, encoder_file)

# Only unpickle files you created yourself: pickle.load can execute code.
with open(ENCODERS_PATH, 'rb') as encoder_file:
    label_encoders = pickle.load(encoder_file)

# Predictions of the reloaded model on the held-out windows.
model.predict(test_X)


array([[10.621125],
       [20.739857],
       [20.996904],
       ...,
       [20.990881],
       [20.830196],
       [20.775818]], dtype=float32)