In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
import pickle

In [2]:
# Load and process the data
df = pd.read_csv('Updated_Common_Data_ASDA.csv')

# Convert date from 'YYYYMMDD' to datetime
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

# Extract features from date
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.dayofweek

# Drop the original date column
df = df.drop(columns=['date','own_brand'])


In [3]:
df.head()

Unnamed: 0,supermarket,unit_price,unit,names,category,Morrisons_price,Tesco_price,Sains_price,year,month,day,weekday
0,ASDA,7.0,kg,Pukka Minced Beef & Onion Pie,fresh_food,9.3,2.0,7.0,2024,1,29,0
1,ASDA,7.1,l,Yakult Original,fresh_food,5.6,7.12,7.2,2024,1,29,0
2,ASDA,4.6,kg,Heinz Tomato Ketchup,food_cupboard,5.4,3.9,5.733333,2024,1,29,0
3,ASDA,15.88,kg,Paxo Sage & Onion Stuffing Mix,food_cupboard,15.88,12.95,13.283333,2024,1,29,0
4,ASDA,25.0,kg,Lindt Gold Bunny White Chocolate,food_cupboard,37.5,32.5,37.5,2024,1,29,0


In [5]:
df.duplicated().sum()

0

In [4]:
# Encode categorical features
label_encoders = {}
for column in ['supermarket', 'names','unit','category']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Function to create sequences for LSTM
def create_sequences(group, sequence_length):
    X, y = [], []
    for i in range(len(group) - sequence_length):
        X.append(group.iloc[i:i + sequence_length].drop(columns='unit_price').values)
        y.append(group.iloc[i + sequence_length]['unit_price'])
    return np.array(X), np.array(y)

sequence_length = 10
X_list, y_list = [], []

# Group by the product or other unique identifier column
for _, group in df.groupby('names'):
    X_group, y_group = create_sequences(group, sequence_length)
    X_list.append(X_group)
    y_list.append(y_group)

# Concatenate all groups
X = np.concatenate(X_list)
y = np.concatenate(y_list)

# Ensure correct data types
X = X.astype(np.float32)
y = y.astype(np.float32)

In [5]:

# Split the data into training and testing sets
train_X,test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure correct data types for train and test sets
train_X = train_X.astype(np.float32)
train_y = train_y.astype(np.float32)
X_test =test_X.astype(np.float32)
test_y = test_y.astype(np.float32)

# Custom RMSE loss function
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Build the enhanced LSTM model with additional layers
model = Sequential()

# First LSTM layer with Dropout
model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dropout(0.2))

# Second LSTM layer with Dropout
model.add(LSTM(50, activation='relu', return_sequences=True))
model.add(Dropout(0.2))

# Third LSTM layer with Dropout
model.add(LSTM(50, activation='relu', return_sequences=True))
model.add(Dropout(0.2))

# Fourth LSTM layer with Dropout
model.add(LSTM(50, activation='relu'))
model.add(Dropout(0.2))

# First Dense layer
model.add(Dense(25, activation='relu'))

# Second Dense layer
model.add(Dense(10, activation='relu'))

# Output layer
model.add(Dense(1))

# Compile the model with the custom RMSE loss function
model.compile(optimizer='adam', loss=rmse)

# Train the model
model.fit(train_X, train_y, epochs=100, batch_size=64, validation_split=0.1)

# Evaluate the model
loss = model.evaluate(X_test, test_y)
print(f"Test RMSE: {loss}")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [6]:
# Save the model
model.save('ASDA_lstm_model.h5')


In [8]:
# Load the model
model = load_model('ASDA_lstm_model.h5',compile=False)

In [9]:
import pickle

# Save label encoders
with open('ASDA_label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)


In [32]:
# Load label encoders
with open('ASDA_label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)




In [11]:
model.predict(X_test)

array([[17.020727],
       [17.020727],
       [17.020727],
       ...,
       [17.020727],
       [17.020727],
       [17.020727]], dtype=float32)

Prediction: [[17.020727]]


In [26]:
# Converttest_X back to DataFrame
# Assuming the feature columns are the same as when creating sequences
feature_columns = ['supermarket', 'names', 'unit', 'category', 'Morrisons_price', 'Tesco_price', 'Sains_price', 'year', 'month', 'day', 'weekday']

# Create a DataFrame fromtest_X
X_test_reshaped =test_X.reshape(X_test.shape[0] *test_X.shape[1],test_X.shape[2])
X_test_df = pd.DataFrame(X_test_reshaped, columns=feature_columns)


X_test_df.to_csv('X_test.csv', index=False)


In [36]:
import numpy as np
import pandas as pd

# Function to prepare the latest sequence for prediction
def prepare_latest_sequence(df, product_name, sequence_length, label_encoders):
    # Filter the DataFrame for the specific product
    df_product = df[df['names'] == product_name]
    
    # Sort by year, month, day to get the latest data
    df_product = df_product.sort_values(by=['year', 'month', 'day'])
    latest_sequence = df_product.tail(sequence_length)
    
    # Check if the sequence is shorter than required and pad if necessary
    if len(latest_sequence) < sequence_length:
        padding = sequence_length - len(latest_sequence)
        latest_sequence = pd.concat([df_product] * (padding + 1), ignore_index=True).tail(sequence_length)
    

    for column, le in label_encoders.items():
        if column in latest_sequence.columns:
            latest_sequence[column] = le.transform(latest_sequence[column])
    
    return latest_sequence

# Function to predict unit price for a given number of days
def predict_prices(model, product_name, label_encoders, sequence_length, num_days):
    predictions = []
    
    # Prepare the latest sequence for the specific product
    latest_sequence_df = prepare_latest_sequence(df, product_name, sequence_length, label_encoders)
    

    X_latest = latest_sequence_df.drop(columns='unit_price').values
    X_latest = X_latest.reshape(1, sequence_length, X_latest.shape[1])
    X_latest = X_latest.astype(np.float32)
    
    for _ in range(num_days):
        # Predict the unit price for the next day
        next_day_price = model.predict(X_latest)
        predictions.append(next_day_price[0][0])
        
        # Create new sequence by appending the predicted price
        # Prepare the new sequence
        new_sequence = np.zeros((sequence_length, X_latest.shape[2]))
        new_sequence[0:-1] = X_latest[0, 1:, :]  # Shift existing values
        new_sequence[-1] = np.append(X_latest[0, -1, :-1], next_day_price[0][0])  # Append the new prediction
        
        X_latest = new_sequence.reshape(1, sequence_length, X_latest.shape[2])
    
    return predictions

# Example usage
product_name = 'Pukka Minced Beef & Onion Pie'
num_days = 8  # Number of days to predict
predicted_prices = predict_prices(model, product_name, label_encoders, sequence_length, num_days)
print(f"Predicted Unit Prices for the next {num_days} days for '{product_name}': {predicted_prices}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Predicted Unit Prices for the next 8 days for 'Pukka Minced Beef & Onion Pie': [17.020727, 17.020727, 17.020727, 17.020727, 17.020727, 17.020727, 17.020727, 17.020727]


In [44]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import pickle
from sklearn.preprocessing import LabelEncoder

# Load the demand dataset
df_demand = pd.read_csv('Demand_Dataset1.csv')
df= pd.read_csv("Updated_Common_Data_ASDA.csv")

# Load the model and label encoders
model = load_model('ASDA_lstm_model.h5', compile=False)

with open('ASDA_label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

# Function to prepare the latest sequence for prediction
def prepare_latest_sequence(df, product_name, sequence_length, label_encoders):
    df_product = df[df['product'] == product_name]
    df_product = df_product.sort_values(by=['year', 'month', 'day'])
    latest_sequence = df_product.tail(sequence_length)
    
    if len(latest_sequence) < sequence_length:
        padding = sequence_length - len(latest_sequence)
        latest_sequence = pd.concat([df_product] * (padding + 1), ignore_index=True).tail(sequence_length)
    
 
    for column, le in label_encoders.items():
        if column in latest_sequence.columns:
            if set(latest_sequence[column].unique()).difference(set(le.classes_)):
                # Handle unseen labels
                new_classes = list(set(latest_sequence[column].unique()).difference(set(le.classes_)))
                le.classes_ = np.append(le.classes_, new_classes)
            latest_sequence[column] = le.transform(latest_sequence[column])
    
    # Ensure all columns are present
    all_features = list(label_encoders.keys()) + ['unit_price']
    for feature in all_features:
        if feature not in latest_sequence.columns:
            latest_sequence[feature] = 0  # Fill missing columns with zeros
    
    # Reorder columns to match model input
    latest_sequence = latest_sequence[all_features]
    

    X_latest = latest_sequence.drop(columns=['unit_price']).select_dtypes(include=[np.number])
    
    # Pad feature columns if there are fewer than expected features
    expected_features = 11
    if X_latest.shape[1] < expected_features:
        padding = expected_features - X_latest.shape[1]
        X_latest = np.pad(X_latest.values, ((0, 0), (0, padding)), mode='constant')
    elif X_latest.shape[1] > expected_features:
        X_latest = X_latest.iloc[:, :expected_features]
    
    return X_latest

# Function to predict unit price for a given number of days
def predict_prices(model, product_name, label_encoders, sequence_length, num_days):
    predictions = []
    latest_sequence_df = prepare_latest_sequence(df_demand, product_name, sequence_length, label_encoders)
    
    # Ensure data is in correct shape and type
    X_latest = latest_sequence_df.reshape(1, sequence_length, latest_sequence_df.shape[1])
    X_latest = X_latest.astype(np.float32)
    
    for _ in range(num_days):
        next_day_price = model.predict(X_latest)
        predictions.append(next_day_price[0][0])
        
        new_sequence = np.zeros((sequence_length, X_latest.shape[2]))
        new_sequence[0:-1] = X_latest[0, 1:, :]
        new_sequence[-1] = np.append(X_latest[0, -1, :-1], next_day_price[0][0])
        X_latest = new_sequence.reshape(1, sequence_length, X_latest.shape[2])
    
    return predictions

# Function to calculate dynamic price
def calculate_dynamic_price(predicted_price, df_demand, product_name, weekday):
    product_demand = df_demand[(df_demand['product'] == product_name) & (df_demand['weekday'] == weekday)]
    asda_sales = product_demand[product_demand['supermarket'] == 'ASDA']['total_sale'].sum()
    competitor_sales = product_demand[product_demand['supermarket'] != 'ASDA'].groupby('supermarket')['total_sale'].sum().sum()
    
    if asda_sales > 0:
        price_adjustment_factor = competitor_sales / (asda_sales + 1)
        dynamic_price = predicted_price * (1 + price_adjustment_factor)
    else:
        dynamic_price = predicted_price
    
    return dynamic_price

# Example usage
product_name = 'Pukka Minced Beef & Onion Pie'
sequence_length = 10
num_days = 1
weekday = 5  # Example: Tuesday

predicted_prices = predict_prices(model, product_name, label_encoders, sequence_length, num_days)
dynamic_prices = [calculate_dynamic_price(price, df_demand, product_name, weekday) for price in predicted_prices]

print(f"Predicted Unit Prices: {predicted_prices}")
print(f"Dynamic Prices: {dynamic_prices}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Predicted Unit Prices: [17.028584]
Dynamic Prices: [51.085750579833984]
