<h2>Sales Forecasting</h2>

<h3>Data Science</h3>

<h4>
    
a. Import the datasets into the Python environment
    
b. Examine the dataset's shape and structure, and look out for any outliers
    
c. Merge the datasets into a single dataset that includes the date, item id, price, item count, item names, kcal values, store id, and store name</h4>

In [3]:
# Read the three source tables (sales, items, restaurants) from CSV files
# located in the current working directory.
import pandas as pd
sales_df = pd.read_csv('sales.csv')
items_df = pd.read_csv('items.csv')
# NOTE(review): the file name is spelled 'resturants.csv' in this notebook;
# confirm that it matches the file on disk before "correcting" it.
restaurants_df = pd.read_csv('resturants.csv')

In [4]:
# Preview the first five rows of the raw sales table
sales_df.head()

In [5]:
# Preview the first five rows of the raw restaurants table
restaurants_df.head()

In [6]:
# Preview the first five rows of the raw items table
items_df.head()

In [7]:
# Attach item attributes to every sales row: the sales table's 'item_id'
# is matched against the items table's 'id' (pandas' default inner join).
merged_sales_items = sales_df.merge(items_df, left_on='item_id', right_on='id')

In [8]:
# Attach restaurant attributes: the merged frame's 'store_id' is matched
# against the restaurants table's 'id'. Column names present on both sides
# receive pandas' default '_x' / '_y' suffixes.
final_merged_df = merged_sales_items.merge(restaurants_df, left_on='store_id', right_on='id')

In [9]:
# Select the relevant columns for the final merged dataset.
# '_x' columns come from the left side of the last merge and '_y' columns
# from the restaurants table (pandas' default merge suffixes).
selected_columns = ['date', 'id_x', 'price', 'item_count', 'name_x', 'kcal', 'store_id', 'name_y']

# Take an explicit copy: later cells add columns to final_dataset, and
# assigning into a slice of final_merged_df raises SettingWithCopyWarning.
# Renaming by mapping (instead of overwriting .columns positionally) keeps
# every new name tied to the column it replaces.
final_dataset = (
    final_merged_df[selected_columns]
    .copy()
    .rename(columns={'id_x': 'item_id', 'name_x': 'item_name', 'name_y': 'store_name'})
)

# Display the final dataset
final_dataset.head()

<h3>Exploratory data analysis:</h3>

<h4>a. Examine the overall date-wise sales to understand the pattern</h4>

In [21]:
import matplotlib.pyplot as plt
# Convert the 'date' column to datetime format.
# Parse final_dataset's own column: values taken from sales_df would be
# aligned on the index, and the merged frame's index is not guaranteed to
# line up row-for-row with sales_df.
final_dataset['date'] = pd.to_datetime(final_dataset['date'])

# Aggregate total item counts and total sales (revenue) by date.
# Revenue of a row is price * item_count (the definition used later in this
# notebook); summing 'price' alone would ignore how many units were sold.
datewise_sales = (
    final_dataset
    .assign(revenue=final_dataset['price'] * final_dataset['item_count'])
    .groupby('date')
    .agg(
        total_items_sold=('item_count', 'sum'),
        total_sales=('revenue', 'sum')
    )
    .reset_index()
)

# Display the aggregated data
print(datewise_sales)

# Plot total items sold over time
plt.figure(figsize=(10, 6))
plt.plot(datewise_sales['date'], datewise_sales['total_items_sold'], marker='o', label='Total Items Sold')
plt.title('Date-wise Total Items Sold')
plt.xlabel('Date')
plt.ylabel('Total Items Sold')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()

# Plot total sales (revenue) over time
plt.figure(figsize=(10, 6))
plt.plot(datewise_sales['date'], datewise_sales['total_sales'], marker='o', color='orange', label='Total Sales (Revenue)')
plt.title('Date-wise Total Sales')
plt.xlabel('Date')
plt.ylabel('Total Sales (Revenue)')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()

<h4>b. Find out how sales fluctuate across different days of the week</h4>

In [22]:

# Extract the day of the week from the 'date' column (0=Monday, 6=Sunday)
final_dataset['day_of_week'] = final_dataset['date'].dt.dayofweek

# Map the day of the week to their names.
# The mapping is applied to final_dataset's own 'day_of_week' column;
# sales_df never receives such a column, so reading it there raises KeyError.
day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
final_dataset['day_name'] = final_dataset['day_of_week'].map(day_names)

# Aggregate total item counts and total sales by day of the week.
# Revenue of a row is price * item_count, so that product is what is summed.
# reindex() puts the weekdays in calendar order instead of alphabetical order.
sales_by_day = (
    final_dataset
    .assign(revenue=final_dataset['price'] * final_dataset['item_count'])
    .groupby('day_name')
    .agg(
        total_items_sold=('item_count', 'sum'),
        total_sales=('revenue', 'sum')
    )
    .reindex(list(day_names.values()))
    .reset_index()
)

# Display the aggregated data
print(sales_by_day)

# Plot total items sold by day of the week
plt.figure(figsize=(10, 6))
plt.bar(sales_by_day['day_name'], sales_by_day['total_items_sold'], color='skyblue')
plt.title('Total Items Sold by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Total Items Sold')
plt.grid(axis='y')
plt.show()

# Plot total sales (revenue) by day of the week
plt.figure(figsize=(10, 6))
plt.bar(sales_by_day['day_name'], sales_by_day['total_sales'], color='orange')
plt.title('Total Sales (Revenue) by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Total Sales (Revenue)')
plt.grid(axis='y')
plt.show()

<h4>c. Look for any noticeable trends in the sales data for different months of the year</h4>

In [23]:
# Extract year and month
final_dataset['year'] = final_dataset['date'].dt.year
final_dataset['month'] = final_dataset['date'].dt.month

# Aggregate sales data by year and month.
# Revenue of a row is price * item_count; summing 'price' alone would ignore
# the number of units sold.
monthly_sales = (
    final_dataset
    .assign(revenue=final_dataset['price'] * final_dataset['item_count'])
    .groupby(['year', 'month'])
    .agg(
        total_sales=('revenue', 'sum'),
        total_item_count=('item_count', 'sum')
    )
    .reset_index()
)

# One 'month-year' label per aggregated row, shared by both charts
month_labels = monthly_sales['month'].astype(str) + '-' + monthly_sales['year'].astype(str)

# Plot total sales per month
plt.figure(figsize=(12, 6))
plt.plot(month_labels, monthly_sales['total_sales'], marker='o', linestyle='-')
plt.xticks(rotation=90)
plt.xlabel('Month-Year')
plt.ylabel('Total Sales')
plt.title('Total Sales per Month')
plt.grid(True)
plt.show()

# Plot total item count per month
plt.figure(figsize=(12, 6))
plt.plot(month_labels, monthly_sales['total_item_count'], marker='o', linestyle='-')
plt.xticks(rotation=90)
plt.xlabel('Month-Year')
plt.ylabel('Total Item Count')
plt.title('Total Item Count per Month')
plt.grid(True)
plt.show()

<h4>d. Examine the sales distribution across different quarters averaged over the years. Identify any noticeable patterns.</h4>

In [24]:
# Quarter of the year as a number 1-4.
# A year-specific period (e.g. 2019Q1) would make every quarter unique, so
# averaging "per quarter across years" would average a single value.
final_dataset['quarter'] = final_dataset['date'].dt.quarter

# Aggregate sales data by year and quarter.
# Revenue of a row is price * item_count.
quarterly_sales = (
    final_dataset
    .assign(
        year=final_dataset['date'].dt.year,
        revenue=final_dataset['price'] * final_dataset['item_count'],
    )
    .groupby(['year', 'quarter'])
    .agg(
        total_sales=('revenue', 'sum'),
        total_item_count=('item_count', 'sum')
    )
    .reset_index()
)

# Calculate average sales and item count per quarter across years
quarterly_avg_sales = quarterly_sales.groupby('quarter').agg(
    avg_sales=('total_sales', 'mean'),
    avg_item_count=('total_item_count', 'mean')
).reset_index()

# Plot average sales per quarter
plt.figure(figsize=(12, 6))
quarters = 'Q' + quarterly_avg_sales['quarter'].astype(str)
plt.plot(quarters, quarterly_avg_sales['avg_sales'], marker='o', linestyle='-')
plt.xticks(rotation=45)
plt.xlabel('Quarter')
plt.ylabel('Average Sales')
plt.title('Average Sales per Quarter')
plt.grid(True)
plt.show()

# Plot average item count per quarter
plt.figure(figsize=(12, 6))
plt.plot(quarters, quarterly_avg_sales['avg_item_count'], marker='o', linestyle='-')
plt.xticks(rotation=45)
plt.xlabel('Quarter')
plt.ylabel('Average Item Count')
plt.title('Average Item Count per Quarter')
plt.grid(True)
plt.show()

<h4>e. Compare the performances of the different restaurants. Find out which restaurant had the most sales and look at the sales for each restaurant across different years, months, and days.</h4>

In [25]:
# Aggregate total sales by restaurant.
# Revenue of a row is price * item_count; ranking restaurants by the sum of
# unit prices alone would ignore how many units each one sold.
restaurant_sales = (
    final_dataset
    .assign(revenue=final_dataset['price'] * final_dataset['item_count'])
    .groupby('store_name')
    .agg(
        total_sales=('revenue', 'sum'),
        total_item_count=('item_count', 'sum')
    )
    .reset_index()
)

# Find the restaurant with the most sales
top_restaurant = restaurant_sales.loc[restaurant_sales['total_sales'].idxmax()]

print(f"Restaurant with the most sales: {top_restaurant['store_name']}")
print(f"Total Sales: ${top_restaurant['total_sales']:.2f}")

In [26]:
# Extract year, month, and quarter
import matplotlib.pyplot as plt

# Parse 'date' once, then derive the calendar parts from the parsed values.
# 'quarter' is stored here as a year-specific quarterly period.
parsed_dates = pd.to_datetime(final_dataset['date'])
final_dataset['date'] = parsed_dates
final_dataset['year'] = parsed_dates.dt.year
final_dataset['month'] = parsed_dates.dt.month
final_dataset['quarter'] = parsed_dates.dt.to_period('Q')


In [27]:
# Revenue of a row is price * item_count; it is computed once here and
# summed at three granularities below. Summing 'price' alone would ignore
# the number of units sold.
revenue_rows = final_dataset.assign(
    revenue=final_dataset['price'] * final_dataset['item_count']
)

# Aggregate sales data by restaurant and year
yearly_sales = revenue_rows.groupby(['store_name', 'year']).agg(
    total_sales=('revenue', 'sum')
).reset_index()

# Aggregate sales data by restaurant and month
monthly_sales = revenue_rows.groupby(['store_name', 'year', 'month']).agg(
    total_sales=('revenue', 'sum')
).reset_index()

# Aggregate sales data by restaurant and day
daily_sales = revenue_rows.groupby(['store_name', 'date']).agg(
    total_sales=('revenue', 'sum')
).reset_index()

In [28]:
import pandas as pd
import matplotlib.pyplot as plt

# Total items sold per calendar month (1-12), pooled over all years,
# computed from final_dataset.

# Ensure 'date' is a datetime column and derive the month number from it
final_dataset['date'] = pd.to_datetime(final_dataset['date'])
final_dataset['month'] = final_dataset['date'].dt.month

# Sum item_count within each month
monthly_sales = final_dataset.groupby('month')['item_count'].sum()

# Bar chart of the monthly totals
_, ax = plt.subplots(figsize=(10, 6))
monthly_sales.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Total Item Sales by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Total Items Sold')
plt.xticks(rotation=0)
plt.show()

In [29]:
import pandas as pd
import matplotlib.pyplot as plt

# Mean item_count per row for each quarter of the year (1-4), pooled over
# all years.

# Ensure 'date' is a datetime column and derive the quarter number from it
final_dataset['date'] = pd.to_datetime(final_dataset['date'])
final_dataset['quarter'] = final_dataset['date'].dt.quarter

# Average item_count within each quarter
quarterly_sales = final_dataset.groupby('quarter')['item_count'].mean()

# Bar chart of the quarterly averages
_, ax = plt.subplots(figsize=(10, 6))
quarterly_sales.plot(kind='bar', color='orange', ax=ax)
ax.set_title('Average Item Sales by Quarter')
ax.set_xlabel('Quarter')
ax.set_ylabel('Average Items Sold')
plt.xticks(rotation=0)
plt.show()

<h4>f.
Identify the most popular items overall and the stores where they are being sold. Also, find out the most popular item at each store</h4>

In [33]:
import pandas as pd
import matplotlib.pyplot as plt
# Step 1: rank items by the total number of units sold across all stores
# (sum of 'item_count' per 'item_name', largest first).
most_popular_items = (
    final_dataset.groupby('item_name')['item_count']
    .sum()
    .reset_index()
    .sort_values(by='item_count', ascending=False)
)

print("Most Popular Items Overall:")
print(most_popular_items.head())  # Display the top 5 popular items

# Step 2: units sold of each item in each store
item_sales_by_store = (
    final_dataset.groupby(['item_name', 'store_name'])['item_count']
    .sum()
    .reset_index()
)

# Join the overall ranking with the per-store totals on 'item_name'.
# Both inputs have an 'item_count' column, so the result carries
# 'item_count_x' (overall total) and 'item_count_y' (total in that store).
top_item_stores = pd.merge(most_popular_items, item_sales_by_store, on='item_name')

print("Stores Selling the Most Popular Items:")
print(top_item_stores.head())  # Display top results

# Step 3: units sold per (store, item), ordered so that each store's
# best-selling item comes first within that store.
store_item_sales = (
    final_dataset.groupby(['store_name', 'item_name'])['item_count']
    .sum()
    .reset_index()
    .sort_values(by=['store_name', 'item_count'], ascending=[True, False])
)

# Keeping the first row per store leaves only its most popular item
most_popular_item_per_store = store_item_sales.drop_duplicates(subset=['store_name'], keep='first')

print("Most Popular Item at Each Store:")
print(most_popular_item_per_store)

In [35]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the ten best-selling items overall
top_10_items = most_popular_items.head(10)

_, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_10_items, x='item_count', y='item_name', palette='viridis', ax=ax)
ax.set_title('Top 10 Most Popular Items (Overall)', fontsize=16)
ax.set_xlabel('Total Sales (Item Count)', fontsize=12)
ax.set_ylabel('Item Name', fontsize=12)
plt.show()

In [40]:
# Plot total sales of the top 5 items in each store.
# In top_item_stores, 'item_count_x' is the item's overall total (the same
# value in every store) and 'item_count_y' is the total sold in that store,
# so both the ranking and the plotted bars use 'item_count_y'.
# The result gets its own name so re-running the cell does not shrink its input.
top_items_per_store = (
    top_item_stores
    .sort_values(by=['store_name', 'item_count_y'], ascending=[True, False])
    .groupby('store_name')
    .head(5)
)

plt.figure(figsize=(12, 6))
sns.barplot(data=top_items_per_store, x='store_name', y='item_count_y', hue='item_name', palette='coolwarm')
plt.title('Total Sales of Most Popular Items by Store', fontsize=16)
plt.xlabel('Store Name', fontsize=12)
plt.ylabel('Total Sales (Item Count)', fontsize=12)
plt.xticks(rotation=45)
plt.legend(title='Item Name', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [38]:
# One bar per store showing the units sold of that store's best-selling
# item; the bar colour identifies the item.
_, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=most_popular_item_per_store,
    x='store_name',
    y='item_count',
    hue='item_name',
    palette='magma',
    ax=ax,
)
ax.set_title('Most Popular Item in Each Store', fontsize=16)
ax.set_xlabel('Store Name', fontsize=12)
ax.set_ylabel('Total Sales (Item Count)', fontsize=12)
plt.xticks(rotation=45)
ax.legend(title='Item Name', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [43]:
# Share of units sold among the five best-selling items
top_5_items = most_popular_items.head(5)

_, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    top_5_items['item_count'],
    labels=top_5_items['item_name'],
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette('Set2'),
)
ax.set_title('Sales Distribution of Top 5 Most Popular Items', fontsize=16)
ax.axis('equal')  # Equal aspect ratio keeps the pie circular
plt.show()

In [51]:
# Create pivot table for heatmap: one row per store, one column per item,
# cell value = units sold. Store/item pairs absent from item_sales_by_store
# become NaN. The arguments are passed by keyword because DataFrame.pivot
# only accepts keyword arguments in pandas 2.0 and later.
heatmap_data = item_sales_by_store.pivot(index='store_name', columns='item_name', values='item_count')

plt.figure(figsize=(14, 8))
sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, fmt='.0f', linewidths=0.5)
plt.title('Heatmap of Item Sales by Store', fontsize=16)
plt.xlabel('Item Name', fontsize=12)
plt.ylabel('Store Name', fontsize=12)
plt.show()

In [46]:
import pandas as pd

# Group by item name to calculate total sales (item_count) for each item
popular_items = final_dataset.groupby('item_name')['item_count'].sum().sort_values(ascending=False)

# Find the most popular item overall
most_popular_item = popular_items.idxmax()
most_popular_item_sales = popular_items.max()

print(f"The most popular item overall is: {most_popular_item} with {most_popular_item_sales} total items sold.")

# Find which stores sell the most popular items
stores_with_popular_items = final_dataset[final_dataset['item_name'] == most_popular_item]['store_name'].unique()
print(f"Stores selling the most popular item ({most_popular_item}): {stores_with_popular_items}")

# Now, to find the most popular item at each store.
# The totals must be computed per (store, item) pair; grouping by item
# alone leaves no store in the table to pick a winner for.
popular_items_by_store = final_dataset.groupby(['store_name', 'item_name'])['item_count'].sum()

# Reset the index for easier manipulation
popular_items_by_store = popular_items_by_store.reset_index()

# For each store, keep the row holding that store's largest item_count
most_popular_item_by_store = popular_items_by_store.loc[popular_items_by_store.groupby('store_name')['item_count'].idxmax()]

print("Most popular item at each store:")
print(most_popular_item_by_store)

# Optional: If you want to plot the most popular items overall
import matplotlib.pyplot as plt

plt.figure(figsize=(12,6))
popular_items.head(10).plot(kind='bar', color='green')
plt.title('Top 10 Most Popular Items')
plt.xlabel('Item')
plt.ylabel('Total Items Sold')
plt.xticks(rotation=45)
plt.show()


<h4>g. Determine if the store with the highest sales volume is also making the most money per day</h4>

In [47]:
import pandas as pd

# Compare the store that sells the most units with the store that earns the
# most per day on average.

# Ensure 'date' is a datetime column
final_dataset['date'] = pd.to_datetime(final_dataset['date'])

# Units sold per store, and the store with the largest total
store_sales_volume = final_dataset.groupby('store_name')['item_count'].sum()
top_store_by_sales_volume = store_sales_volume.idxmax()
top_store_sales_volume = store_sales_volume.max()

print(f"The store with the highest sales volume is: {top_store_by_sales_volume} with {top_store_sales_volume} total items sold.")

# Revenue of each row: unit price multiplied by units sold
final_dataset['daily_revenue'] = final_dataset['price'].mul(final_dataset['item_count'])

# Revenue per store per day, then each store's mean over its days
daily_revenue_by_store = final_dataset.groupby(['store_name', 'date'])['daily_revenue'].sum()
average_daily_revenue_by_store = daily_revenue_by_store.groupby(level='store_name').mean()

# Store with the highest average daily revenue
top_store_by_daily_revenue = average_daily_revenue_by_store.idxmax()
top_store_daily_revenue = average_daily_revenue_by_store.max()

print(f"The store with the highest average daily revenue is: {top_store_by_daily_revenue} with an average daily revenue of {top_store_daily_revenue:.2f}")

# Report whether the two winners are the same store
if top_store_by_sales_volume != top_store_by_daily_revenue:
    print(f"The store with the highest sales volume, {top_store_by_sales_volume}, does not make the most money per day. The store with the highest daily revenue is {top_store_by_daily_revenue}.")
else:
    print(f"The store with the highest sales volume, {top_store_by_sales_volume}, also makes the most money per day.")


<h4>h. Identify the most expensive item at each restaurant and find out its calorie count</h4>

In [52]:
# Step 1: for every (store, item) pair take the highest recorded price and
# the first recorded kcal value.
price_and_kcal = {'price': 'max', 'kcal': 'first'}
most_expensive_items = (
    final_dataset.groupby(['store_name', 'item_name'])
    .agg(price_and_kcal)
    .reset_index()
)

# Step 2: order rows so each store's priciest item comes first in that store
most_expensive_items = most_expensive_items.sort_values(
    by=['store_name', 'price'],
    ascending=[True, False],
)

# Step 3: the first row per store is that store's most expensive item
most_expensive_item_per_store = most_expensive_items.drop_duplicates(subset=['store_name'], keep='first')

# Step 4: show the item, its price and its calorie count for each store
print("Most Expensive Item at Each Restaurant (and its Calorie Count):")
report_columns = ['store_name', 'item_name', 'price', 'kcal']
print(most_expensive_item_per_store[report_columns])


<h3>Machine Learning</h3>

<h4>a.
Build and compare linear regression, random forest, and XGBoost models for predictions</h4>

<h5>
    
• Generate necessary features for the development of these models, like day of the week, quarter of the year, month, year, day of the month and so on
    
• Use the data from the last six months as the testing data
    
• Compute the root mean square error (RMSE) values for each model to compare their performances
    
• Use the best-performing models to make a forecast for the next year</h5>

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Step 1: Feature Engineering
# Ensure 'date' is a datetime column; the forecasting step further down in
# this cell also reads final_dataset['date'].max().
final_dataset['date'] = pd.to_datetime(final_dataset['date'])

# Target: total sales revenue per calendar day.
# The models only see date-derived features, so the data is first collapsed
# to one row per date. Left at row level, many rows would share identical
# features with different targets and the forecast would describe an
# average row instead of a day's sales.
daily_revenue_df = (
    final_dataset
    .assign(sales_revenue=final_dataset['price'] * final_dataset['item_count'])
    .groupby('date', as_index=False)['sales_revenue']
    .sum()
    .sort_values('date')
    .reset_index(drop=True)
)
daily_revenue_df['year'] = daily_revenue_df['date'].dt.year
daily_revenue_df['month'] = daily_revenue_df['date'].dt.month
daily_revenue_df['day'] = daily_revenue_df['date'].dt.day
daily_revenue_df['day_of_week'] = daily_revenue_df['date'].dt.dayofweek
daily_revenue_df['quarter'] = daily_revenue_df['date'].dt.quarter

# Step 2: Prepare Training and Testing Data (last 6 months for testing)
test_start = daily_revenue_df['date'].max() - pd.DateOffset(months=6)
train_data = daily_revenue_df[daily_revenue_df['date'] < test_start]
test_data = daily_revenue_df[daily_revenue_df['date'] >= test_start]

# Features for the models
features = ['year', 'month', 'day', 'day_of_week', 'quarter']
X_train = train_data[features]
y_train = train_data['sales_revenue']
X_test = test_data[features]
y_test = test_data['sales_revenue']

# Standardizing features (especially important for linear regression).
# The scaler is fitted on the training rows only and then applied to the
# test rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Build Models
def _rmse(actual, predicted):
    """Return the root mean squared error between actual and predicted values."""
    return np.sqrt(mean_squared_error(actual, predicted))


# Linear Regression: fitted and evaluated on the standardized features
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)
y_pred_lr = lin_reg.predict(X_test_scaled)
rmse_lr = _rmse(y_test, y_pred_lr)

# Random Forest: fitted and evaluated on the unscaled features
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)
rmse_rf = _rmse(y_test, y_pred_rf)

# XGBoost: fitted and evaluated on the unscaled features
xgb_reg = XGBRegressor(n_estimators=100, random_state=42)
xgb_reg.fit(X_train, y_train)
y_pred_xgb = xgb_reg.predict(X_test)
rmse_xgb = _rmse(y_test, y_pred_xgb)

# Step 4: Compare RMSE for each model (lower is better)
print(f"RMSE - Linear Regression: {rmse_lr}")
print(f"RMSE - Random Forest: {rmse_rf}")
print(f"RMSE - XGBoost: {rmse_xgb}")

# Step 5: Forecast for the Next Year using the best model (lowest RMSE).
# Each candidate records whether it was fitted on standardized features:
# linear regression was, the two tree models were fitted on the raw ones.
# Candidates are listed so that min() breaks ties in the order
# linear regression, random forest, XGBoost.
model_candidates = [
    (rmse_lr, lin_reg, True),
    (rmse_rf, rf_reg, False),
    (rmse_xgb, xgb_reg, False),
]
_, best_model, best_model_uses_scaled = min(model_candidates, key=lambda candidate: candidate[0])

# Prepare the next year's data for forecasting
future_dates = pd.date_range(start=final_dataset['date'].max() + pd.DateOffset(days=1), periods=365, freq='D')
future_df = pd.DataFrame({'date': future_dates})
future_df['year'] = future_df['date'].dt.year
future_df['month'] = future_df['date'].dt.month
future_df['day'] = future_df['date'].dt.day
future_df['day_of_week'] = future_df['date'].dt.dayofweek
future_df['quarter'] = future_df['date'].dt.quarter

X_future = future_df[features]
X_future_scaled = scaler.transform(X_future)  # Scale the future data as well

# Predict sales for the next year.
# The model must receive features in the representation it was trained on;
# feeding standardized values to a model fitted on raw values would send
# every future row down the wrong branches.
X_future_for_model = X_future_scaled if best_model_uses_scaled else X_future
future_sales_forecast = best_model.predict(X_future_for_model)

# Step 6: Add forecasted sales to the future dataframe
future_df['forecasted_sales_revenue'] = future_sales_forecast

# Display the forecasted sales for the next year
future_df.head()


<h3>Deep Learning</h3>

<h4>Forecasting using deep learning algorithms:</h4>

<h4>
    
    a. Use sales amount for predictions instead of item count
    
    b. Build a long short-term memory (LSTM) model for predictions
    
        • Define the train and test series
        • Generate synthetic data for the last 12 months
        • Build and train an LSTM model
        • Use the model to make predictions for the test data
    
    c. Calculate the mean absolute percentage error (MAPE) and comment on the model's performance
    
    d. Develop another model using the entire series for training, and use it to forecast for the next three months</h4>

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Step 1: Prepare the Data (sales_revenue for predictions)
# Revenue of each row is unit price multiplied by units sold.
revenue_rows = pd.DataFrame({
    'date': pd.to_datetime(final_dataset['date']),
    'sales_revenue': final_dataset['price'] * final_dataset['item_count'],
})

# Step 2: Build the time series: one observation per calendar day, indexed
# and sorted by date. The windows below are described as "the last 30 days",
# which only holds when every step of the series is one day.
# NOTE: this rebinds final_dataset to the daily series, as the rest of this
# cell reads it under that name. Re-running this cell therefore requires
# re-running the cells that build the merged dataset first.
final_dataset = revenue_rows.groupby('date')[['sales_revenue']].sum().sort_index()

# Step 3: Split the data into train and test (last 12 months as test)
split_date = final_dataset.index.max() - pd.DateOffset(months=12)
train_data = final_dataset[final_dataset.index < split_date]
test_data = final_dataset[final_dataset.index >= split_date]

# Normalize the data for LSTM.
# The scaler is fitted on the training period only and reused for the test
# period.
scaler = MinMaxScaler(feature_range=(0, 1))
train_scaled = scaler.fit_transform(train_data)
test_scaled = scaler.transform(test_data)

# Step 4: Prepare the data for LSTM
def create_sequences(data, time_steps=30):
    """Slice a series into overlapping input windows and their next-step targets.

    Args:
        data: Sliceable sequence of observations (e.g. a 2-D array with one
            row per time step).
        time_steps: Number of consecutive observations in each input window.

    Returns:
        A pair of arrays (X, y): X[i] is data[i:i + time_steps] and y[i] is
        the observation that immediately follows that window. Both are empty
        when data holds no more than time_steps observations.
    """
    window_starts = range(len(data) - time_steps)
    windows = [data[start:start + time_steps] for start in window_starts]
    targets = [data[start + time_steps] for start in window_starts]
    return np.array(windows), np.array(targets)

# Window length: each sample holds 30 consecutive observations and the
# target is the observation that follows them.
time_steps = 30
X_train, y_train = create_sequences(train_scaled, time_steps)
X_test, y_test = create_sequences(test_scaled, time_steps)

# LSTM layers expect input shaped (samples, time steps, features); there is
# a single feature here.
X_train = X_train.reshape(X_train.shape[:2] + (1,))
X_test = X_test.reshape(X_test.shape[:2] + (1,))

# Step 5: Build the LSTM Model
# Two stacked 50-unit LSTM layers (the first returns the full sequence so
# the second can consume it) followed by a single-output dense layer.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(time_steps, 1)),
    LSTM(units=50),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

# Step 6: Train the model
# NOTE(review): no random seed is set in this cell, so results will
# presumably differ between runs.
model.fit(X_train, y_train, epochs=50, batch_size=32)

# Step 7: Make Predictions for the Test Data
y_pred_scaled = model.predict(X_test)

# Map predictions and targets back from the scaled range to revenue units
y_pred = scaler.inverse_transform(y_pred_scaled)
y_test_actual = scaler.inverse_transform(y_test)

# Step 8: Calculate MAPE
mape = mean_absolute_percentage_error(y_test_actual, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape * 100:.2f}%")

# Plot the actual vs predicted sales.
# The first `time_steps` test observations only serve as input for the
# first window, so the plotted dates start after them.
evaluated_dates = test_data.index[time_steps:]
_, ax = plt.subplots(figsize=(10, 6))
ax.plot(evaluated_dates, y_test_actual, color='blue', label='Actual Sales')
ax.plot(evaluated_dates, y_pred, color='red', label='Predicted Sales')
ax.set_title('LSTM Model - Actual vs Predicted Sales')
ax.set_xlabel('Date')
ax.set_ylabel('Sales Revenue')
ax.legend()
plt.show()

# Step 9: Train a model with the entire series for forecasting the next 3 months
# The existing scaler is re-fitted on the full series (train + test), so
# from here on it maps to and from the full-series value range.
full_data_scaled = scaler.fit_transform(final_dataset)

# Windows and next-step targets over the full series, shaped
# (samples, time steps, features) for the LSTM
X_full, y_full = create_sequences(full_data_scaled, time_steps)
X_full = X_full.reshape(X_full.shape[:2] + (1,))

# Same architecture as the evaluation model, trained on every window
model_full = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(time_steps, 1)),
    LSTM(units=50),
    Dense(1),
])
model_full.compile(optimizer='adam', loss='mean_squared_error')

model_full.fit(X_full, y_full, epochs=50, batch_size=32)

# Step 10: Forecast for the next 3 months (90 days)
future_days = 90
# Seed window: the last `time_steps` scaled observations, shaped
# (1 sample, time_steps, 1 feature) as the LSTM expects.
X_input = full_data_scaled[-time_steps:]
X_input = X_input.reshape((1, time_steps, 1))

forecasted_sales = []

for _ in range(future_days):
    # verbose=0 keeps the loop from printing a progress bar on every step
    prediction_scaled = model_full.predict(X_input, verbose=0)
    forecasted_sales.append(prediction_scaled[0, 0])

    # Slide the window forward: drop the oldest step and append the new
    # prediction. The prediction has shape (1, 1) and must be reshaped to
    # (1, 1, 1) so it has the same number of dimensions as the 3-D window;
    # otherwise the concatenation raises ValueError.
    next_step = prediction_scaled.reshape(1, 1, 1)
    X_input = np.concatenate([X_input[:, 1:, :], next_step], axis=1)

# Inverse scale the forecasted values
forecasted_sales = scaler.inverse_transform(np.array(forecasted_sales).reshape(-1, 1))

# Create a future date range for the next 3 months
future_dates = pd.date_range(start=final_dataset.index.max() + pd.DateOffset(days=1), periods=future_days)

# Plot the forecast
plt.figure(figsize=(10,6))
plt.plot(future_dates, forecasted_sales, color='green', label='Forecasted Sales for Next 3 Months')
plt.title('LSTM Forecast - Next 3 Months')
plt.xlabel('Date')
plt.ylabel('Sales Revenue')
plt.legend()
plt.show()
