In [None]:
#Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import math
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

plt.style.use('fivethirtyeight')


In [None]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
DATA_PATH = r'c:\Users\HOME\Downloads\data.csv'
# The file is decoded with 'unicode_escape' rather than the default UTF-8.
df = pd.read_csv(DATA_PATH, encoding='unicode_escape')

In [None]:
# Peek at ten randomly drawn rows (no random_state is set, so the rows differ between runs).
df.sample(n=10)

In [None]:
# Print the raw frame's columns, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing entries per column
df.isna().sum()

In [None]:
# (rows, columns) of the raw frame
df.shape

In [None]:
# Data Preprocessing
# 1. Drop rows without a customer or a product description.
df_cleaned = df.dropna(subset=['CustomerID', 'Description'])
# 2. Keep only rows with a strictly positive quantity and unit price.
has_positive_values = (df_cleaned['Quantity'] > 0) & (df_cleaned['UnitPrice'] > 0)
# Take an explicit copy so the column assignments below write to an independent
# frame instead of a filtered slice of `df` (avoids chained-assignment problems).
df_cleaned = df_cleaned[has_positive_values].copy()
# 3. Parse timestamps and derive the revenue of each invoice line.
df_cleaned['InvoiceDate'] = pd.to_datetime(df_cleaned['InvoiceDate'])
df_cleaned['Sales'] = df_cleaned['Quantity'] * df_cleaned['UnitPrice']


In [None]:
# How many rows repeat an earlier row in every column?
repeated_row_mask = df_cleaned.duplicated()
duplicate_rows_count = repeated_row_mask.sum()

print(f"The number of duplicate rows is: {duplicate_rows_count}")


In [None]:
# Dropping duplicate rows. Rebinding (instead of inplace=True) keeps the cell
# re-runnable and avoids mutating a frame that was derived from a filtered slice.
df_cleaned = df_cleaned.drop_duplicates()

In [None]:
# Summary statistics of Quantity and UnitPrice after cleaning
df_cleaned.loc[:, ['Quantity', 'UnitPrice']].describe()

In [None]:
# Description Countplot: the 15 descriptions that appear on the most rows
item_counts = (
    df_cleaned['Description']
    .value_counts()
    .sort_values(ascending=False)
    .head(15)
)

fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(x=item_counts.index, y=item_counts.values, palette='viridis', ax=ax)
ax.set_ylabel("Counts")
ax.set_title("Which items were bought more often?")
plt.xticks(rotation=90)  # long product names: print them vertically
plt.show()

In [None]:
# The 15 most frequent descriptions, as a table
df_cleaned['Description'].value_counts().nlargest(n=15)

In [None]:
# The five least frequent descriptions.
# NOTE(review): this reads the raw `df`, not `df_cleaned` — confirm that is intended.
df['Description'].value_counts().tail(5)

In [None]:
# Stock codes Count plot: the 15 stock codes that appear on the most rows
stock_counts = (
    df_cleaned['StockCode']
    .value_counts()
    .sort_values(ascending=False)
    .head(15)
)

fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(x=stock_counts.index, y=stock_counts.values, palette='mako', ax=ax)  # 'mako' colour palette
ax.set_ylabel("Counts")
ax.set_title("Which stock codes were used the most?")
plt.xticks(rotation=90)
plt.show()

In [None]:
# The 15 most frequent stock codes, as a table
df_cleaned['StockCode'].value_counts().nlargest(n=15)

In [None]:
# Total units sold per product, largest first, limited to the ten best sellers
quantity_per_product = df_cleaned.groupby('Description')['Quantity'].sum()
top_products = quantity_per_product.sort_values(ascending=False).iloc[:10].reset_index()

# One bar per product, coloured from the 'Set2' palette
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_products, x='Description', y='Quantity', palette='Set2', ax=ax)
ax.set_title('Top 10 Products by Quantity Sold')
ax.set_xlabel('Product Description')
ax.set_ylabel('Total Quantity Sold')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# The 15 invoices with the most rows (each row is one line on the invoice)
inv_counts = (
    df_cleaned['InvoiceNo']
    .value_counts()
    .sort_values(ascending=False)
    .head(15)
)

fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(x=inv_counts.index, y=inv_counts.values, palette='crest', ax=ax)
ax.set_ylabel("Counts")
ax.set_title("Which invoices had the most items?")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Revenue per product, largest first, limited to the ten highest earners
sales_per_product = df_cleaned.groupby('Description')['Sales'].sum()
product_sales = sales_per_product.sort_values(ascending=False).iloc[:10].reset_index()

# One bar per product, coloured from the 'Set1' palette
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=product_sales, x='Description', y='Sales', palette='Set1', ax=ax)
ax.set_title('Top 10 Products by Sales')
ax.set_xlabel('Product Description')
ax.set_ylabel('Total Sales')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Row counts for the ten most frequent countries other than the United Kingdom
df_cleaned.loc[df_cleaned['Country'] != "United Kingdom", 'Country'].value_counts().nlargest(n=10)

In [None]:
# Total sales per country, excluding the United Kingdom.
# The UK rows are removed BEFORE taking the top 10; filtering afterwards would
# leave only nine bars whenever the UK itself ranks in the top 10.
non_uk_rows = df_cleaned[df_cleaned['Country'] != "United Kingdom"]
product_sales_country = (
    non_uk_rows
    .groupby('Country')['Sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

# Bar chart
plt.figure(figsize=(12, 6))
sns.barplot(data=product_sales_country, x='Country', y='Sales', palette='Set1')
plt.title("Distribution of Sales over the top 10 countries by sales outside the UK")
plt.xlabel('Countries')
plt.ylabel('Total Sales')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Revenue split: United Kingdom versus every other country, rounded to 2 decimals
is_uk = df_cleaned["Country"] == 'United Kingdom'
uk_total = df_cleaned.loc[is_uk, 'Sales'].sum().round(2)
other_total = df_cleaned.loc[~is_uk, 'Sales'].sum().round(2)
print("Total Sales for United Kingdom is", uk_total)
print("Total Sales for Other is", other_total)

In [None]:
# Monthly sales trend (units sold per calendar month)
df1 = df_cleaned.assign(InvoiceDate=pd.to_datetime(df_cleaned['InvoiceDate']))
df1 = df1.assign(Month=df1['InvoiceDate'].dt.to_period('M'))
monthly_sales = df1.groupby('Month')['Quantity'].sum()

fig, ax = plt.subplots(figsize=(12, 6))
monthly_sales.plot(marker='o', ax=ax)
ax.set_title('Monthly Sales Trend')
ax.set_xlabel('Month')
ax.set_ylabel('Total Quantity Sold')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# IQR-based outlier filter
def remove_outliers(df, column_name):
    """Return the rows of `df` whose `column_name` value lies inside the Tukey fences.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR (both inclusive), where Q1/Q3
    are the 25th/75th percentiles of the column. The input frame is not
    modified and the original index labels are kept on the result.
    """
    values = df[column_name]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread
    return df[values.between(low_fence, high_fence)]
# Filter UnitPrice outliers first, then Quantity outliers on what remains.
# NOTE(review): `df_cleaned` is rebound here, so re-running this cell trims the data again.
df_outlier_free_unit_price = remove_outliers(df_cleaned, column_name='UnitPrice')
df_cleaned = remove_outliers(df_outlier_free_unit_price, column_name='Quantity')

In [None]:
# Side-by-side histograms of the outlier-filtered columns
fig, (price_ax, quantity_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: UnitPrice after the UnitPrice filter only
price_ax.hist(df_outlier_free_unit_price['UnitPrice'], bins=10, color='blue', alpha=0.7)
price_ax.set_title('Histogram of UnitPrice')
price_ax.set_xlabel('UnitPrice')
price_ax.set_ylabel('Frequency')

# Right panel: Quantity after both filters
quantity_ax.hist(df_cleaned['Quantity'], bins=10, color='green', alpha=0.7)
quantity_ax.set_title('Histogram of Quantity')
quantity_ax.set_xlabel('Quantity')
quantity_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()



In [None]:
# Total sales per calendar day (one row per day, columns InvoiceDate and Sales)
daily_bins = pd.Grouper(key='InvoiceDate', freq='D')
df_aggregated = df_cleaned.groupby(daily_bins)['Sales'].sum().reset_index()

In [None]:

# Basic statistics
print("Basic Statistics:")
print(df_aggregated['Sales'].describe())

# Check for missing values
print("\nMissing Values:")
print(df_aggregated.isnull().sum())

# Keep days after 2010-12-31. The explicit copy makes `df_filtered` an
# independent frame, so columns can be added to it later without touching
# (or warning about) `df_aggregated`.
df_filtered = df_aggregated[df_aggregated['InvoiceDate'] > '2010-12-31'].copy()

# Time series plot
plt.figure(figsize=(10, 4))
plt.plot(df_filtered['InvoiceDate'], df_filtered['Sales'])
plt.title('Time Series Plot of Daily Sales')
plt.xlabel('Invoice Date')
plt.ylabel('Total Sales')
plt.show()

# Boxplot for seasonality and outliers.
# x/y are passed by keyword: recent seaborn releases only accept `data`
# positionally, so the positional form raises a TypeError there.
plt.figure(figsize=(10, 4))
sns.boxplot(x=df_filtered['InvoiceDate'].dt.month, y=df_filtered['Sales'])
plt.title('Monthly Sales Distribution')
plt.xlabel('Month')
plt.ylabel('Total Sales')
plt.show()

# Autocorrelation plot
plt.figure(figsize=(10, 4))
autocorrelation_plot(df_filtered['Sales'])
plt.title('Autocorrelation Plot')
plt.show()

In [None]:
# Histogram
plt.figure(figsize=(10, 4))
sns.histplot(df_filtered['Sales'], bins=30)
plt.title('Histogram of Daily Sales')
plt.xlabel('Total Price')
plt.ylabel('Frequency')
plt.show()

# Moving Average
# `assign` returns a new frame, so nothing is written into a slice of
# `df_aggregated` and the cell can be re-run safely.
df_filtered = df_filtered.assign(Moving_Avg=df_filtered['Sales'].rolling(window=7).mean())
plt.figure(figsize=(10, 4))
plt.plot(df_filtered['InvoiceDate'], df_filtered['Moving_Avg'])
plt.title('7-Day Moving Average of Daily Sales')
plt.xlabel('Invoice Date')
plt.ylabel('Total Price (7-Day Avg)')
plt.show()

# Seasonal Decomposition (seasonal_decompose is imported in the first cell)
result = seasonal_decompose(df_filtered['Sales'].fillna(0), period=30)  # Monthly seasonality
result.plot()
plt.show()

# Heatmap of total sales per (month, year)
df_filtered = df_filtered.assign(
    Year=df_filtered['InvoiceDate'].dt.year,
    Month=df_filtered['InvoiceDate'].dt.month,
)
pivot_table = df_filtered.pivot_table(values='Sales', index='Month', columns='Year', aggfunc='sum')
sns.heatmap(pivot_table, annot=True, fmt=".1f")
plt.title('Monthly Sales Heatmap')
plt.xlabel('Year')
plt.ylabel('Month')
plt.show()

# Lag Scatter Plot (lag_plot is imported in the first cell)
lag_plot(df_filtered['Sales'])
plt.title('Lag Scatter Plot')
plt.show()

## LSTM


In [None]:
# Function to build and evaluate the model
def build_and_evaluate_model(look_back, split_ratio):
    """Train an LSTM on sliding windows of daily sales and score it on a hold-out set.

    Reads the notebook-level `df_aggregated` frame and, as a side effect,
    (re)writes its 'Sales_scaled' column. Two figures are shown: actual vs
    predicted values for the first test window, and a forecast for the days
    that follow the end of the series.

    Parameters
    ----------
    look_back : int
        Number of past days fed to the model for each sample.
    split_ratio : float
        Fraction of the windows held out for testing.

    Returns
    -------
    tuple
        (mae, mse, rmse, r2) computed on the hold-out windows in original
        sales units.

    NOTE(review): the scaler is fit on the whole series and the (overlapping)
    windows are shuffled before splitting, so test windows share days with
    training windows. The reported metrics are therefore optimistic; consider
    a chronological split.
    """
    # Feature Scaling
    scaler = MinMaxScaler()
    df_aggregated['Sales_scaled'] = scaler.fit_transform(df_aggregated[['Sales']])
    series = df_aggregated['Sales_scaled'].values

    # Number of future time steps to forecast
    look_forward = 7

    # Data Sequencing: X[i] is `look_back` days, Y[i] the `look_forward` days after them
    n_windows = len(series) - look_back - look_forward + 1
    X = np.array([series[i:i + look_back] for i in range(n_windows)])
    Y = np.array([series[i + look_back:i + look_back + look_forward] for i in range(n_windows)])

    # Train-Test Split (honours split_ratio; it used to be fixed at 0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=split_ratio, random_state=42)

    # Each window is fed as ONE time step carrying `look_back` features
    X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

    # Model Building
    model = Sequential()
    model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dense(look_forward))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Model Training
    model.fit(X_train, Y_train, epochs=50, batch_size=1, verbose=1)

    # Prediction
    Y_pred = model.predict(X_test)

    # Inverse scaling. The scaler was fit on a single column, so values are
    # flattened to one column for the call and reshaped back afterwards.
    Y_test_inv = scaler.inverse_transform(Y_test.reshape(-1, 1)).reshape(Y_test.shape)
    Y_pred_inv = scaler.inverse_transform(Y_pred.reshape(-1, 1)).reshape(Y_pred.shape)

    # Evaluation
    mae = mean_absolute_error(Y_test_inv, Y_pred_inv)
    mse = mean_squared_error(Y_test_inv, Y_pred_inv)
    rmse = math.sqrt(mse)
    r2 = r2_score(Y_test_inv, Y_pred_inv)

    plt.figure(figsize=(12, 6))
    plt.plot(Y_test_inv[0], label='Actual', marker='o')
    plt.plot(Y_pred_inv[0], label='Predicted', marker='x')
    plt.title('Actual vs Predicted Sales')
    plt.ylabel('Sales')
    plt.xlabel('Future Time Steps')
    plt.legend()
    plt.show()

    # Forecast beyond the data: feed the LAST `look_back` observed days.
    # (X[-1] stops `look_forward` days before the end of the series, so using
    # it would merely re-predict the final, already observed, week.)
    last_known_seq = series[-look_back:].reshape(1, 1, look_back)
    future_sales_scaled = model.predict(last_known_seq)
    future_sales = scaler.inverse_transform(
        future_sales_scaled.reshape(-1, 1)
    ).reshape(future_sales_scaled.shape)

    # Plotting the forecasted sales
    plt.figure(figsize=(12, 6))
    plt.plot(future_sales[0], label='Forecasted Sales', marker='s')
    plt.title('Forecasted Sales for Next ' + str(look_forward) + ' Days')
    plt.ylabel('Sales')
    plt.xlabel('Future Time Steps')
    plt.legend()
    plt.show()

    return mae, mse, rmse, r2

# Look-back lengths and test-set fractions to try
look_backs = [7]
split_ratios = [0.4, 0.3, 0.2]

# To store the best results (the configuration with the lowest MAE wins)
best_mae = float('inf')
best_config = {}

# Loop through all combinations
for look_back in look_backs:
    for split_ratio in split_ratios:
        mae, mse, rmse, r2 = build_and_evaluate_model(look_back, split_ratio)
        # Report the split as train%-test% (both sides in percent)
        train_pct = (1 - split_ratio) * 100
        test_pct = split_ratio * 100
        print(f"Look Back: {look_back}, Split Ratio: {train_pct:.0f}-{test_pct:.0f}, MAE: {mae}, MSE: {mse}, RMSE: {rmse}, R2: {r2}")

        if mae < best_mae:
            best_mae = mae
            best_config = {'look_back': look_back, 'split_ratio': split_ratio, 'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

print("\nBest Configuration:")
print(best_config)


## Random Forest, Linear Regression and Support Vector Regressor

In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Number of future time steps to forecast
look_forward = 7

# Different algorithms to try (each is wrapped in MultiOutputRegressor below,
# which fits one regressor per forecast step)
algorithms = {'LinearRegression': LinearRegression(),
              'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
              'SVR': SVR()}

# Different look_backs and split_ratios to try
look_backs = [7]
split_ratios = [0.4, 0.3, 0.2]

# To store the best results (lowest MAE wins)
best_mae = float('inf')
best_config = {}

# Data Sequencing: the windows depend only on look_back, so build them once
# per look_back instead of once per (algorithm, split) combination.
sales_values = df_aggregated['Sales'].values
windows_by_look_back = {}
for look_back in look_backs:
    n_windows = len(sales_values) - look_back - look_forward + 1
    windows_by_look_back[look_back] = (
        np.array([sales_values[i:i + look_back] for i in range(n_windows)]),
        np.array([sales_values[i + look_back:i + look_back + look_forward] for i in range(n_windows)]),
    )

# Loop through all combinations
# NOTE(review): windows overlap and are shuffled before splitting, so test
# windows share days with training windows; the metrics are optimistic.
for algo_name, algo in algorithms.items():
    for look_back in look_backs:
        X, Y = windows_by_look_back[look_back]
        for split_ratio in split_ratios:
            # Train-Test Split
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=split_ratio, random_state=42)

            # Model Building and Training
            model = MultiOutputRegressor(algo)
            model.fit(X_train, Y_train)

            # Prediction
            Y_pred = model.predict(X_test)

            # Evaluation
            mae = mean_absolute_error(Y_test, Y_pred)
            r2 = r2_score(Y_test, Y_pred, multioutput='variance_weighted')

            # Report the split as train%-test% (both sides in percent)
            train_pct = (1 - split_ratio) * 100
            test_pct = split_ratio * 100
            print(f"Algorithm: {algo_name}, Look Back: {look_back}, Split Ratio: {train_pct:.0f}-{test_pct:.0f}, MAE: {mae}, R2: {r2}")

            if mae < best_mae:
                best_mae = mae
                best_config = {'algorithm': algo_name, 'look_back': look_back, 'split_ratio': split_ratio, 'MAE': mae, 'R2': r2}

print("\nBest Configuration:")
print(best_config)


## CNN

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
import matplotlib.pyplot as plt
import numpy as np
import math

def build_and_evaluate_cnn_model(df_aggregated, look_back, split_ratio):
    """Train a 1-D CNN on sliding windows of daily sales and score it on a hold-out set.

    As a side effect, (re)writes the 'Sales_scaled' column of the frame that is
    passed in. Two figures are shown: actual vs predicted values for the first
    test window, and a forecast for the days that follow the end of the series.

    Parameters
    ----------
    df_aggregated : pandas.DataFrame
        Frame with a 'Sales' column, one row per time step.
    look_back : int
        Number of past steps fed to the model for each sample.
    split_ratio : float
        Fraction of the windows held out for testing.

    Returns
    -------
    dict
        Keys 'MAE', 'MSE', 'RMSE' and 'R2', computed on the hold-out windows
        in original sales units.

    NOTE(review): the scaler is fit on the whole series and the (overlapping)
    windows are shuffled before splitting, so the metrics are optimistic.
    """
    # Feature Scaling
    scaler = MinMaxScaler()
    df_aggregated['Sales_scaled'] = scaler.fit_transform(df_aggregated[['Sales']])
    series = df_aggregated['Sales_scaled'].values

    # Number of future time steps to forecast
    look_forward = 7

    # Data Sequencing: X[i] is `look_back` steps, Y[i] the `look_forward` steps after them
    n_windows = len(series) - look_back - look_forward + 1
    X = np.array([series[i:i + look_back] for i in range(n_windows)])
    Y = np.array([series[i + look_back:i + look_back + look_forward] for i in range(n_windows)])

    # Train-Test Split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=split_ratio, random_state=42)

    # Reshape input to be [samples, time steps, features]
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    # Model Building
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(50, activation='relu'))
    model.add(Dense(look_forward))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Model Training
    model.fit(X_train, Y_train, epochs=50, batch_size=1, verbose=1)

    # Prediction
    Y_pred = model.predict(X_test)

    # Inverse scaling. The scaler was fit on a single column, so values are
    # flattened to one column for the call and reshaped back afterwards.
    Y_test_inv = scaler.inverse_transform(Y_test.reshape(-1, 1)).reshape(Y_test.shape)
    Y_pred_inv = scaler.inverse_transform(Y_pred.reshape(-1, 1)).reshape(Y_pred.shape)

    # Evaluation
    metrics = {}
    metrics['MAE'] = mean_absolute_error(Y_test_inv, Y_pred_inv)
    metrics['MSE'] = mean_squared_error(Y_test_inv, Y_pred_inv)
    metrics['RMSE'] = math.sqrt(metrics['MSE'])
    metrics['R2'] = r2_score(Y_test_inv, Y_pred_inv)

    plt.figure(figsize=(12, 6))
    plt.plot(Y_test_inv[0], label='Actual', marker='o')
    plt.plot(Y_pred_inv[0], label='Predicted', marker='x')
    plt.title('Actual vs Predicted Sales')
    plt.ylabel('Sales')
    plt.xlabel('Future Time Steps')
    plt.legend()
    plt.show()

    # Forecast beyond the data: feed the LAST `look_back` observed steps.
    # (X[-1] stops `look_forward` steps before the end of the series, so using
    # it would merely re-predict the final, already observed, week.)
    last_known_seq = series[-look_back:].reshape(1, look_back, 1)

    future_sales_scaled = model.predict(last_known_seq)
    future_sales = scaler.inverse_transform(
        future_sales_scaled.reshape(-1, 1)
    ).reshape(future_sales_scaled.shape)

    # Plotting the forecasted sales
    plt.figure(figsize=(12, 6))
    plt.plot(future_sales[0], label='Forecasted Sales', marker='s')
    plt.title('Forecasted Sales for Next ' + str(look_forward) + ' Days')
    plt.ylabel('Sales')
    plt.xlabel('Future Time Steps')
    plt.legend()
    plt.show()

    return metrics


# Look-back lengths and test-set fractions to try
look_backs = [7]
split_ratios = [0.4, 0.3, 0.2]

# To store the best results
best_metrics = {'MAE': float('inf'), 'MSE': float('inf'), 'RMSE': float('inf'), 'R2': float('-inf')}
best_config = {}

# Loop through all combinations
for look_back in look_backs:
    for split_ratio in split_ratios:
        metrics = build_and_evaluate_cnn_model(df_aggregated, look_back, split_ratio)

        # Report the split as train%-test% (both sides in percent)
        train_pct = (1 - split_ratio) * 100
        test_pct = split_ratio * 100
        print(f"Look Back: {look_back}, Split Ratio: {train_pct:.0f}-{test_pct:.0f}, Metrics: {metrics}")

        # Rank on a single metric (lowest MAE). Requiring MAE, MSE and R2 to
        # all improve at once makes the winner depend on evaluation order.
        if metrics['MAE'] < best_metrics['MAE']:
            best_metrics = metrics
            best_config = {'look_back': look_back, 'split_ratio': split_ratio}

print("\nBest Configuration and Metrics:")
print("Configuration:", best_config)
print("Metrics:", best_metrics)


## CNN and Random Forest (Hybrid)

In [None]:
# Importing necessary modules
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
import numpy as np
import math

# Build and evaluate a hybrid CNN-Random Forest model
def build_and_evaluate_hybrid_model(look_back, split_ratio):
    """Train a 1-D CNN, then fit a random forest on the CNN's penultimate-layer output.

    Reads the notebook-level `df_aggregated` frame and, as a side effect,
    (re)writes its 'Sales_scaled' column.

    Parameters
    ----------
    look_back : int
        Number of past days fed to the CNN for each sample.
    split_ratio : float
        Fraction of the windows held out for testing.

    Returns
    -------
    dict
        Keys 'MAE', 'MSE', 'RMSE' and 'R2' for the random-forest predictions
        on the hold-out windows, in original sales units.

    NOTE(review): a second function with this exact name is defined in another
    cell of this notebook; whichever cell ran last is the one in effect.
    NOTE(review): the scaler is fit on the whole series and the (overlapping)
    windows are shuffled before splitting, so the metrics are optimistic.
    """
    # Feature Scaling
    scaler = MinMaxScaler()
    df_aggregated['Sales_scaled'] = scaler.fit_transform(df_aggregated[['Sales']])
    series = df_aggregated['Sales_scaled'].values

    # Number of future time steps to forecast
    look_forward = 7

    # Data Sequencing: X[i] is `look_back` days, Y[i] the `look_forward` days after them
    n_windows = len(series) - look_back - look_forward + 1
    X = np.array([series[i:i + look_back] for i in range(n_windows)])
    Y = np.array([series[i + look_back:i + look_back + look_forward] for i in range(n_windows)])

    # Train-Test Split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=split_ratio, random_state=42)

    # Reshape input to be [samples, time steps, features]
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    # CNN Model Building
    cnn_model = Sequential()
    cnn_model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
    cnn_model.add(MaxPooling1D(pool_size=2))
    cnn_model.add(Flatten())
    cnn_model.add(Dense(50, activation='relu'))
    cnn_model.add(Dense(look_forward))  # one output per forecast step
    cnn_model.compile(optimizer='adam', loss='mean_squared_error')

    # CNN Model Training
    cnn_model.fit(X_train, Y_train, epochs=50, batch_size=1, verbose=1)

    # Feature extraction with CNN: reuse every trained layer except the output layer
    feature_model = Sequential(cnn_model.layers[:-1])
    X_train_transformed = feature_model.predict(X_train)
    X_test_transformed = feature_model.predict(X_test)

    # Random Forest Model Building
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train_transformed, Y_train)

    # Prediction with Random Forest
    Y_pred = rf_model.predict(X_test_transformed)

    # Inverse scaling. The scaler was fit on a single column, so values are
    # flattened to one column for the call and reshaped back afterwards.
    Y_test_inv = scaler.inverse_transform(Y_test.reshape(-1, 1)).reshape(Y_test.shape)
    Y_pred_inv = scaler.inverse_transform(Y_pred.reshape(-1, 1)).reshape(Y_pred.shape)

    # Evaluation
    metrics = {}
    metrics['MAE'] = mean_absolute_error(Y_test_inv, Y_pred_inv)
    metrics['MSE'] = mean_squared_error(Y_test_inv, Y_pred_inv)
    metrics['RMSE'] = math.sqrt(metrics['MSE'])
    metrics['R2'] = r2_score(Y_test_inv, Y_pred_inv)

    return metrics

# Look-back lengths and test-set fractions to try
look_backs = [7]
split_ratios = [0.4, 0.3, 0.2]

# To store the best results
best_metrics = {'MAE': float('inf'), 'MSE': float('inf'), 'RMSE': float('inf'), 'R2': float('-inf')}
best_config = {}

# Loop through all combinations
for look_back in look_backs:
    for split_ratio in split_ratios:
        metrics = build_and_evaluate_hybrid_model(look_back, split_ratio)

        # Report the split as train%-test% (both sides in percent)
        train_pct = (1 - split_ratio) * 100
        test_pct = split_ratio * 100
        print(f"Look Back: {look_back}, Split Ratio: {train_pct:.0f}-{test_pct:.0f}, Metrics: {metrics}")

        # Rank on a single metric (lowest MAE). Requiring MAE, MSE and R2 to
        # all improve at once makes the winner depend on evaluation order.
        if metrics['MAE'] < best_metrics['MAE']:
            best_metrics = metrics
            best_config = {'look_back': look_back, 'split_ratio': split_ratio}

print("\nBest Configuration and Metrics:")
print("Configuration:", best_config)
print("Metrics:", best_metrics)


## LSTM and Random Forest (Hybrid)

In [None]:
# Importing necessary modules
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import numpy as np
import math

# Build and evaluate a hybrid LSTM-Random Forest model
def build_and_evaluate_hybrid_model(look_back, split_ratio):
    """Train an LSTM, then fit a random forest on the LSTM layer's output.

    Reads the notebook-level `df_aggregated` frame and, as a side effect,
    (re)writes its 'Sales_scaled' column.

    Parameters
    ----------
    look_back : int
        Number of past days fed to the LSTM for each sample.
    split_ratio : float
        Fraction of the windows held out for testing.

    Returns
    -------
    dict
        Keys 'MAE', 'MSE', 'RMSE' and 'R2' for the random-forest predictions
        on the hold-out windows, in original sales units.

    NOTE(review): a second function with this exact name is defined in another
    cell of this notebook; whichever cell ran last is the one in effect.
    NOTE(review): the scaler is fit on the whole series and the (overlapping)
    windows are shuffled before splitting, so the metrics are optimistic.
    """
    # Feature Scaling
    scaler = MinMaxScaler()
    df_aggregated['Sales_scaled'] = scaler.fit_transform(df_aggregated[['Sales']])
    series = df_aggregated['Sales_scaled'].values

    # Number of future time steps to forecast
    look_forward = 7

    # Data Sequencing: X[i] is `look_back` days, Y[i] the `look_forward` days after them
    n_windows = len(series) - look_back - look_forward + 1
    X = np.array([series[i:i + look_back] for i in range(n_windows)])
    Y = np.array([series[i + look_back:i + look_back + look_forward] for i in range(n_windows)])

    # Train-Test Split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=split_ratio, random_state=42)

    # Reshape input to be [samples, time steps, features]
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    # LSTM Model Building
    lstm_model = Sequential()
    lstm_model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
    lstm_model.add(Dense(look_forward))
    lstm_model.compile(optimizer='adam', loss='mean_squared_error')

    # LSTM Model Training
    lstm_model.fit(X_train, Y_train, epochs=50, batch_size=1, verbose=1)

    # Feature extraction with LSTM: reuse every trained layer except the output layer
    feature_model = Sequential(lstm_model.layers[:-1])
    X_train_transformed = feature_model.predict(X_train)
    X_test_transformed = feature_model.predict(X_test)

    # Random Forest Model Building
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train_transformed, Y_train)

    # Prediction with Random Forest
    Y_pred = rf_model.predict(X_test_transformed)

    # Inverse scaling. The scaler was fit on a single column, so values are
    # flattened to one column for the call and reshaped back afterwards.
    Y_test_inv = scaler.inverse_transform(Y_test.reshape(-1, 1)).reshape(Y_test.shape)
    Y_pred_inv = scaler.inverse_transform(Y_pred.reshape(-1, 1)).reshape(Y_pred.shape)

    # Evaluation
    metrics = {}
    metrics['MAE'] = mean_absolute_error(Y_test_inv, Y_pred_inv)
    metrics['MSE'] = mean_squared_error(Y_test_inv, Y_pred_inv)
    metrics['RMSE'] = math.sqrt(metrics['MSE'])
    metrics['R2'] = r2_score(Y_test_inv, Y_pred_inv)

    return metrics

# Look-back lengths and test-set fractions to try
look_backs = [7]
split_ratios = [0.4, 0.3, 0.2]

# To store the best results
best_metrics = {'MAE': float('inf'), 'MSE': float('inf'), 'RMSE': float('inf'), 'R2': float('-inf')}
best_config = {}

# Loop through all combinations
for look_back in look_backs:
    for split_ratio in split_ratios:
        metrics = build_and_evaluate_hybrid_model(look_back, split_ratio)

        # Report the split as train%-test% (both sides in percent)
        train_pct = (1 - split_ratio) * 100
        test_pct = split_ratio * 100
        print(f"Look Back: {look_back}, Split Ratio: {train_pct:.0f}-{test_pct:.0f}, Metrics: {metrics}")

        # Rank on a single metric (lowest MAE). Requiring MAE, MSE and R2 to
        # all improve at once makes the winner depend on evaluation order.
        if metrics['MAE'] < best_metrics['MAE']:
            best_metrics = metrics
            best_config = {'look_back': look_back, 'split_ratio': split_ratio}

print("\nBest Configuration and Metrics:")
print("Configuration:", best_config)
print("Metrics:", best_metrics)


## Gated Recurrent Unit

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
import numpy as np
import matplotlib.pyplot as plt
import math

def build_and_evaluate_gru_model(df_aggregated, look_back, split_ratio):
    """Train a GRU on sliding windows of daily sales and score it on a hold-out set.

    As a side effect, (re)writes the 'Sales_scaled' column of the frame that is
    passed in. Two figures are shown: actual vs predicted values for the first
    test window, and a forecast for the days that follow the end of the series.

    Parameters
    ----------
    df_aggregated : pandas.DataFrame
        Frame with a 'Sales' column, one row per time step.
    look_back : int
        Number of past steps fed to the model for each sample.
    split_ratio : float
        Fraction of the windows held out for testing.

    Returns
    -------
    dict
        Keys 'MAE', 'MSE', 'RMSE' and 'R2', computed on the hold-out windows
        in original sales units.

    NOTE(review): the scaler is fit on the whole series and the (overlapping)
    windows are shuffled before splitting, so the metrics are optimistic.
    """
    # Feature Scaling
    scaler = MinMaxScaler()
    df_aggregated['Sales_scaled'] = scaler.fit_transform(df_aggregated[['Sales']])
    series = df_aggregated['Sales_scaled'].values

    # Number of future time steps to forecast
    look_forward = 7

    # Data Sequencing on the raw array (plain ndarray windows, not Series slices)
    n_windows = len(series) - look_back - look_forward + 1
    X = np.array([series[i:i + look_back] for i in range(n_windows)])
    Y = np.array([series[i + look_back:i + look_back + look_forward] for i in range(n_windows)])

    # Train-Test Split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=split_ratio, random_state=42)

    # Reshape input to [samples, time steps, features]
    X_train = X_train.reshape((X_train.shape[0], look_back, 1))
    X_test = X_test.reshape((X_test.shape[0], look_back, 1))

    # Build GRU model
    model = Sequential()
    model.add(GRU(50, activation='relu', input_shape=(look_back, 1)))
    model.add(Dense(look_forward))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(X_train, Y_train, epochs=50, batch_size=1, verbose=1)

    # Predict
    Y_pred = model.predict(X_test)

    # Inverse transform. The scaler was fit on a single column, so values are
    # flattened to one column for the call and reshaped back afterwards.
    Y_pred_inv = scaler.inverse_transform(Y_pred.reshape(-1, 1)).reshape(Y_pred.shape)
    Y_test_inv = scaler.inverse_transform(Y_test.reshape(-1, 1)).reshape(Y_test.shape)

    # Evaluate (MSE is computed once and reused for the RMSE)
    mse = mean_squared_error(Y_test_inv, Y_pred_inv)
    metrics = {
        'MAE': mean_absolute_error(Y_test_inv, Y_pred_inv),
        'MSE': mse,
        'RMSE': math.sqrt(mse),
        'R2': r2_score(Y_test_inv, Y_pred_inv)
    }

    plt.figure(figsize=(12, 6))
    plt.plot(Y_test_inv[0], label='Actual', marker='o')
    plt.plot(Y_pred_inv[0], label='Predicted', marker='x')
    plt.title('Actual vs Predicted Sales')
    plt.ylabel('Sales')
    plt.xlabel('Future Time Steps')
    plt.legend()
    plt.show()

    # Forecast beyond the data: feed the LAST `look_back` observed steps.
    # (X[-1] stops `look_forward` steps before the end of the series, so using
    # it would merely re-predict the final, already observed, week.)
    last_known_seq = series[-look_back:].reshape(1, look_back, 1)

    future_sales_scaled = model.predict(last_known_seq)
    future_sales = scaler.inverse_transform(
        future_sales_scaled.reshape(-1, 1)
    ).reshape(future_sales_scaled.shape)

    # Plotting the forecasted sales
    plt.figure(figsize=(12, 6))
    plt.plot(future_sales[0], label='Forecasted Sales', marker='s')
    plt.title('Forecasted Sales for Next ' + str(look_forward) + ' Days')
    plt.ylabel('Sales')
    plt.xlabel('Future Time Steps')
    plt.legend()
    plt.show()

    return metrics

# Look-back lengths and test-set fractions to try
look_backs = [7]
split_ratios = [0.4, 0.3, 0.2]

# Best result so far (highest R2 wins). R2 has no lower bound, so the search
# starts from -inf; a floor of -1 could reject every run. `best_config` is
# initialised so the summary below cannot fail with a NameError.
best_metrics = {'MAE': float('inf'), 'MSE': float('inf'), 'RMSE': float('inf'), 'R2': float('-inf')}
best_config = {}

for look_back in look_backs:
    for split_ratio in split_ratios:
        metrics = build_and_evaluate_gru_model(df_aggregated, look_back, split_ratio)
        print(f"Look Back: {look_back}, Split Ratio: {split_ratio}, Metrics: {metrics}")

        if metrics['R2'] > best_metrics['R2']:
            best_metrics = metrics
            best_config = {'look_back': look_back, 'split_ratio': split_ratio}

print("\nBest Configuration and Metrics:")
print("Configuration:", best_config)
print("Metrics:", best_metrics)
