In [3]:
import pandas as pd

# Load dataset into a pandas DataFrame
df = pd.read_csv('ewmatrainfile1.csv')  # Replace 'your_dataset.csv' with the actual file path

# Define lag features for 11 timesteps
lag_features = list(range(1, 11))  # Lag features for previous 10 timesteps (1 to 10)

# Perform any necessary preprocessing steps here
# For example, you might want to handle missing values, encode categorical variables, etc.


# Assuming df is your original DataFrame

# Sample 150,000 rows randomly from the DataFrame
train_df = df.sample(n=150000)



# Display the first few rows of the resulting DataFrame
print(train_df.head())

# Display basic information about the dataset

print("Columns in the dataset:", train_df.columns)
print("Data types of columns:\n", train_df.dtypes)
print("First few rows of the dataset:\n", train_df.head())



             id  week  center_id  meal_id  checkout_price  base_price  \
216989  1422385    72         26     1971          204.70      310.43   
262257  1385556    86        104     1993          143.59      144.59   
421910  1342353   135        106     1993          100.88      111.61   
84964   1031348    30         74     1993           95.12       95.12   
257535  1109119    85        146     2631           94.09      141.62   

        emailer_for_promotion  homepage_featured        EWMA  
216989                      0                  1  194.173783  
262257                      0                  1  905.273154  
421910                      0                  1  492.969260  
84964                       0                  0  243.842906  
257535                      1                  1  260.651358  
Columns in the dataset: Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'EWMA'],
      dtype='object'

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Dense, Dropout
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split


# Split the data into features (X) and target variable (y)
X = train_df.drop(columns=['EWMA'])
y = train_df['EWMA']

# Split the data into training, validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, shuffle=False)

# Normalize data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Reshape data for LSTM
num_timesteps = 10  # Number of timesteps
num_features = X_train_scaled.shape[1]  # Number of input features

def reshape_data(train_df):
    num_samples = train_df.shape[0] - num_timesteps + 1
    reshaped_data = np.zeros((num_samples, num_timesteps, num_features))
    for i in range(num_samples):
        reshaped_data[i] = train_df[i:i+num_timesteps]
    return reshaped_data

X_train_reshaped = reshape_data(X_train_scaled)
X_val_reshaped = reshape_data(X_val_scaled)
X_test_reshaped = reshape_data(X_test_scaled)

# Define LSTM model architecture with 3 layers
lstm_model = Sequential()
lstm_model.add(LSTM(units=64, input_shape=(num_timesteps, num_features), return_sequences=True))
lstm_model.add(Dropout(0.25))
lstm_model.add(LSTM(units=32, return_sequences=True))
lstm_model.add(Dropout(0.25))
lstm_model.add(LSTM(units=16, return_sequences=False))
lstm_model.add(Dropout(0.25))
lstm_model.add(Dense(units=1))

# Compile LSTM model
lstm_model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train LSTM model
lstm_model.fit(X_train_reshaped, y_train[:-num_timesteps+1], epochs=100, batch_size=128, validation_data=(X_val_reshaped, y_val[:-num_timesteps+1]), shuffle=False, verbose=1)

# Define BiLSTM model architecture
bilstm_model = Sequential()
bilstm_model.add(Bidirectional(LSTM(units=64, return_sequences=False), input_shape=(num_timesteps, num_features)))
bilstm_model.add(Dropout(0.25))
bilstm_model.add(Bidirectional(LSTM(units=32, return_sequences=False), input_shape=(num_timesteps, num_features)))
bilstm_model.add(Dropout(0.25))
bilstm_model.add(Dense(units=1))

# Compile BiLSTM model
bilstm_model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train BiLSTM model
bilstm_model.fit(X_train_reshaped, y_train[:-num_timesteps+1], epochs=100, batch_size=128, validation_data=(X_val_reshaped, y_val[:-num_timesteps+1]), shuffle=False, verbose=1)

# Get summary of LSTM model
print("LSTM Model Summary:")
print(lstm_model.summary())

# Get summary of BiLSTM model
print("BiLSTM Model Summary:")
print(bilstm_model.summary())



KeyboardInterrupt: 

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.metrics import r2_score

# Make predictions
lstm_predictions = lstm_model.predict(X_test_reshaped)
bilstm_predictions = bilstm_model.predict(X_test_reshaped)

# Calculate ensemble predictions (average of LSTM and BiLSTM predictions)
ensemble_predictions = (lstm_predictions + bilstm_predictions) / 2.0

# Calculate errors
rmse = np.sqrt(mean_squared_error(y_test[num_timesteps-1:], ensemble_predictions))
rmsle = np.sqrt(mean_squared_log_error(y_test[num_timesteps-1:], ensemble_predictions))
mae = mean_absolute_error(y_test[num_timesteps-1:], ensemble_predictions)
mape = np.mean(np.abs((y_test[num_timesteps-1:] - ensemble_predictions) / y_test[num_timesteps-1:]) * 100)
r_squared = r2_score(y_test[num_timesteps-1:], ensemble_predictions)

print("Ensemble RMSLE:", rmsle)
print("Ensemble RMSE:", rmse)
print("Ensemble MAE:", mae)
print("Ensemble MAPE:", mape)
print("R-squared:", r_squared)


In [None]:
import pickle

# Define the file path where you want to save the model
file_path = "ensemble_model.pkl"

# Save the ensemble model to the pickle file
with open(file_path, 'wb') as f:
    pickle.dump((lstm_model, bilstm_model), f)

print("Ensemble model saved to", file_path)
