##### Mount Google Drive

In [None]:
# Attach Google Drive to the Colab filesystem so the dataset CSVs stored
# under "My Drive" can be read through ordinary file paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

##### Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
# from statsmodels.tsa.stattools import adfuller
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from datetime import datetime

##### Load the data from Google Drive

In [None]:
# Locations of the three raw CSV files on the mounted drive.
csv_paths = {
    'train': '/content/drive/My Drive/rossmann-store-sales/train.csv',
    'test': '/content/drive/My Drive/rossmann-store-sales/test.csv',
    'store': '/content/drive/My Drive/rossmann-store-sales/store.csv',
}

# low_memory=False makes pandas parse each file in one pass, so every column
# gets a single inferred dtype instead of chunk-dependent mixed types.
train_df = pd.read_csv(csv_paths['train'], low_memory=False)
test_df = pd.read_csv(csv_paths['test'], low_memory=False)
store_df = pd.read_csv(csv_paths['store'], low_memory=False)

##### Merge train and test data with store data

In [None]:
# Left-join the per-store attributes onto every train/test row via `Store`;
# rows whose store has no match in store_df keep NaN in the store columns.
train_data = pd.merge(train_df, store_df, how='left', on='Store')
test_data = pd.merge(test_df, store_df, how='left', on='Store')

##### Drop unnecessary columns and Handle missing values

In [None]:
# Remove the columns that are not used as features, then replace every
# remaining missing value with 0.
train_data = train_data.drop(columns=["Customers", "PromoInterval"]).fillna(0)
test_data = test_data.drop(columns=["Id", "PromoInterval"]).fillna(0)

#### Preprocessing

##### Extract Features from Datetime Columns

In [None]:
def extract_date_features(df):
    """Expand the ``Date`` column into calendar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Date`` and with these columns added:
        ``Year``, ``Month``, ``Day``, ``WeekOfYear`` (ISO week number),
        ``DayOfWeek`` (0 = Monday ... 6 = Sunday), ``IsWeekend``,
        ``IsMonthStart`` and ``IsMonthEnd``. A ``DayOfWeek`` column that is
        already present is overwritten in its existing position.

    Notes
    -----
    The input frame is not modified. The previous version added the columns
    to, and dropped ``Date`` from, the caller's own object, which silently
    changed any other reference to that frame.
    """
    dates = pd.to_datetime(df['Date'])

    # drop() returns a new frame, so the assignments below never touch `df`.
    features = df.drop(columns='Date')
    features['Year'] = dates.dt.year
    features['Month'] = dates.dt.month
    features['Day'] = dates.dt.day
    features['WeekOfYear'] = dates.dt.isocalendar().week
    features['DayOfWeek'] = dates.dt.dayofweek
    features['IsWeekend'] = dates.dt.dayofweek >= 5  # Saturday (5) or Sunday (6)
    features['IsMonthStart'] = dates.dt.is_month_start
    features['IsMonthEnd'] = dates.dt.is_month_end
    return features

# Derive the calendar features for both splits (train first, then test).
train_data, test_data = map(extract_date_features, (train_data, test_data))

##### Encode categorical variables

In [None]:
# Integer codes for the three categorical columns, shared by both splits.
state_holiday_codes = {'0': 0, 'a': 1, 'b': 2, 'c': 3}
store_type_codes = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
assortment_codes = {'a': 1, 'b': 2, 'c': 3}

for frame in (train_data, test_data):
    # replace() leaves values that are not dictionary keys untouched before
    # the int cast, whereas map() turns unmapped values into NaN.
    frame['StateHoliday'] = frame['StateHoliday'].replace(state_holiday_codes).astype(int)
    frame['StoreType'] = frame['StoreType'].map(store_type_codes)
    frame['Assortment'] = frame['Assortment'].map(assortment_codes)

##### Scale the Data

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Separate the target from the predictors and fit the scaler on the training
# predictors only.
train_features = train_data.drop('Sales', axis=1)
train_labels = train_data['Sales']
scaled_train_features = scaler.fit_transform(train_features)

# Select the test columns by name, in the order the scaler was fitted on, so
# a missing, extra or re-ordered column cannot be scaled with another
# feature's statistics. Fail early with a clear message if one is absent.
missing_columns = train_features.columns.difference(test_data.columns)
if len(missing_columns) > 0:
    raise ValueError(f"test_data is missing feature columns: {list(missing_columns)}")
test_features = test_data[train_features.columns]
scaled_test_features = scaler.transform(test_features)

# Convert scaled features back to DataFrame for convenience
scaled_train_df = pd.DataFrame(scaled_train_features, columns=train_features.columns)
scaled_test_df = pd.DataFrame(scaled_test_features, columns=test_features.columns)

##### Building Models with Sklearn Pipelines

In [None]:
# Define and train a RandomForestRegressor inside a scikit-learn Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features. Scaling happens inside the pipeline, so the
# scaler is fitted only on the rows the model is trained on (nothing from the
# validation rows leaks into it), and the fitted pipeline accepts raw feature
# frames directly.
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=42
)

# Define the pipeline: preprocessing + model travel together, so the object
# that is later serialized is self-contained.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # n_jobs=-1 builds the trees on all available cores.
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Train the model
pipeline.fit(X_train, y_train)

# Validate the model
y_pred = pipeline.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Validation MSE: {mse}")

# Train the model on the full training set
pipeline.fit(train_features, train_labels)

# Predict on the test set, passing the columns in the training order
test_predictions = pipeline.predict(test_features[train_features.columns])

In [None]:
# Choose a loss function: report the mean absolute error on the validation
# split. `y_pred` holds the predictions made before the pipeline was refitted
# on the full training set.
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')

##### Post Prediction Analysis

In [None]:
# Feature Importance
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Pull the importances out of the fitted forest step and rank the features.
importances = pipeline.named_steps['model'].feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': train_features.columns, 'Importance': importances}
)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart, most important feature at the top
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=ax)
ax.set_title('Feature Importance')
plt.show()

In [None]:

# Confidence Interval Estimation

# Estimating confidence intervals using bootstrapping
from sklearn.base import clone
from sklearn.utils import resample

# Number of bootstrap samples.
# NOTE(review): with a single resample the 2.5th and 97.5th percentiles below
# both collapse onto that one prediction, so the "interval" has zero width.
# Raise this for a meaningful interval; each extra sample costs one full fit.
n_bootstraps = 1
bootstrap_preds = np.zeros((n_bootstraps, len(X_val)))

# Generate bootstrap samples, train, and predict
for i in range(n_bootstraps):
    X_train_bootstrap, y_train_bootstrap = resample(X_train, y_train, random_state=i)
    # Fit an unfitted copy rather than `pipeline` itself: refitting `pipeline`
    # here would replace the model trained on the full training set, and that
    # is the object serialized afterwards.
    bootstrap_model = clone(pipeline)
    bootstrap_model.fit(X_train_bootstrap, y_train_bootstrap)
    bootstrap_preds[i] = bootstrap_model.predict(X_val)

# Calculate the confidence intervals (per validation row, across resamples)
lower_percentile = 2.5
upper_percentile = 97.5
lower_bound = np.percentile(bootstrap_preds, lower_percentile, axis=0)
upper_bound = np.percentile(bootstrap_preds, upper_percentile, axis=0)

# Calculate the mean predictions
y_pred_mean = np.mean(bootstrap_preds, axis=0)

# Display the results; the frame takes its index from `y_val`, the arrays are
# matched to it by position.
results_df = pd.DataFrame({
    'Actual': y_val,
    'Predicted Mean': y_pred_mean,
    'Lower Bound': lower_bound,
    'Upper Bound': upper_bound
})

print(results_df.head())

In [None]:
# Serialize Models

import joblib
from datetime import datetime

# Stamp the artefact with the time of saving so repeated runs never
# overwrite one another. `timestamp` is reused when the LSTM model is saved.
saved_at = datetime.now()
timestamp = saved_at.strftime("%Y-%m-%d-%H-%M-%S")
model_filename = f'model_{timestamp}.pkl'

# Write the fitted pipeline to the current working directory
joblib.dump(value=pipeline, filename=model_filename)

print(f'Model saved as {model_filename}')

##### Building a Deep Learning Model with LSTM

In [None]:
# Isolate the Time Series Data

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# Recover the observation dates. `extract_date_features` removes the `Date`
# column from `train_data`, so indexing it directly raises KeyError; rebuild
# the dates from the Year/Month/Day features when the column is absent.
if 'Date' in train_data.columns:
    sales_dates = pd.to_datetime(train_data['Date'])
else:
    sales_dates = pd.to_datetime(
        train_data[['Year', 'Month', 'Day']].rename(columns=str.lower)
    )

# Build one observation per date, indexed by Date in chronological order.
# NOTE(review): train_data carries a `Store` column, so a date presumably
# occurs once per store; sorting such rows by date would interleave different
# stores and leave a non-unique index. Sales are therefore summed across all
# rows of a date (total daily sales) — confirm this is the intended target.
time_series_df = (
    train_data['Sales']
    .groupby(sales_dates.rename('Date'))
    .sum()
    .to_frame()
)

# Check if the time series is stationary
def check_stationarity(series):
    """Run an Augmented Dickey-Fuller test on ``series``.

    Prints the test statistic and the p-value, and returns ``True`` when the
    p-value is at most 0.05 (the series is then treated as stationary).
    """
    from statsmodels.tsa.stattools import adfuller

    adf_statistic, p_value = adfuller(series)[:2]
    print('ADF Statistic:', adf_statistic)
    print('p-value:', p_value)
    return p_value <= 0.05

# Check stationarity
is_stationary = check_stationarity(time_series_df['Sales'])

# Difference the data if not stationary.
# diff() leaves NaN in the first row. That row has to be removed from the
# frame itself: assigning `diff().dropna()` back to the column re-aligns on
# the index and puts the NaN straight back, which then flows into the plots,
# the scaler and the training windows.
if not is_stationary:
    time_series_df = (
        time_series_df
        .assign(Sales=time_series_df['Sales'].diff())
        .dropna(subset=['Sales'])
    )

# Inspect autocorrelation and partial autocorrelation of the series
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

sales_series = time_series_df['Sales']
plot_acf(sales_series)
plot_pacf(sales_series)
plt.show()

# Rescale the series into [-1, 1] as a single-column 2-D array.
# Note: this rebinds `scaler`, which earlier in the notebook referred to the
# StandardScaler used for the tabular features.
scaler = MinMaxScaler(feature_range=(-1, 1))
sales_column = sales_series.values.reshape(-1, 1)
scaled_data = scaler.fit_transform(sales_column)

# Create supervised learning data
def create_dataset(data, time_step=1):
    """Turn a series into sliding-window samples for next-step prediction.

    Parameters
    ----------
    data : array-like of shape (n_observations, n_columns)
        Only column 0 is used as the series.
    time_step : int, default 1
        Number of consecutive observations in each input window.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, time_step)
        ``X[i]`` is ``data[i:i + time_step, 0]``.
    y : numpy.ndarray of shape (n_samples,)
        ``y[i]`` is ``data[i + time_step, 0]``, the value right after window i.

    ``n_samples`` is ``len(data) - time_step``, so every observation that has
    a complete window before it is used as a target. The previous loop bound
    (``len(data) - time_step - 1``) dropped the final observation. When the
    series is too short for a single window, empty arrays with the shapes
    above are returned so that downstream reshaping still works.
    """
    data = np.asarray(data)
    n_samples = max(len(data) - time_step, 0)

    X = np.empty((n_samples, time_step), dtype=data.dtype)
    y = np.empty(n_samples, dtype=data.dtype)
    for i in range(n_samples):
        X[i] = data[i:i + time_step, 0]
        y[i] = data[i + time_step, 0]
    return X, y

# Each sample uses the previous `time_step` observations to predict the next.
time_step = 10
X, y = create_dataset(scaled_data, time_step)

# Reshape input to [samples, time steps, features] for LSTM
# (this comment used to be split over two lines, leaving bare text that is a
# SyntaxError when the cell runs)
X = X.reshape(X.shape[0], X.shape[1], 1)

In [None]:
# Build and Train the LSTM Model

# Two stacked 50-unit LSTM layers followed by a single-output dense layer.
# The first LSTM returns the full sequence so the second one can consume it.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(50, return_sequences=True, input_shape=(time_step, 1)),
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dense(1),
])

model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model (one sample per gradient step, 20 passes over the data)
model.fit(X, y, epochs=20, batch_size=1, verbose=2)

# Save the model; `timestamp` was set when the random-forest pipeline was
# serialized, and `model_filename` is rebound to the LSTM file here.
model_filename = f'lstm_model-{timestamp}.h5'
model.save(model_filename)
print(f'LSTM model saved as {model_filename}')

In [None]:
# Making Predictions

# Reload the network from the file written by the training step
loaded_model = tf.keras.models.load_model(model_filename)

# Use the most recent `time_step` scaled observations as one input window of
# shape (1, time_step, 1)
last_data = np.reshape(scaled_data[-time_step:], (1, time_step, 1))

# Forecast the next step and undo the [-1, 1] scaling.
# NOTE(review): if the series was differenced earlier (`is_stationary` was
# False), the scaler was fitted on differences, so this value is a predicted
# change rather than a sales level — confirm how it should be reported.
predicted_sales = scaler.inverse_transform(loaded_model.predict(last_data))
print(f'Predicted Sales: {predicted_sales[0][0]}')