# Predictions on a web interface

Building the Model for Web Application Deployment

We will now build the model using the reduced_df dataset, specifically for deployment within a web application.

In [1]:
import pandas as pd

reduced_df = pd.read_csv('reduced_df.csv')

reduced_df

Unnamed: 0.1,Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,...,Promo_3,Promo_4,Promo_5,Promo_6,Promo_7,Promo_8,Promo_9,Promo_10,Promo_11,Promo_12
0,199858,-0.883573,-1.501129,2015-02-02,-0.330889,1.013442,1,1,0,0,...,0,1,0,0,1,0,0,1,0,0
1,201314,0.175734,1.502791,2015-02-01,-1.582625,-1.572631,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,278203,0.818774,-1.501129,2014-11-17,0.076252,-0.188245,1,0,0,0,...,1,0,0,1,0,0,1,0,0,1
3,467745,-1.017152,0.000831,2014-05-08,-0.012184,-0.489311,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0
4,306055,0.067007,1.002138,2014-10-18,0.600752,1.594987,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508599,167886,0.244076,-1.000476,2015-03-03,0.881635,0.210602,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0
508600,815306,1.455601,-1.501129,2013-07-01,2.324566,2.248582,1,1,0,0,...,0,1,0,0,1,0,0,1,0,0
508601,843056,1.067292,0.000831,2013-06-06,-0.367320,-0.031280,1,1,0,0,...,0,1,0,0,1,0,0,1,0,0
508602,37174,-0.554287,1.502791,2015-06-28,-1.582625,-1.572631,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1


In [2]:
from sklearn.model_selection import train_test_split

# Defining features and target
X = reduced_df.drop(['Sales', 'Date', 'StoreType', 'Assortment', 'PromoInterval', 'StateHoliday'], axis=1)
y = reduced_df['Sales']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display shapes of the split data
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (406883, 30)
X_test shape: (101721, 30)
y_train shape: (406883,)
y_test shape: (101721,)


The dataset has been divided into training and testing sets, comprising 406,883 samples for training and 101,721 samples for testing

In [3]:
# Now again, let's create a pipeline that includes the Random Forest Regressor. 
# We'll train the model using the training data and then evaluate its performance on the test set. 

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Creating the pipeline with a Random Forest Regressor
pipeline = Pipeline([
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Training the model
pipeline.fit(X_train, y_train)

# Predicting on the test set
y_pred = pipeline.predict(X_test)


#2.3 Choosing a loss function (Mean Squared Error (MSE))
from sklearn.metrics import mean_squared_error, r2_score

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

(0.014368724626788912, 0.9856351692992715)

MSE close to 0 and an R-squared value close to 1 together indicate that the model is performing very well, with high predictive accuracy and a strong fit to the data.

Serialize the model

In [4]:
import joblib
from datetime import datetime

# Save the model
model = pipeline
timestamp = datetime.now().strftime("%m-%d-%Y-%H-%M-%S-00")
filename = f"Random_forest_model_{timestamp}.pkl"

with open(filename, 'wb') as file:
    joblib.dump(model, file)

print(f"Model saved as {filename}")

Model saved as Random_forest_model_11-14-2024-14-16-52-00.pkl


Load the model

In [1]:
import joblib

# Load the model from the file
with open('Random_forest_model_11-14-2024-14-16-52-00.pkl', 'rb') as file:
    loaded_model = joblib.load(file)

# loaded_model is now ready to use

# Check if the model is loaded correctly
print(type(loaded_model))


<class 'sklearn.pipeline.Pipeline'>


In [2]:
loaded_model