## In-Class Hotel Demand Room Nights Prediction

### Feature Engineering, Pipeline Development, & Model Training with Scikit-Learn

<b>Ryan Paul Lafler, M.Sc.</b>

In [1]:
import pandas as pd

In [2]:
# Read in Hotel Bookings CSV
bookings_df = pd.read_csv(
    "hotel_bookings.csv"
)
bookings_df.head(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
bookings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

### Removing Missing Values from Certain Columns

In [4]:
# Remove missing values in specific columns:
no_na_df = bookings_df.dropna(
    subset=[
        "arrival_date_year", "arrival_date_month", "arrival_date_day_of_month",
        "reservation_status_date", "country", "children"
    ]
)
no_na_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118898 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           118898 non-null  object 
 1   is_canceled                     118898 non-null  int64  
 2   lead_time                       118898 non-null  int64  
 3   arrival_date_year               118898 non-null  int64  
 4   arrival_date_month              118898 non-null  object 
 5   arrival_date_week_number        118898 non-null  int64  
 6   arrival_date_day_of_month       118898 non-null  int64  
 7   stays_in_weekend_nights         118898 non-null  int64  
 8   stays_in_week_nights            118898 non-null  int64  
 9   adults                          118898 non-null  int64  
 10  children                        118898 non-null  float64
 11  babies                          118898 non-null  int64  
 12  meal                 

In [5]:
%%time
# ##################################### #
# CLASS: TARGET FEATURE EXTRACTION
# ##################################### #

class Booking_Preprocessing :

    def __init__(self, df) :
        self.df = df

    def preprocess_df(self) :
        no_na_df = bookings_df.dropna(
            subset=[
                "arrival_date_year", "arrival_date_month", "arrival_date_day_of_month",
                "reservation_status_date", "country", "children"
            ]
        )

        # Filter DataFrame to KEEP ONLY People that did *NOT* Cancel:
        no_na_df = no_na_df.loc[no_na_df["is_canceled"] == 0]  # Reservation did NOT cancel

        no_na_df = no_na_df.drop(
            columns=[
                "is_canceled", "company", "agent", "reservation_status", "assigned_room_type",
            ],
        )
        self.no_na_df = no_na_df
        return no_na_df

    def number_of_nights(self) :
        # Check In Time Creation
        check_in_time = self.no_na_df["arrival_date_year"].astype(str) + "-" + \
            self.no_na_df["arrival_date_month"].astype(str) + "-" + \
            self.no_na_df["arrival_date_day_of_month"].astype(str)
        
        # Conversion from String --> DateTime
        check_in_time = pd.to_datetime(
            check_in_time,
            format="%Y-%B-%d",
            errors="coerce",
            exact=False
        )

        # Check-Out Time:
        check_out_time = pd.to_datetime(
            self.no_na_df["reservation_status_date"],
            format="%Y-%m-%d",
            exact=False,
            errors="coerce",
        )

        # Engineer our Length of Stay (Nights at a Hotel):
        num_nights = check_out_time - check_in_time
        num_nights = num_nights.dt.days  # Get the length of stay in days
        
        return num_nights


# Instantiation (Creating an Instance of the Class)
booking_process = Booking_Preprocessing(df=bookings_df)
no_na_df = booking_process.preprocess_df()
no_na_df = no_na_df.drop(
    columns=["reservation_status_date", "arrival_date_day_of_month"]
)
y = booking_process.number_of_nights()  # Response (target) feature

y.head(20)

CPU times: user 113 ms, sys: 9.22 ms, total: 122 ms
Wall time: 123 ms


0     0
1     0
2     1
3     1
4     2
5     2
6     2
7     2
11    4
12    4
13    4
14    4
15    4
16    4
17    1
18    1
19    4
20    5
21    6
22    6
dtype: int64

### Create Training & Testing Partitions

In [6]:
from sklearn.model_selection import train_test_split

# Designating Set of Predictors and the Response Variable
X, y = no_na_df, y

# Training/ Testing Split:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    shuffle=True, random_state=777,  # Setting a random seed for replication
    test_size=0.15  # 15% of data reserved for testing set
)

X_test

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
22921,Resort Hotel,5,2016,April,15,0,1,2,0.0,0,...,0,0,A,0,No Deposit,0,Transient,48.00,0,0
29901,Resort Hotel,45,2016,November,45,2,2,2,0.0,0,...,0,0,A,0,No Deposit,0,Transient,52.40,0,0
90491,City Hotel,121,2016,June,23,2,2,2,0.0,0,...,0,0,D,0,No Deposit,0,Transient,90.95,0,0
43720,City Hotel,34,2015,September,39,0,3,2,0.0,0,...,0,0,A,0,No Deposit,33,Transient-Party,224.67,0,0
118426,City Hotel,186,2017,August,33,0,3,2,0.0,0,...,0,0,A,0,No Deposit,0,Transient,89.76,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106712,City Hotel,8,2017,February,9,1,2,2,0.0,0,...,0,0,A,0,No Deposit,0,Transient,117.00,0,0
19952,Resort Hotel,2,2016,January,3,1,0,1,0.0,0,...,0,0,A,0,No Deposit,0,Transient,25.00,0,0
29870,Resort Hotel,11,2016,November,45,2,5,2,0.0,0,...,0,0,E,2,No Deposit,0,Transient,97.00,0,1
36628,Resort Hotel,17,2017,May,21,0,1,2,0.0,0,...,0,0,A,0,No Deposit,0,Transient,85.00,0,2


### Aggregating Small Categories into Useful Categories

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class Aggregate_Categories(BaseEstimator, TransformerMixin) :

    # Initialization Method
    def __init__(self, columns, prop=0.005, value="other") :
        # Class remembers these properties when called
        self.columns = columns
        self.value = value
        self.prop = prop


    # Fit Method for Scikit-Learn
    def fit(self, X, y=None) :
        return self


    # Transform Method for Scikit-Learn
    def transform(self, X, y=None) :
        column_names = X.columns.values.tolist()  # Retrieve all column names & return them as a LIST

        # Transforming small categories into a larger, single category:
        for column in column_names:
            X[column] = X[column].astype(str) ## Coercion to String
            groups = X[column].value_counts() / X[column].value_counts().sum()  # Calculate relative frequency
            # DataFrame Series --> Column of the DataFrame
            small_groups = groups[groups < self.prop]  # Relative frequency LESS THAN 0.5%
            small_groups = small_groups.index.tolist()  # Convert category names to a list
            X[column] = pd.Categorical(
                X[column].replace(
                    to_replace=small_groups,
                    value=self.value,
                )
            )
        return X  # Return categorical column with aggregated groups

        
# Create categorical features:
cat_features = X.select_dtypes(include=["object"])
cat_features["arrival_date_year"] = X["arrival_date_year"]
cat_features["is_repeated_guest"] = X["is_repeated_guest"]
cat_features["previous_cancellations"] = X["previous_cancellations"]
cat_features["previous_bookings_not_canceled"] = X["previous_bookings_not_canceled"]

cat_features.head(5)

# Categorical Pipeline:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

cat_pipe = Pipeline(
    steps=[
        ("aggregation-stage", Aggregate_Categories(
            columns=cat_features.columns.tolist(),
            prop=0.01,
            value="other"
            )
        ),
        ("one-hot-encode", OneHotEncoder(handle_unknown="ignore"))
    ]
)
cat_pipe

encoded_cat_features = cat_pipe.fit_transform(cat_features)
print(X["reserved_room_type"].unique())
#print(encoded_cat_features["reserved_room_type"].unique())
encoded_cat_features

cat_pipe
 

['C' 'A' 'D' 'G' 'E' 'F' 'H' 'L' 'B']


### Numeric Pipeline

In [8]:
# Define a Numeric Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

numeric_features = X[X.columns[~ X.columns.isin(cat_features.columns.tolist())]]
numeric_features

num_pipe = Pipeline(
    steps=[
        ("imputation-median", SimpleImputer(strategy="median")),
        ("standardization", StandardScaler()),
    ]
)

num_pipe

### Bringing the Pipes Together --> with ColumnTransformer

In [9]:
from sklearn.compose import ColumnTransformer

# Bring it all together:
preprocessing_pipe = ColumnTransformer(
    [
        ("numeric-pipe", num_pipe, numeric_features.columns.tolist()),
        ("categorical-pipe", cat_pipe, cat_features.columns.tolist()),
    ]
)

preprocessing_pipe

### Fitting the Elastic Net Model

In [10]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV

# Fit Model to the pipeline:
model = Pipeline(
    steps=[
        ("preprocessing-pipe", preprocessing_pipe),
        ("elastic-net-model", ElasticNet(l1_ratio=0.2))
    ]
)

model

### Training the Model & Evaluating it

In [11]:
%%time
# Model training process
model.fit(X_train, y_train)

CPU times: user 2.23 s, sys: 5.49 s, total: 7.73 s
Wall time: 1.13 s


In [12]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
import numpy as np

# Get predictions from testing set
y_pred = model.predict(X_test)

np.around(y_pred[0:20]).astype(int)

print(f"RMSE is: {np.sqrt(MSE(y_test, y_pred))}")
print(f"\nR^2 is: {r2_score(y_test, y_pred) * 100} %")

# Compare Test Observations to Predicted Observations
print(f"\n{np.around(y_pred[0:100]).astype(int)}\n")
print(f"{y_test[0:100].values}")


RMSE is: 1.0671434194906797

R^2 is: 82.69725666262839 %

[ 2  4  4  3  3  3  3  3  4  6  2  4  2  3  4  6  3  3  3  3  2  4  3  3
  6  3  2  3  6  3  2  3  3  2  4  4  2  3  3  2  2  4  3  3  3  4  2  3
  4  3  3  2  6  6  2  4  3  4  2  3 14  2  5  4  7  4  3  2  5  2  4  2
  3  3  6  2  3  4  6  6  2  3  2  3  3  3  6  2  4  2  2  4  3  4  2  2
  4  2  4  3]

[ 1  4  4  3  3  2  3  2  4  8  1  4  2  3  5  7  3  3  2  3  1  5  3  2
  7  2  2  3  7  2  1  2  2  1  4  4  1  2  3  2  2  4  2  2  3  4  1  2
  4  3  2  1  7  7  2  5  3  4  1  3 21  1  6  4  9  4  3  2  7  1  4  1
  3  3  7  2  3  4  8  7  1  2  1  3  2  2  7  1  4  2  1  4  3  4  1  1
  4  1  4  3]


## March 26, 2024: Hyperparameter Fine-Tuning, Cross-Validation, Model Performance, & Pickling

### Fine-Tuning the L1-Ratio Hyperparameter of the Elastic Net with Grid Search Cross-Validation

In [25]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone
import numpy as np

# Specify the hyperparameter search space  --> only examining the l1-ratio
hyperparam_grid = {
    "elastic-net-model__l1_ratio": np.arange(0.001, 0.99, 0.05)  # Do NOT set l1_ratio directly equal to 0 or 1 --> will cause issues with convergence !
}

# Create a Pipeline containing the Grid-Search with Cross-Validation
model = Pipeline(
    steps=[
        ("preprocessing-pipe", preprocessing_pipe),
        ("elastic-net-model", ElasticNet())
    ]
)

# Add Grid Search 5-Folds Cross-Validation to the Pipeline
model_grid_cv = GridSearchCV(
    model,
    cv=5,
    param_grid=hyperparam_grid
)

# Fit the training, using 5-Folds CV, for all combinations of hyperparameters:
model_grid_cv.fit(X_train, y_train)

CPU times: user 2min 38s, sys: 4min 51s, total: 7min 30s
Wall time: 1min 11s


### Evaluate the Cross-Validation Metrics form the Resulting Grid Search

In [26]:
# Examine the best hyperparameter extracted from the training process
model_grid_cv.best_params_

{'elastic-net-model__l1_ratio': 0.001}

Store the results from each possible hyperparameter, tested on each CV-fold, inside of a Pandas DataFrame and retrieve the most optimal hyperparameter by taking the average of CV scores for each row (with each row representing a unique l1_ratio value).

In [29]:
# Extract all Cross-Validation results:
cv_results = model_grid_cv.cv_results_

# Store the results in a Pandas DataFrame:
cv_results_df = pd.DataFrame(
    cv_results
)

# 100-Total Elastic Nets Made:
cv_results_df  # 5-Seperate Models, trained on 20 unique l1_ratio hyperparameter values = 100 TOTAL ELASTIC NETS CREATED

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_elastic-net-model__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.597589,0.059501,0.148481,0.004005,0.001,{'elastic-net-model__l1_ratio': 0.001},0.834041,0.834488,0.835007,0.834499,0.837072,0.835021,0.00107,1
1,0.629439,0.152511,0.149384,0.003563,0.051,{'elastic-net-model__l1_ratio': 0.051000000000...,0.831412,0.83033,0.831722,0.831043,0.833303,0.831562,0.000987,2
2,0.54845,0.026822,0.154607,0.008021,0.101,{'elastic-net-model__l1_ratio': 0.101},0.828644,0.826951,0.828703,0.827967,0.830086,0.82847,0.001025,3
3,0.558285,0.067646,0.150472,0.006704,0.151,{'elastic-net-model__l1_ratio': 0.151000000000...,0.826957,0.824738,0.826504,0.825937,0.827702,0.826368,0.000999,4
4,0.521566,0.007378,0.15203,0.00939,0.201,{'elastic-net-model__l1_ratio': 0.201},0.825591,0.823187,0.824865,0.824495,0.826024,0.824832,0.000981,5
5,0.530201,0.010474,0.151753,0.003354,0.251,{'elastic-net-model__l1_ratio': 0.251},0.824025,0.82141,0.82302,0.822855,0.824104,0.823083,0.000978,6
6,0.527572,0.005047,0.149483,0.007696,0.301,{'elastic-net-model__l1_ratio': 0.301000000000...,0.822221,0.819372,0.820942,0.820987,0.821952,0.821095,0.001001,7
7,0.530115,0.009469,0.152912,0.005877,0.351,{'elastic-net-model__l1_ratio': 0.351000000000...,0.820867,0.818198,0.819536,0.819607,0.820853,0.819812,0.000992,8
8,0.525207,0.010595,0.145495,0.002205,0.401,{'elastic-net-model__l1_ratio': 0.401},0.81955,0.816988,0.818071,0.818233,0.819696,0.818508,0.001007,9
9,0.531561,0.017093,0.145559,0.004334,0.451,{'elastic-net-model__l1_ratio': 0.451},0.818161,0.815711,0.816527,0.816784,0.818473,0.817131,0.001036,10


Take the average of scores across each fold, moving across each row

In [43]:
# Extract the columns containing scores for each training-fold:
cv_scores = cv_results_df[["split0_test_score", "split1_test_score", "split2_test_score", "split3_test_score"]]

# Calculate the average score across all training-folds, for each specified hyperparameter:
cv_scores["average_score"] = cv_scores.apply(lambda x: np.nanmean(x), axis=1)
cv_scores["sd_score"] = cv_scores.apply(lambda x: np.nanstd(x), axis=1)
cv_scores["hyperparams"] = cv_results_df["params"]

# Sort the DataFrame in descending order by its average CV scores:
cv_scores = cv_scores.sort_values(by=["average_score"], ascending=False)
cv_scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_scores["average_score"] = cv_scores.apply(lambda x: np.nanmean(x), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_scores["sd_score"] = cv_scores.apply(lambda x: np.nanstd(x), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_scores["hyperparams"] = cv_results_df["params"]


Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,average_score,sd_score,hyperparams
0,0.834041,0.834488,0.835007,0.834499,0.834509,0.000306,{'elastic-net-model__l1_ratio': 0.001}
1,0.831412,0.83033,0.831722,0.831043,0.831127,0.000465,{'elastic-net-model__l1_ratio': 0.051000000000...
2,0.828644,0.826951,0.828703,0.827967,0.828066,0.000631,{'elastic-net-model__l1_ratio': 0.101}
3,0.826957,0.824738,0.826504,0.825937,0.826034,0.000743,{'elastic-net-model__l1_ratio': 0.151000000000...
4,0.825591,0.823187,0.824865,0.824495,0.824535,0.00078,{'elastic-net-model__l1_ratio': 0.201}
5,0.824025,0.82141,0.82302,0.822855,0.822827,0.000834,{'elastic-net-model__l1_ratio': 0.251}
6,0.822221,0.819372,0.820942,0.820987,0.820881,0.000904,{'elastic-net-model__l1_ratio': 0.301000000000...
7,0.820867,0.818198,0.819536,0.819607,0.819552,0.000845,{'elastic-net-model__l1_ratio': 0.351000000000...
8,0.81955,0.816988,0.818071,0.818233,0.818211,0.000814,{'elastic-net-model__l1_ratio': 0.401}
9,0.818161,0.815711,0.816527,0.816784,0.816796,0.000789,{'elastic-net-model__l1_ratio': 0.451}


### Extract the Best Estimator from the Grid Searcg and Save that Model as a `Pickeled` Object

In [48]:
from joblib import dump, load

# Extract the most optimal Elastic Net model, *with its pre-processing pipeline*, from the Grid Search:
best_model = model_grid_cv.best_estimator_

# Dump (save) the model to the same directory:
dump(best_model, "optimized-elastic-net.joblib")

# Load in this model, to examine it:
loaded_model = load("optimized-elastic-net.joblib")
loaded_model  # View this imported model

### Add Polynomial (Higher-Order) & interaction Terms to the Elastic Net

In [73]:
%%time
from sklearn.preprocessing import PolynomialFeatures

# NUMERIC PIPE
num_pipe = Pipeline(
    steps=[
        ("imputation-median", SimpleImputer(strategy="median")),
        ("higher-order-terms", PolynomialFeatures(degree=2)),
        ("standardization", StandardScaler()),
    ]
)

# CATEGORICAL PIPE
cat_pipe = Pipeline(
    steps=[
        ("aggregation-stage", Aggregate_Categories(
            columns=cat_features.columns.tolist(),
            prop=0.01,
            value="other"
            )
        ),
        ("one-hot-encode", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# PRE-PROCESSING PIPE
preprocessing_pipe = ColumnTransformer(
    [
        ("numeric-pipe", num_pipe, numeric_features.columns.tolist()),
        ("categorical-pipe", cat_pipe, cat_features.columns.tolist()),
    ]
)

# FIT MODEL TO THE PIPELINE
model = Pipeline(
    steps=[
        ("preprocessing-pipe", preprocessing_pipe),
        ("elastic-net-model", ElasticNet())
    ]
)

# Specify the hyperparameter search space  --> only examining the l1-ratio
hyperparam_grid = {
    "elastic-net-model__l1_ratio": np.arange(0.001, 0.99, 0.1)  # Do NOT set l1_ratio directly equal to 0 or 1 --> will cause issues with convergence !
}

# Add Grid Search 5-Folds Cross-Validation to the Pipeline
model_grid_cv = GridSearchCV(
    model,
    cv=5,
    param_grid=hyperparam_grid
)

# Fit the training, using 5-Folds CV, for all combinations of hyperparameters:
model_grid_cv.fit(X_train, y_train)

CPU times: user 3min 42s, sys: 8min 6s, total: 11min 48s
Wall time: 1min 13s


In [74]:
# Extract all Cross-Validation results:
cv_results = model_grid_cv.cv_results_

# Store the results in a Pandas DataFrame:
cv_results_df = pd.DataFrame(
    cv_results
)

cv_results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_elastic-net-model__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.835328,2.532207,0.164988,0.009346,0.001,{'elastic-net-model__l1_ratio': 0.001},0.946113,0.939743,0.950212,0.950299,0.942757,0.945825,0.004141,1
1,1.592553,0.808683,0.155675,0.003488,0.101,{'elastic-net-model__l1_ratio': 0.101},0.931048,0.923975,0.935031,0.934814,0.926742,0.930322,0.004381,2
2,1.399774,0.235038,0.164271,0.014579,0.201,{'elastic-net-model__l1_ratio': 0.201},0.918451,0.911209,0.922875,0.923154,0.914073,0.917952,0.004734,3
3,1.089555,0.250975,0.160094,0.012279,0.301,{'elastic-net-model__l1_ratio': 0.301000000000...,0.906894,0.89959,0.911249,0.912283,0.902614,0.906526,0.004878,4
4,1.014984,0.157374,0.152815,0.002549,0.401,{'elastic-net-model__l1_ratio': 0.401},0.892801,0.885375,0.896876,0.898808,0.888261,0.892424,0.005054,5
5,1.191946,0.143645,0.166148,0.010751,0.501,{'elastic-net-model__l1_ratio': 0.501},0.876116,0.868548,0.879625,0.882564,0.871109,0.875592,0.005194,6
6,0.904805,0.077174,0.167463,0.018093,0.601,{'elastic-net-model__l1_ratio': 0.601000000000...,0.857518,0.849989,0.860305,0.864228,0.852398,0.856887,0.005171,7
7,0.867353,0.05174,0.151253,0.005086,0.701,{'elastic-net-model__l1_ratio': 0.701000000000...,0.838045,0.83179,0.840282,0.844419,0.833992,0.837705,0.004486,8
8,0.885278,0.086274,0.156876,0.009007,0.801,{'elastic-net-model__l1_ratio': 0.801},0.818807,0.814903,0.819846,0.823501,0.817692,0.81895,0.002811,9
9,0.980195,0.202672,0.153253,0.004864,0.901,{'elastic-net-model__l1_ratio': 0.901},0.802506,0.801667,0.800718,0.803818,0.804974,0.802736,0.001513,10


### Evaluate the Model's Performance on the Test Set

In [85]:
from sklearn.metrics import mean_squared_error as MSE, r2_score
# Extract the optimal model from the Grid Search CV:
best_model = model_grid_cv.best_estimator_

# Caclulate the predicted number of nights stayed at the hotel:
y_pred = best_model.predict(
    X_test
)

# Metrics to measure performance on the test set:
print(f"RMSE: {np.sqrt(MSE(y_test, y_pred))}\n")
print(f"R^2: {r2_score(y_test, y_pred) * 100}%\n")

# Examine teh first-100 predicted values with their observed counterparts:
print(np.around(y_pred[0:100]).astype(int))
print(y_test[0:100].values)

RMSE: 0.5916345555157785

R^2: 94.68165571614365%

[ 2  4  4  3  3  2  3  2  4  6  2  4  2  3  5  6  3  3  2  3  1  5  3  2
  7  2  2  3  6  2  2  2  2  1  4  4  1  2  3  2  2  4  2  2  3  4  2  2
  4  3  2  1  7  7  2  5  3  3  1  3 16  2  6  4  8  4  3  2  6  1  4  2
  3  3  7  2  3  4  8  6  2  2  1  3  2  2  7  2  4  2  2  4  3  4  2  2
  4  2  3  3]
[ 1  4  4  3  3  2  3  2  4  8  1  4  2  3  5  7  3  3  2  3  1  5  3  2
  7  2  2  3  7  2  1  2  2  1  4  4  1  2  3  2  2  4  2  2  3  4  1  2
  4  3  2  1  7  7  2  5  3  4  1  3 21  1  6  4  9  4  3  2  7  1  4  1
  3  3  7  2  3  4  8  7  1  2  1  3  2  2  7  1  4  2  1  4  3  4  1  1
  4  1  4  3]


### Pickle the Pipeline and Export to the External Folder

In [86]:
from joblib import dump, load

# Pickle the Pipeline & export to the directory with the specified filename:
dump(best_model, "higher-order-elastic-net.joblib")

['higher-order-elastic-net.joblib']