In [1]:
!pip install optuna


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine


In [3]:
hotel = pd.read_excel('HotelFINALdataset.xlsx')

In [4]:
hotel.head()

Unnamed: 0,User_ID,travelCode,Hotel_Name,Arrival_place,Hotel_stay,Hotel_per_day_price,Check-in,Hotel_TotalPrice
0,0,0,Hotel A,Florianopolis (SC),4,313.02,09/26/2019,1252.08
1,0,2,Hotel K,Salvador (BH),2,263.41,10/10/2019,526.82
2,0,7,Hotel K,Salvador (BH),3,263.41,11/14/2019,790.23
3,0,11,Hotel K,Salvador (BH),4,263.41,12/12/2019,1053.64
4,0,13,Hotel A,Florianopolis (SC),1,313.02,12/26/2019,313.02


In [5]:
hotel['Hotel_Name'].nunique()

9

In [6]:
passengers = pd.read_excel('PassengerFINALdataset.xlsx')

In [7]:
passengers.head()

Unnamed: 0,User_ID,company,Name,gender_x
0,0,4You,Roy Braun,male
1,1,4You,Joseph Holsten,male
2,2,4You,Wilma Mcinnis,female
3,3,4You,Paula Daniel,female
4,4,4You,Patricia Carson,female


In [8]:
hotel = pd.merge(hotel,passengers,how='inner',on='User_ID')

In [9]:
hotel.head()

Unnamed: 0,User_ID,travelCode,Hotel_Name,Arrival_place,Hotel_stay,Hotel_per_day_price,Check-in,Hotel_TotalPrice,company,Name,gender_x
0,0,0,Hotel A,Florianopolis (SC),4,313.02,09/26/2019,1252.08,4You,Roy Braun,male
1,0,2,Hotel K,Salvador (BH),2,263.41,10/10/2019,526.82,4You,Roy Braun,male
2,0,7,Hotel K,Salvador (BH),3,263.41,11/14/2019,790.23,4You,Roy Braun,male
3,0,11,Hotel K,Salvador (BH),4,263.41,12/12/2019,1053.64,4You,Roy Braun,male
4,0,13,Hotel A,Florianopolis (SC),1,313.02,12/26/2019,313.02,4You,Roy Braun,male


In [10]:
hotel.drop(['User_ID','travelCode','Name'],axis=1,inplace=True)

In [11]:
hotel['Hotel_Check-in'] = pd.to_datetime(hotel['Check-in'])

In [12]:
hotel['Arrival_place'].unique()

array(['Florianopolis (SC)', 'Salvador (BH)', 'Natal (RN)',
       'Aracaju (SE)', 'Recife (PE)', 'Sao Paulo (SP)',
       'Campo Grande (MS)', 'Rio de Janeiro (RJ)', 'Brasilia (DF)'],
      dtype=object)

In [13]:
hotel.head()

Unnamed: 0,Hotel_Name,Arrival_place,Hotel_stay,Hotel_per_day_price,Check-in,Hotel_TotalPrice,company,gender_x,Hotel_Check-in
0,Hotel A,Florianopolis (SC),4,313.02,09/26/2019,1252.08,4You,male,2019-09-26
1,Hotel K,Salvador (BH),2,263.41,10/10/2019,526.82,4You,male,2019-10-10
2,Hotel K,Salvador (BH),3,263.41,11/14/2019,790.23,4You,male,2019-11-14
3,Hotel K,Salvador (BH),4,263.41,12/12/2019,1053.64,4You,male,2019-12-12
4,Hotel A,Florianopolis (SC),1,313.02,12/26/2019,313.02,4You,male,2019-12-26


In [14]:
hotel.corr()['Hotel_TotalPrice']

  hotel.corr()['Hotel_TotalPrice']


Hotel_stay             0.75285
Hotel_per_day_price    0.60273
Hotel_TotalPrice       1.00000
Name: Hotel_TotalPrice, dtype: float64

In [15]:
hotel["Weekend_Checkin"] = (hotel['Hotel_Check-in'].dt.weekday >= 5 ).astype(int)

In [16]:
hotel['Month_Checkin'] = hotel['Hotel_Check-in'].dt.month 

In [17]:
#hotel.drop(['company'],axis=1,inplace=True)

In [18]:
hotel.drop(['Check-in'],axis=1,inplace=True)

In [19]:
hotel.head()

Unnamed: 0,Hotel_Name,Arrival_place,Hotel_stay,Hotel_per_day_price,Hotel_TotalPrice,company,gender_x,Hotel_Check-in,Weekend_Checkin,Month_Checkin
0,Hotel A,Florianopolis (SC),4,313.02,1252.08,4You,male,2019-09-26,0,9
1,Hotel K,Salvador (BH),2,263.41,526.82,4You,male,2019-10-10,0,10
2,Hotel K,Salvador (BH),3,263.41,790.23,4You,male,2019-11-14,0,11
3,Hotel K,Salvador (BH),4,263.41,1053.64,4You,male,2019-12-12,0,12
4,Hotel A,Florianopolis (SC),1,313.02,313.02,4You,male,2019-12-26,0,12


In [20]:
hotel1 = hotel.copy(deep=True)

In [21]:
#hotel = hotel1.copy(deep=True)

In [23]:
X = hotel.drop('Hotel_TotalPrice',axis=1)
y = hotel['Hotel_TotalPrice']

In [24]:
#Preprocessing Pipeline Gemeration
num_features = X.select_dtypes(include=['int64','float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

In [25]:
num_transformer = Pipeline([('imputer',SimpleImputer(strategy='median')),
                           ('scaler',StandardScaler())])

In [26]:
cat_transformer = Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),
                           ('onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False))])

In [27]:
preprocessor = ColumnTransformer([
    ('num',num_transformer,num_features),
    ('cat',cat_transformer,cat_features)
])

In [28]:
X_transformed = preprocessor.fit_transform(X)

In [29]:
X_train,X_test,y_train,y_test = train_test_split(X_transformed,y,test_size=0.2,random_state=42)

In [30]:
# Hyperparameter Optimization
def objective(trial):
    model = XGBRegressor(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)


In [31]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)
best_params = study.best_params


[I 2025-02-11 11:57:31,358] A new study created in memory with name: no-name-0d89d95e-a05b-4517-a541-ff9130de5bd2
[I 2025-02-11 11:57:32,055] Trial 0 finished with value: 657.5165302182239 and parameters: {'n_estimators': 131, 'learning_rate': 0.028051147569337, 'max_depth': 3, 'subsample': 0.6140682130909718, 'colsample_bytree': 0.731316693517617}. Best is trial 0 with value: 657.5165302182239.
[I 2025-02-11 11:57:33,071] Trial 1 finished with value: 0.010804365222498087 and parameters: {'n_estimators': 230, 'learning_rate': 0.157705998669277, 'max_depth': 4, 'subsample': 0.5236129816836199, 'colsample_bytree': 0.7385905243673041}. Best is trial 1 with value: 0.010804365222498087.
[I 2025-02-11 11:57:34,885] Trial 2 finished with value: 0.02060471003788778 and parameters: {'n_estimators': 488, 'learning_rate': 0.19488992523438378, 'max_depth': 3, 'subsample': 0.5711489994830233, 'colsample_bytree': 0.5428478942717634}. Best is trial 1 with value: 0.010804365222498087.
[I 2025-02-11 11

[I 2025-02-11 11:58:17,416] Trial 28 finished with value: 8.066875830381752e-06 and parameters: {'n_estimators': 273, 'learning_rate': 0.18017207685228337, 'max_depth': 4, 'subsample': 0.8418666260080793, 'colsample_bytree': 0.9517219354579345}. Best is trial 27 with value: 1.803410968186297e-07.
[I 2025-02-11 11:58:18,090] Trial 29 finished with value: 5843.554959141459 and parameters: {'n_estimators': 136, 'learning_rate': 0.013432868865321843, 'max_depth': 3, 'subsample': 0.7150657744444087, 'colsample_bytree': 0.8962137595522722}. Best is trial 27 with value: 1.803410968186297e-07.


In [32]:
best_xgb = XGBRegressor(**best_params)
best_xgb.fit(X_train, y_train)
y_pred_xgb = best_xgb.predict(X_test)



In [33]:
stacked_model = StackingRegressor(
    estimators=[('xgb', best_xgb)],
    final_estimator=XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)
)


In [34]:
stacked_model.fit(X_train, y_train)
y_pred_stack = stacked_model.predict(X_test)


In [35]:
def build_nn():
    model = Sequential([
        Dense(128, activation='relu', kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1],)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.01), loss='mse', metrics=['mae'])
    return model

nn_model = build_nn()
nn_callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5),
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
]
nn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=32, verbose=1, callbacks=nn_callbacks)

y_pred_nn = nn_model.predict(X_test).flatten()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - loss: 70478.6797 - mae: 158.3644 - val_loss: 598.8766 - val_mae: 19.0012 - learning_rate: 0.0100
Epoch 2/100
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 6028.4458 - mae: 57.0828 - val_loss: 2019.0251 - val_mae: 38.5733 - learning_rate: 0.0100
Epoch 3/100
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - loss: 5071.7007 - mae: 52.6691 - val_loss: 271.0539 - val_mae: 12.7028 - learning_rate: 0.0100
Epoch 4/100
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 4138.8594 - mae: 47.5535 - val_loss: 585.5790 - val_mae: 19.8972 - learning_rate: 0.0100
Epoch 5/100
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 3854.3059 - mae: 46.0893 - val_loss: 373.4620 - val_mae: 16.0468 - learning_rate: 0.0100
Epoch 6/100
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [36]:
def evaluate_model(name, y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # classification__report = classification_report(y_true,y_pred)
    # confusion__matrix = confusion_matrix(y_true,y_pred)
    print(f"\n {name} Performance:")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    
evaluate_model("Optimized XGBoost", y_test, y_pred_xgb)
evaluate_model("Stacking Model", y_test, y_pred_stack)
evaluate_model("Neural Network", y_test, y_pred_nn)


 Optimized XGBoost Performance:
R² Score: 1.0000
MAE: 0.0001
RMSE: 0.0002

 Stacking Model Performance:
R² Score: 0.9998
MAE: 2.4876
RMSE: 4.5378

 Neural Network Performance:
R² Score: 0.9974
MAE: 12.7028
RMSE: 16.3566


In [37]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, LeakyReLU, Add, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import StandardScaler
import numpy as np



def build_nn():
    inputs = Input(shape=(X_train.shape[1],))
    x = Dense(128, kernel_regularizer=l2(0.001))(inputs)
    x = BatchNormalization()(x)
    x = LeakyReLU()(x)
    x = Dropout(0.2)(x)

    x1 = Dense(64, kernel_regularizer=l2(0.001))(x)
    x1 = BatchNormalization()(x1)
    x1 = LeakyReLU()(x1)
    x1 = Dropout(0.2)(x1)

    x2 = Dense(128, kernel_regularizer=l2(0.001))(x1)  # Ensuring same shape as `x`
    x2 = BatchNormalization()(x2)
    x2 = LeakyReLU()(x2)
    x2 = Dropout(0.2)(x2)

    # Residual connection (now both are (128,))
    x3 = Add()([x, x2])
    
    outputs = Dense(1)(x3)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=AdamW(learning_rate=0.005), loss='mse', metrics=['mae'])
    return model

nn_model = build_nn()
nn_callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5),
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
]

nn_model.fit(X_train, y_train, validation_data=(X_test, y_test), 
             epochs=200, batch_size=32, verbose=1, callbacks=nn_callbacks)

y_pred_nn = nn_model.predict(X_test).flatten()


Epoch 1/200
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - loss: 138996.6094 - mae: 267.9879 - val_loss: 181.3343 - val_mae: 10.8859 - learning_rate: 0.0050
Epoch 2/200
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - loss: 4778.8984 - mae: 51.6830 - val_loss: 564.3837 - val_mae: 19.1160 - learning_rate: 0.0050
Epoch 3/200
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 4519.1265 - mae: 50.9700 - val_loss: 444.4127 - val_mae: 16.3198 - learning_rate: 0.0050
Epoch 4/200
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 4124.4863 - mae: 48.6092 - val_loss: 123.5970 - val_mae: 8.9888 - learning_rate: 0.0050
Epoch 5/200
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 3754.1365 - mae: 46.1966 - val_loss: 369.8632 - val_mae: 13.0330 - learning_rate: 0.0050
Epoch 6/200
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [38]:
evaluate_model("Neural Network", y_test, y_pred_nn)


 Neural Network Performance:
R² Score: 0.9995
MAE: 6.1394
RMSE: 7.0479


In [45]:
!pip install scikeras

Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting scikeras
  Using cached scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Using cached scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [47]:
from sklearn.ensemble import StackingRegressor
from scikeras.wrappers import KerasRegressor  # Corrected import

def build_nn_wrapper():
    return build_nn()

nn_wrapper = KerasRegressor(build_fn=build_nn_wrapper, epochs=100, batch_size=32, verbose=0)

stacked_model = StackingRegressor(
    estimators=[('xgb', best_xgb), ('nn', nn_wrapper)],
    final_estimator=XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)
)

stacked_model.fit(X_train, y_train)
y_pred_stacked = stacked_model.predict(X_test)



  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)


In [48]:
evaluate_model("Sta", y_test, y_pred_stacked)


 Neural Network Performance:
R² Score: 0.9999
MAE: 2.3916
RMSE: 3.6301
