# In [None]:
import os

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, RobustScaler, MinMaxScaler

# Define custom transformers
class MonthsTransformer(BaseEstimator, TransformerMixin):
    """Convert English month names in ``arrival_date_month`` to 1-12.

    The transformer is stateless: ``fit`` learns nothing. Surrounding
    whitespace is stripped before the lookup; a value that is not one of
    the twelve capitalised month names after stripping becomes NaN.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; exists only to satisfy the estimator API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``arrival_date_month`` holds month numbers."""
        month_names = (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        )
        # Position in the tuple (1-based) is the calendar month number.
        number_by_name = dict(zip(month_names, range(1, 13)))

        result = X.copy()  # leave the caller's DataFrame untouched
        stripped = result['arrival_date_month'].str.strip()
        result['arrival_date_month'] = stripped
        result['arrival_date_month'] = stripped.map(number_by_name)
        return result

class CountryTransformer(BaseEstimator, TransformerMixin):
    """Encode the ``country`` column as integer labels learned during ``fit``.

    Labels are the positions of each country in the fitted
    ``LabelEncoder.classes_`` array. Countries that were not present when
    the transformer was fitted are encoded as -1.
    """

    def fit(self, X, y=None):
        """Learn the set of country labels from ``X['country']``.

        The column is selected by name, exactly as ``transform`` does,
        instead of via ``X.squeeze()``: squeezing a one-row frame yields a
        scalar, which ``LabelEncoder.fit`` rejects.
        """
        self.encoder = LabelEncoder()
        self.encoder.fit(X['country'])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``country`` replaced by its integer code."""
        X = X.copy()  # Ensure we do not modify the original DataFrame
        # Build the label -> code lookup once instead of scanning classes_
        # and calling encoder.transform() separately for every row.
        code_by_country = {
            label: code for code, label in enumerate(self.encoder.classes_)
        }
        codes = X['country'].map(code_by_country)
        # Labels missing from the lookup come back as NaN: mark them as -1.
        X['country'] = codes.fillna(-1).astype('int64')
        return X

# Load data
data_path = os.path.join('..', 'raw_data', 'hotel_bookings_raw.csv')
df = pd.read_csv(data_path)

# Define predictors and target
predictors = ['country', 'FUEL_PRCS', 'lead_time', 'adr', 'arrival_date_month', 'stays_in_week_nights', 'INFLATION']
target_variable = 'is_canceled'

# Drop incomplete rows only when a column the model actually uses is missing.
# A bare dropna() would also discard rows because of NaNs in unrelated
# columns, needlessly shrinking the training set.
df.dropna(subset=predictors + [target_variable], inplace=True)
df.drop_duplicates(inplace=True)

X = df[predictors]
y = df[target_variable]

# Column groups: each list feeds exactly one branch of the preprocessor
features_country = ["country"]
features_months = ["arrival_date_month"]
features_to_robust = ['lead_time', 'adr', 'stays_in_week_nights', 'FUEL_PRCS']
features_to_minmax = ['INFLATION']

# Preprocessor: every branch is a (name, pipeline, columns) triple.
#   country            -> integer country codes, then min-max scaling
#   arrival_date_month -> month numbers, then robust scaling
#   robust / minmax    -> scaling only
preprocessor = ColumnTransformer(
    [
        (
            "country",
            Pipeline([
                ("country_mapping", CountryTransformer()),
                ("scaler", MinMaxScaler()),
            ]),
            features_country,
        ),
        (
            "arrival_date_month",
            Pipeline([
                ("arrival_date_month_mapping", MonthsTransformer()),
                ("scaler", RobustScaler()),
            ]),
            features_months,
        ),
        ("robust", Pipeline([("scaler", RobustScaler())]), features_to_robust),
        ("minmax", Pipeline([("scaler", MinMaxScaler())]), features_to_minmax),
    ],
    # Columns not named in any list above are forwarded untouched.
    remainder="passthrough",
)

# Chain preprocessing and the classifier so that fitting the pipeline fits
# the preprocessing steps on the same rows as the model.
# RandomForestClassifier and train_test_split must be imported at the top of
# the file (sklearn.ensemble / sklearn.model_selection).
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=42))
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
# Make predictions on the held-out rows
y_pred = pipeline.predict(X_test)

# Evaluate on the held-out rows. recall_score and f1_score use their default
# binary averaging. NOTE(review): assuming the target is coded 0/1, the mean
# squared error of the predicted labels equals 1 - accuracy.
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(acc, rec, f1, mse)