# Introduction to Classification & Regression Trees

### Building Decision Trees for Regression & Classification Tasks

<b>Ryan Paul Lafler, M.Sc.</b>

In [32]:
%%time
# ######################################################### #
#             IMPORT REQUIRED CORE DEPENDENCIES             #
# ######################################################### #
import pandas as pd
import numpy as np
import scipy

from joblib import dump, load

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor as DTR

from sklearn.metrics import mean_squared_error, r2_score

CPU times: user 161 µs, sys: 122 µs, total: 283 µs
Wall time: 286 µs


### Define (or Import) the Custom Group Aggregation Transformer Class

In [23]:
class Aggregate_Categories(BaseEstimator, TransformerMixin):
    """Collapse rare levels of every column into a single catch-all label.

    Each column of the frame passed to ``transform`` is cast to ``str``; the
    relative frequency of every level is computed from that same frame, and
    levels whose share is strictly below ``prop`` are replaced by ``value``.
    Every column is returned with ``category`` dtype.

    Parameters
    ----------
    columns : object
        Stored on the instance (so it appears in ``get_params``).
        NOTE(review): ``transform`` does not consult it -- every column of the
        frame it receives is processed. Confirm whether it should restrict
        the processed columns.
    prop : float, default=0.005
        Relative-frequency threshold; levels with a share strictly below it
        are aggregated.
    value : str, default="other"
        Label given to the aggregated levels.

    Notes
    -----
    ``fit`` learns nothing: the rare levels are re-derived from whatever data
    is handed to ``transform``, so two different frames (e.g. a training and
    a testing split) can end up with different sets of aggregated levels.
    """

    def __init__(self, columns, prop=0.005, value="other"):
        # Plain attribute storage, as scikit-learn's parameter handling expects
        self.columns = columns
        self.value = value
        self.prop = prop

    def fit(self, X, y=None):
        """No-op fit; present so the class can be used inside a Pipeline.

        Returns
        -------
        self
        """
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with rare levels aggregated in every column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns are all treated as categorical.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame with the same columns and index as ``X``; every column
            has ``category`` dtype. ``X`` itself is left unmodified.
        """
        # Work on a copy so the caller's DataFrame is never modified in place
        X_out = X.copy()

        for column in X_out.columns.values.tolist():
            as_text = X_out[column].astype(str)  # Coercion to string
            # Share of rows held by each level (sums to 1 across levels)
            shares = as_text.value_counts(normalize=True)
            # Levels whose share is strictly below the threshold
            rare_levels = shares[shares < self.prop].index.tolist()
            X_out[column] = pd.Categorical(
                as_text.replace(
                    to_replace=rare_levels,
                    value=self.value,
                )
            )
        return X_out

### Import the Revised CSV DataFrame

In [26]:
# Read the revised CSV (predictors plus the engineered target feature) and
# discard the additional index column in the same step:
hotel_df = pd.read_csv("hotel_bookings_revised_NEW.csv").drop(
    columns=["Unnamed: 0"]
)

# Separate the set of predictors from the target feature
target_mask = hotel_df.columns.isin(["num_nights"])
X = hotel_df.loc[:, ~target_mask]
y = hotel_df.loc[:, ["num_nights"]]  # Kept as a one-column DataFrame

# Training / testing split; 20% of the rows are held out for testing:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=7,
)

# Preview the first five training observations
X_train.head()

Unnamed: 0,lead_time,arrival_date_week_number,adults,children,babies,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,...,country,market_segment,distribution_channel,reserved_room_type,deposit_type,customer_type,arrival_date_year,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled
3063,225,22,1,0.0,0,2,0,66.0,1,0,...,GBR,Groups,Direct,A,No Deposit,Transient-Party,2016,0,0,0
16722,0,40,2,0.0,0,0,0,90.0,0,0,...,FRA,Online TA,TA/TO,D,No Deposit,Transient,2016,1,0,0
40156,2,10,1,0.0,0,0,0,55.0,0,0,...,PRT,Offline TA/TO,TA/TO,A,No Deposit,Transient-Party,2016,0,0,0
19333,92,49,2,0.0,0,1,0,34.0,0,2,...,GBR,Offline TA/TO,TA/TO,A,No Deposit,Transient,2016,0,0,0
31075,34,39,2,0.0,0,0,33,224.67,0,0,...,ESP,Offline TA/TO,TA/TO,A,No Deposit,Transient-Party,2015,0,0,0


In [33]:
# Load the serialized preprocessing pipeline from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
# NOTE(review): presumably the saved pipeline contains Aggregate_Categories, in
# which case that class must already be defined/imported before this cell
# runs -- verify against the code that created the .joblib file.
preprocessing_pipe = load("preprocessing_pipe_hotels.joblib")
preprocessing_pipe  # Last expression: display the loaded object as cell output