In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('processed_data.csv')

# Define feature columns and target column
feature_columns = [col for col in data.columns if col not in ['datetime', 'price_log', 'price_boxcox', 'timezone']]
target_column = 'price_log'

# Separate features and target
X = data[feature_columns]
y = data[target_column]

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()

# Preprocessing for numerical data: standard scaling
numerical_transformer = StandardScaler()

# Preprocessing for categorical data: one-hot encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model (Random Forest Regressor)
model = RandomForestRegressor(random_state=42)

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Define the parameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_features': ['sqrt', 'log2', None],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, 
                           cv=3, n_jobs=-1, scoring='neg_mean_squared_error', 
                           error_score='raise', verbose=3)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit GridSearchCV
try:
    grid_search.fit(X_train, y_train)
    
    # Get the best parameters and the best model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Make predictions with the best model
    y_pred = best_model.predict(X_test)

    # Evaluate the best model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    adjusted_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
    explained_variance = explained_variance_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Print the best parameters and evaluation metrics
    print('Best Model Performance:')
    print(f'Best Parameters: {best_params}')
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'R-squared (R²) Score: {r2}')
    print(f'Adjusted R-squared: {adjusted_r2}')
    print(f'Explained Variance Score: {explained_variance}')
    print(f'Mean Absolute Percentage Error (MAPE): {mape}')

except Exception as e:
    print(f"Error during GridSearchCV fitting: {e}")

# Additional debug information
print("GridSearchCV fit status:")
print(grid_search.cv_results_)


Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Model Performance:
Best Parameters: {'model__max_depth': None, 'model__max_features': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 300}
Mean Squared Error (MSE): 0.00029447057518669387
R-squared (R²) Score: 0.9986582150807657
Adjusted R-squared: 0.9926947265508355
Explained Variance Score: 0.9986937544045303
Mean Absolute Percentage Error (MAPE): 0.0038947245056885956
GridSearchCV fit status:
{'mean_fit_time': array([2.48362271, 2.69035339, 2.24884407, 0.28893598, 0.56621337,
       0.96648804, 0.28375618, 0.68765537, 0.86951232, 0.27651747,
       0.55543359, 0.84364883, 0.29697331, 0.56720249, 0.81909871,
       0.29118212, 0.52647066, 0.78855038, 0.29281354, 0.56304669,
       0.82903409, 0.29164791, 0.53358976, 0.79620655, 0.28567108,
       0.54871853, 0.81501357, 0.31685686, 0.61166795, 0.90443667,
       0.31456168, 0.55400991, 0.86729956, 0.2909197 , 0.6378328 ,
    