# Model Selection

In [1]:

import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error,r2_score
from scipy.stats import randint, uniform,loguniform
import logging
from google.cloud import logging as cloud_logging_client 
from google.cloud.logging.handlers import CloudLoggingHandler 
from google.cloud import bigquery
gc = True
if not gc:
    import os
    os.chdir("../..")
    from examples.help_modules import BigQuery, ConfigReader, Logger

In [2]:
# Build the module-level `logger` used by the rest of the notebook and, when
# running outside Google Cloud (`gc` is False), the project BigQuery helper `bq`.
if not gc:
    # The logger has to exist before it is handed to the BigQuery helper;
    # creating it afterwards raises NameError on a fresh kernel.
    logger = Logger('ModelSelection_Homes')
    bq = BigQuery(ConfigReader().get_config().get('bq_path'), logger)
else:
    CLOUD_LOG_NAME = "my_workbench_app_log"
    LOG_LEVEL = logging.INFO

    # --- Logger setup ---
    logger = logging.getLogger(__name__)
    logger.setLevel(LOG_LEVEL)

    # Drop handlers left over from a previous run of this cell so that every
    # record is emitted only once. Iterate over a copy because the list is
    # mutated while looping.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    # 1. Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # 2. Google Cloud Logging handler (best effort: on failure we keep
    #    logging to the console only).
    try:
        # The client module is imported under the alias `cloud_logging_client`
        # so it does not shadow the standard-library `logging` module.
        logging_client = cloud_logging_client.Client()

        # Handler that ships records to Google Cloud Logging under CLOUD_LOG_NAME.
        cloud_handler = CloudLoggingHandler(logging_client, name=CLOUD_LOG_NAME)
        cloud_handler.setLevel(LOG_LEVEL)
        logger.addHandler(cloud_handler)

        logger.info("Cloud Logging er konfigurert og aktivert.")

    except Exception as e:
        logger.error(f"Kunne ikke konfigurere Google Cloud Logging. Logger kun til konsoll. Feil: {e}")


# Queries for the two samples. Each joins a pre-processed rentals table with
# the coordinates table and keeps rows whose latitude is non-zero.
# NOTE: `ORDER BY RAND()` draws a different 5000-row sample on every run, so
# results downstream are not reproducible across executions of this cell.
sql_a = '''
SELECT a.*,c.lat,c.lng FROM `sibr-market.pre_processed.rentals` a
JOIN admin.coordinates c ON c.item_id = a.item_id
WHERE c.lat != 0
ORDER BY RAND()
LIMIT 5000;
'''
sql_h = '''
SELECT h.*,c.lat,c.lng FROM `sibr-market.pre_processed.rentals_homes` h
JOIN admin.coordinates c ON c.item_id = h.item_id
WHERE c.lat != 0
ORDER BY RAND()
LIMIT 5000;
'''
if gc:
    try:
        client = bigquery.Client()
        job_a = client.query(sql_a)
        job_h = client.query(sql_h)
        df_a,df_h = job_a.to_dataframe(),job_h.to_dataframe()
    except Exception:
        # Log with traceback and stop here. Swallowing the error would only
        # postpone the failure to the `set_index` calls below (NameError), or
        # worse, silently reuse frames left in the kernel by an earlier run.
        logger.exception('Could not initiate data')
        raise
    print(f'lenght of dataframe a: {len(df_a)}')
    print(f'lenght of dataframe h: {len(df_h)}')
else:
    df_a = bq.read_bq(sql_a,read_type="pandas_gbq")
    df_h = bq.read_bq(sql_h,read_type="pandas_gbq")

# Index both frames by listing id so `item_id` is not used as a feature.
df_a = df_a.set_index('item_id')
df_h = df_h.set_index('item_id')

2025-06-08 12:40:25,311 - __main__ - INFO - Cloud Logging er konfigurert og aktivert.


lenght of dataframe a: 5000
lenght of dataframe h: 5000


In [3]:
# One fixed seed for every estimator. The seed is deliberately NOT part of the
# search spaces below: sampling `random_state` makes the search reward lucky
# seeds instead of better hyperparameters and makes candidates incomparable.
MODEL_SEED = 42

# Search spaces for RandomizedSearchCV. Keys carry the `model__` prefix because
# the estimator is the pipeline step named 'model'.
# scipy's randint(low, high) samples integers in [low, high).
params_rf = {
    'model__n_estimators': randint(100, 1500),
    'model__max_depth': randint(5, 50),
    'model__min_samples_leaf': randint(1, 10),
    'model__bootstrap': [True, False],
}

params_xgb = {
    'model__n_estimators': randint(100, 1500),
    'model__learning_rate': loguniform(0.01, 0.3),
    'model__max_depth': randint(3, 15),
    'model__subsample': uniform(0.6, 0.4),        # uniform(loc, scale): samples from 0.6 to 1.0
    # Optional extra dimensions, currently disabled:
    # 'model__colsample_bytree': uniform(0.6, 0.4),
    # 'model__reg_lambda': loguniform(1e-8, 1.0),
}

params_cat = {
    'model__iterations': randint(100, 1500),      # counterpart of n_estimators
    'model__learning_rate': loguniform(0.01, 0.3),
    'model__depth': randint(3, 10),               # counterpart of max_depth
    'model__l2_leaf_reg': loguniform(1, 10),      # L2 regularisation strength
    # Optional extra dimension, currently disabled:
    # 'model__bagging_temperature': uniform(0, 1),
}

# Model name -> (unfitted estimator, search space) consumed by `model_selection`.
models = {
    'RandomForest': (RandomForestRegressor(random_state=MODEL_SEED), params_rf),
    'XGBoost': (XGBRegressor(random_state=MODEL_SEED), params_xgb),
    'CatBoost': (CatBoostRegressor(silent=True, random_state=MODEL_SEED), params_cat),
}

In [4]:
def model_selection(df_sample,models:dict,target ,dataset_name):
    """Tune every candidate model on one dataset and return the best one.

    The frame is split 80/20 into train and test. For each entry in `models`
    a pipeline (imputer -> scaler -> model) is tuned with a 50-candidate,
    3-fold RandomizedSearchCV scored on negative mean squared error, and the
    refitted best pipeline is evaluated on the held-out test split.

    Args:
        df_sample: DataFrame holding the features and the `target` column.
        models: Mapping of model name -> (estimator, param_distributions).
            The distribution keys must use the `model__` prefix.
        target: Name of the column to predict.
        dataset_name: Label used in log messages only.

    Returns:
        Tuple `(best_estimator, best_params)` taken from the search with the
        best cross-validated score across ALL models that trained successfully.

    Raises:
        RuntimeError: If no model could be trained (including an empty `models`).
    """
    logger.info(f'MODEL SELECTION FOR {dataset_name.upper()} \n \n')
    X_train, X_test, y_train, y_test = train_test_split(
        df_sample.drop(columns=[target]),
        df_sample[target],
        test_size=0.2,
        random_state=42)
    logger.info(f"Train set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")
    logger.info(f'Columns in train set: {X_train.columns.tolist()}')

    # Search object of the best model seen so far; None until one succeeds.
    best_search = None

    for model_name, (model, params) in models.items():
        logger.info(f'Training {model_name} model with hyperparameter tuning...')
        pipe = Pipeline([
            ('impute', SimpleImputer()),
            ('scaler', StandardScaler()),
            ('model', model),
        ])
        search = RandomizedSearchCV(
            pipe,
            param_distributions=params,
            n_iter=50,
            cv=3,
            scoring='neg_mean_squared_error',
            verbose=1,
            random_state=42,
            n_jobs=-1,
            refit=True,
            return_train_score=True,
        )
        try:
            search.fit(X_train, y_train)
            y_pred = search.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            logger.info(f'{model_name} model best parameters: {search.best_params_} on {dataset_name}')
            # best_score_ is the mean cross-validated (negated) MSE of the best
            # candidate, i.e. a validation score rather than a training score.
            logger.info(f'Best score for {model_name}: CV MSE: {-search.best_score_}, Test MSE: {mse}, R2 (test): {r2} on {dataset_name}')
        except Exception as e:
            # One failing model must not abort the comparison of the others.
            logger.error(f'Error training {model_name} model: {e}')
            continue

        # Scores are negated MSE, so the larger value is the better model.
        if best_search is None or search.best_score_ > best_search.best_score_:
            best_search = search

    if best_search is None:
        raise RuntimeError(f'No model could be trained on {dataset_name}')
    return best_search.best_estimator_, best_search.best_params_

In [5]:
# Run the model comparison on both samples; the target is `monthly_rent` in each.
best_model_a,best_params_a = model_selection(df_a, models,'monthly_rent', 'rentals')
best_model_h,best_params_h = model_selection(df_h, models, 'monthly_rent', 'rentals_homes')
print(f'Best params for rentals: {best_params_a}')
# Label matches the dataset name passed above (was mislabelled "monthly").
print(f'Best params for rentals_homes: {best_params_h}')

2025-06-08 12:40:52,888 - __main__ - INFO - MODEL SELECTION FOR RENTALS 
 

2025-06-08 12:40:52,923 - __main__ - INFO - Train set size: 4000, Test set size: 1000
2025-06-08 12:40:52,925 - __main__ - INFO - Columns in train set: ['bedrooms', 'floor', 'usable_area', 'internal_area', 'gross_area', 'primary_area', 'sqm_pr_bedroom', 'dealer_True', 'property_type_Hybel', 'property_type_Leilighet', 'property_type_Rekkehus', 'property_type_Rom i bofellesskap', 'property_type_Tomannsbolig', 'day', 'month', 'year', 'lat', 'lng']
2025-06-08 12:40:52,926 - __main__ - INFO - Training RandomForest model with hyperparameter tuning...


Fitting 3 folds for each of 50 candidates, totalling 150 fits


I0000 00:00:1749386452.945328   14692 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


KeyboardInterrupt: 

In [9]:
# Preview the first rows of the rentals sample (indexed by item_id).
df_a.head()

Unnamed: 0_level_0,monthly_rent,bedrooms,floor,usable_area,primary_area,sqm_pr_bedroom,dealer_True,day,month,year,property_type_Hybel,property_type_Leilighet,property_type_Rekkehus,property_type_Rom i bofellesskap,property_type_Tomannsbolig,lat,lng
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
401881293,13000.0,2.0,1.0,66.0,66.0,33.0,False,5,5,2025,False,True,False,False,False,62.47063,6.13275
383304988,12000.0,1.0,0.0,33.0,33.0,33.0,True,1,9,2024,False,True,False,False,False,59.95766,10.79132
369117863,90000.0,20.0,10.0,750.0,750.0,37.5,False,1,9,2024,False,True,False,False,False,59.41794,5.40878
397427951,6000.0,1.0,0.0,120.0,10.0,10.0,False,16,3,2025,False,False,False,True,False,59.8451,10.82627
369129316,9000.0,1.0,0.0,650.0,650.0,650.0,False,1,9,2024,False,True,False,False,False,59.25377,11.1801


In [4]:
# Summary statistics of the rentals sample, transposed so each column becomes a row.
df_a.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
monthly_rent,5000.0,33218.3478,48307.209021,1234.0,11700.0,15500.0,22000.0,295000.0
bedrooms,5000.0,4.826,7.140807,0.0,1.0,2.0,4.0,120.0
floor,4096.0,37.321289,1718.650451,0.0,2.0,3.0,10.0,110000.0
usable_area,5000.0,235.3226,253.863865,1.0,51.0,86.0,420.0,980.0
primary_area,5000.0,221.999,248.712788,1.0,47.0,80.0,400.0,980.0
sqm_pr_bedroom,5000.0,66.384905,109.823904,0.0,26.5,36.0,49.0,900.0
day,5000.0,6.8008,8.787569,1.0,1.0,1.0,10.0,31.0
month,5000.0,6.6714,2.817483,1.0,4.0,9.0,9.0,9.0
year,5000.0,2024.4414,0.496604,2024.0,2024.0,2024.0,2025.0,2025.0
lat,5000.0,60.395645,1.795015,43.1609,59.81391,59.926426,60.19183,69.773632


In [11]:
import re
def extract_int(x: str) -> int | None:
    """
    Trekker ut det første tallet fra en streng og konverterer det til et heltall.
    Håndterer desimaltall korrekt (f.eks. "2.0" blir 2).
    """
    # Finner den første sekvensen som ser ut som et tall (inkludert punktum)
    treff = re.search(r'[\d.]+', x)

    if treff:
        nummer_str = treff.group(0)
        try:
            # 1. Konverter til float for å håndtere desimaler
            nummer_float = float(nummer_str)
            # 2. Konverter floaten til et heltall (kutter desimalene)
            return int(nummer_float)
        except ValueError:
            # Håndterer tilfeller der strengen ikke er et gyldig tall (f.eks. "1.2.3")
            return None

    return None

In [12]:
# A decimal value is truncated to its integer part.
extract_int('2.0 mnd')

2

In [14]:
# The dot is parsed as a decimal point, not a thousands separator, so this yields 10.
extract_int('10.000 kr pr mnd')

10