In [19]:
import os
from catboost import CatBoostRegressor
# NOTE(review): this chdir is relative, so every re-run of this cell moves the
# working directory two more levels up. Run it once per kernel session.
# Presumably it lands on the project root so that `src` below is importable — TODO confirm.
os.chdir("../..")
from sibr_module import BigQuery, Logger, CStorage
import numpy as np
import pandas as pd
# Display every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
import warnings
# Suppress UserWarnings whose message matches the "feature names ... LGBMRegressor" pattern.
# NOTE(review): the training cell visible in this notebook uses CatBoostRegressor,
# so this filter may be a leftover — confirm before removing.
warnings.filterwarnings("ignore", category=UserWarning, message=".*feature names, but LGBMRegressor was fitted with feature names*")
from src.sibr_market_training import Train

In [3]:
# Session-wide configuration; the names below are read as globals by later cells.
dataset = 'rentals'
# Logger name is 'train' + capitalized dataset, i.e. 'trainRentals'.
logger = Logger(f'train{dataset.capitalize()}')
# BigQuery helper; read_query / read_data_oslo call bq.read_bq(...) on this global.
bq = BigQuery(logger=logger,dataset=dataset)
# Cloud Storage helper bound to the 'sibr-market' bucket.
cs = CStorage(logger=logger, bucket_name='sibr-market')
# Forwarded as `save_to_gc` to every t.train(...) call in the training cell.
save_to_gc = True
# Trainer object whose .train(...) and .logger are used in the cells below.
t = Train(dataset=dataset, logger = logger)

2025-08-02 21:53:37,046 - trainRentals - INFO - Cloud Logging is disabled. Using local logging to /Users/sigvardbratlie/Documents/Projects/sibr_market_training/logfiles/trainRentals.log
2025-08-02 21:53:37,050 - trainRentals - INFO - BigQuery client initialized with project_id: sibr-market
2025-08-02 21:53:37,052 - trainRentals - INFO - Google Cloud Storage client initialized with bucket: sibr-market
2025-08-02 21:53:37,055 - trainRentals - INFO - BigQuery client initialized with project_id: sibr-market
2025-08-02 21:53:37,057 - trainRentals - INFO - Google Cloud Storage client initialized with bucket: sibr-market
2025-08-02 21:53:37,058 - trainRentals - DEBUG - Dataset: rentals | | Replace: False


In [39]:
def read_query(dataset_name,limit : int = None,random_samples = None,last_scrape = False):
    """Build the rentals training query and load it through the global ``bq`` client.

    The query joins the pre-processed rentals with coordinates, municipality
    statistics and a reference rent. The reference rent comes from the postal
    code table when it holds at least 3 observations (``p.n``), otherwise from
    the municipality table.

    Args:
        dataset_name: Prefix of the ``api.<dataset_name>_postal`` and
            ``api.<dataset_name>_municipality`` reference tables.
            NOTE(review): the base tables (``pre_processed.rentals``,
            ``clean.rentals``) are hard-coded and do not follow this argument.
        limit: Optional maximum number of rows (appended as ``LIMIT``).
        random_samples: Optional fraction; rows are kept when ``RAND()`` is
            below it.
        last_scrape: Keep only rows dated on the latest ``scrape_date`` found
            in ``raw.rentals``.

    Returns:
        Whatever ``bq.read_bq(query)`` returns (a DataFrame in this notebook).

    Raises:
        ValueError: If both ``random_samples`` and ``last_scrape`` are set,
            since each of them contributes its own ``WHERE`` clause.
    """
    if random_samples and last_scrape:
        raise ValueError("Only one of the following two can be True at the same time: random_samples and last_scrape")
    query = f"""
SELECT
  a.*,
  CASE
    WHEN COALESCE(p.n, 0) < 3 THEN m.rent_pr_sqm
    ELSE p.rent_pr_sqm
  END AS ref_rent_pr_sqm,
  CASE
    WHEN COALESCE(p.n, 0) < 3 THEN m.rent_pr_bedroom
    ELSE p.rent_pr_bedroom
  END AS ref_rent_pr_bedroom,
  c.lat,
  c.lng,
  s.* EXCEPT (Kommune, Kommunenr, `År`)
FROM `sibr-market.pre_processed.rentals` a
    JOIN clean.rentals h ON h.item_id = a.item_id
    JOIN admin.coordinates c ON c.item_id = a.item_id
    LEFT JOIN `sibr-market.api.{dataset_name}_postal` p ON p.postal_code = h.postal_code
    LEFT JOIN `sibr-market.api.{dataset_name}_municipality` m ON m.municipality = h.municipality
    LEFT JOIN `sibr-market.admin.SSB_municipality` s ON LOWER(s.Kommune) = LOWER(h.municipality)
"""
    # Every optional clause ends with a newline so the next clause never gets
    # glued to the previous token (previously "... < 0.1LIMIT 10").
    if last_scrape:
        query += "WHERE DATE(a.year, a.month, a.day) = (SELECT MAX(PARSE_DATE('%Y-%m-%d', scrape_date)) FROM sibr-market.raw.rentals) \n"
    if random_samples:
        query += f"WHERE RAND() < {random_samples}\n"
    if limit:
        # int() guarantees a valid integer literal even if a float or str slips in.
        query += f"LIMIT {int(limit)}\n"
    return bq.read_bq(query)

def prep_data(df,unimportant_columns  = None,drop_hybel = False):
    """Prepare a raw query result for training without touching the input frame.

    Steps, in order:
      1. Optionally keep only the columns not listed in ``unimportant_columns``.
      2. Use ``item_id`` as the index when that column exists.
      3. Derive ``ref_rent``: per-sqm reference rent times ``primary_area``
         when ``primary_area`` exists, otherwise per-bedroom reference rent
         times ``bedrooms``. If neither column exists a warning is logged and
         ``ref_rent`` is left as delivered by the query (if present at all).
      4. Optionally drop rows flagged ``property_type_hybel`` together with the
         ``dealer_True`` column.
      5. Drop ``pre_processed_date`` when present.

    Args:
        df: Frame returned by the BigQuery read.
        unimportant_columns: Column names to exclude; ``None``/empty keeps all.
        drop_hybel: When true, remove rows where ``property_type_hybel`` is
            True and drop ``dealer_True``.

    Returns:
        A new DataFrame; the frame passed in is not modified.

    Raises:
        KeyError: If the reference-rent column required by the chosen branch
            is missing, or if ``drop_hybel`` is set and ``property_type_hybel``
            is missing.
    """
    if unimportant_columns:
        df = df[df.columns.difference(unimportant_columns)]
    # Always work on a private copy so re-running a cell never sees a frame
    # that a previous call already re-indexed or extended.
    df = df.copy()
    if "item_id" in df.columns:
        df = df.set_index('item_id')
    if 'primary_area' in df.columns:
        # Plain column product instead of a row-wise apply.
        df['ref_rent'] = df['ref_rent_pr_sqm'] * df['primary_area']
    elif 'bedrooms' in df.columns:
        # Bedroom count must be paired with the per-bedroom reference rent,
        # not the per-sqm one.
        df['ref_rent'] = df['ref_rent_pr_bedroom'] * df['bedrooms']
    else:
        logger.warning(f'Missing both primary_area and bedrooms in dataframe: {df.columns}')
    if drop_hybel:
        df = df[df["property_type_hybel"] != True].drop(columns=["dealer_True"], errors="ignore")
    return df.drop(columns=['pre_processed_date'], errors='ignore')

def read_data(dataset_name,limit : int = None,random_samples = None,unimportant_columns = None,last_scrape = False,drop_hybel = False):
    """Load the rows for ``dataset_name`` and return them prepared for training.

    The query-related arguments (``limit``, ``random_samples``, ``last_scrape``)
    are handed to ``read_query``; ``unimportant_columns`` and ``drop_hybel``
    are handed to ``prep_data``. The prepared frame is returned.
    """
    raw_frame = read_query(
        dataset_name=dataset_name,
        limit=limit,
        random_samples=random_samples,
        last_scrape=last_scrape,
    )
    return prep_data(
        raw_frame,
        unimportant_columns=unimportant_columns,
        drop_hybel=drop_hybel,
    )

def read_data_oslo(query, unimportant_columns : list = None, limit = None, random_samples = None, last_scrape = False,drop_hybel = False):
    """Run an Oslo-specific base query and return the prepared, one-hot encoded frame.

    Optional ``WHERE``/``LIMIT`` clauses are appended to ``query``, each on its
    own line, so the base query does not need to end with a newline. The
    result is passed through ``prep_data`` and ``district_name`` is expanded
    into dummy columns.

    Args:
        query: Base SQL without a trailing ``WHERE`` or ``LIMIT`` clause; it
            must expose a ``district_name`` column and the alias ``a`` used by
            the ``last_scrape`` filter.
        unimportant_columns: Column names to exclude in ``prep_data``;
            ``None``/empty keeps all.
        limit: Optional maximum number of rows.
        random_samples: Optional fraction; rows are kept when ``RAND()`` is
            below it.
        last_scrape: Keep only rows dated on the latest ``scrape_date`` found
            in ``raw.rentals``.
        drop_hybel: Forwarded to ``prep_data``.

    Returns:
        DataFrame with ``district_name`` replaced by ``district_name_*`` dummies.

    Raises:
        ValueError: If both ``random_samples`` and ``last_scrape`` are set,
            since each of them contributes its own ``WHERE`` clause.
    """
    if random_samples and last_scrape:
        raise ValueError("Only one of the following two can be True at the same time: random_samples and last_scrape")

    clauses = []
    if last_scrape:
        clauses.append("WHERE DATE(a.year, a.month, a.day) = (SELECT MAX(PARSE_DATE('%Y-%m-%d', scrape_date)) FROM sibr-market.raw.rentals)")
    if random_samples:
        clauses.append(f"WHERE RAND() < {random_samples}")
    if limit:
        # int() guarantees a valid integer literal even if a float or str slips in.
        clauses.append(f"LIMIT {int(limit)}")
    # Joining on newlines keeps every clause separated from the previous token,
    # whatever whitespace the base query ends with.
    sql = "\n".join([query.rstrip(), *clauses])

    df_o = bq.read_bq(sql)
    df_o = prep_data(df_o, unimportant_columns=unimportant_columns, drop_hybel=drop_hybel)
    return pd.get_dummies(data=df_o, columns=['district_name'])

# Base query for the Oslo co-living model, meant to be passed to read_data_oslo,
# which may append a WHERE/LIMIT clause (hence no trailing clause here).
#  - OsloRentals: clean rentals restricted to municipality 'oslo', with the
#    district name (BYDELSNAVN) looked up by postal code as `district_geo`.
#  - Main SELECT: pre-processed co-living rows joined to OsloRentals and
#    coordinates; `ref_rent` is the district-level monthly_rent from
#    api.rentals_co-living_oslo; `district_name` is looked up again by postal code.
# NOTE(review): geo_oslo is joined twice on the same key (once in the CTE, once
# in the outer query). If a postal code maps to several rows there, result rows
# are multiplied — confirm postnummer is unique in admin.geo_oslo.
query_co_living = """
WITH OsloRentals AS (
  SELECT
    h.*,
    go.BYDELSNAVN AS district_geo  
  FROM `sibr-market.clean.rentals` h
  LEFT JOIN `sibr-market.admin.geo_oslo` go ON go.postnummer = h.postal_code
  WHERE LOWER(h.municipality) = 'oslo'
)
SELECT
  a.*,
  d.monthly_rent AS ref_rent,
  c.lat,
  c.lng,
  go.BYDELSNAVN AS district_name
FROM `sibr-market.pre_processed.rentals_co-living` a
JOIN OsloRentals h ON h.item_id = a.item_id
JOIN admin.coordinates c ON c.item_id = a.item_id
LEFT JOIN `sibr-market.api.rentals_co-living_oslo` d ON d.district_name = h.district_geo
LEFT JOIN `sibr-market.admin.geo_oslo` go ON go.postnummer = h.postal_code
"""

# Base query for the Oslo rentals model, meant to be passed to read_data_oslo,
# which may append a WHERE/LIMIT clause (hence no trailing clause here).
#  - OsloRentals: clean rentals restricted to municipality 'oslo', with the
#    district name (BYDELSNAVN) looked up by postal code as `district_geo`.
#  - Main SELECT: pre-processed rentals joined to OsloRentals and coordinates;
#    the reference rents per sqm / per bedroom come from api.rentals_oslo,
#    matched on district.
# Plain string on purpose: nothing is interpolated, so no f-prefix (a literal
# brace added to the SQL later would otherwise be parsed as a placeholder).
# NOTE(review): geo_oslo is joined twice on the same key (once in the CTE, once
# in the outer query). If a postal code maps to several rows there, result rows
# are multiplied — confirm postnummer is unique in admin.geo_oslo.
query_oslo = """
  WITH OsloRentals AS (
  SELECT
    h.*,
    go.BYDELSNAVN AS district_geo  
  FROM `sibr-market.clean.rentals` h
  LEFT JOIN `sibr-market.admin.geo_oslo` go ON go.postnummer = h.postal_code
  WHERE LOWER(h.municipality) = 'oslo'
)
SELECT
  a.*,
  d.rent_pr_sqm AS ref_rent_pr_sqm,
  d.rent_pr_bedroom AS ref_rent_pr_bedroom,
  c.lat,
  c.lng,
  go.BYDELSNAVN AS district_name
FROM `sibr-market.pre_processed.rentals` a
JOIN OsloRentals h ON h.item_id = a.item_id
JOIN admin.coordinates c ON c.item_id = a.item_id
LEFT JOIN `sibr-market.api.rentals_oslo` d ON d.district_name = h.district_geo
LEFT JOIN `sibr-market.admin.geo_oslo` go ON go.postnummer = h.postal_code
  """

In [40]:
# Load the three training frames: all rentals, Oslo co-living, and Oslo rentals.
# Each call runs a BigQuery read, so this cell is the slow, network-bound step.
df_a = read_data('rentals',unimportant_columns=[])
df_co = read_data_oslo(query=query_co_living,unimportant_columns=[])
df_o = read_data_oslo(query=query_oslo ,unimportant_columns=[])
# Record the row count of each frame in the training log.
t.logger.info(f"Length's of dataframes: \t rentals {len(df_a)}, rental_oslo {len(df_o)}, rental_co-living {len(df_co)}")

2025-08-02 22:42:54,898 - trainRentals - INFO - 125958 rows read from rentals. Query: 
SELECT
  a.*,
  CASE
    WHEN COALESCE(p.n, 0) < 3 THEN m.rent_pr_sqm
    ELSE p.rent_pr_sqm
  END ... (truncated)
2025-08-02 22:42:59,194 - trainRentals - INFO - 9666 rows read from rentals. Query: 
WITH OsloRentals AS (
  SELECT
    h.*,
    go.BYDELSNAVN AS district_geo  
  FROM `sibr-market.cle... (truncated)
       'eq_tv_True', 'eq_hot_water_True', 'eq_water_True', 'eq_heating_True',
       'eq_parking_True', 'eq_household_appliances_True', 'eq_furniture_True',
       'day', 'month', 'year', 'pre_processed_date', 'ref_rent', 'lat', 'lng',
       'district_name'],
      dtype='object')
2025-08-02 22:43:01,640 - trainRentals - INFO - 36421 rows read from rentals. Query: 
  WITH OsloRentals AS (
  SELECT
    h.*,
    go.BYDELSNAVN AS district_geo  
  FROM `sibr-market.c... (truncated)
2025-08-02 22:43:01,903 - trainRentals - INFO - Length's of dataframes: 	 rentals 125958, rental_oslo 36421, renta

In [41]:
# Hyper-parameters for each CatBoost model; the `model__` prefix is part of the
# key expected by t.train. Values are kept exactly as they were recorded.
params_rentals = dict(
    model__depth=7,
    model__iterations=1283,
    model__l2_leaf_reg=np.float64(4.296367842118345),
    model__learning_rate=np.float64(0.031680214273730974),
    model__random_state=98,
)
params_co = dict(
    model__depth=9,
    model__iterations=809,
    model__l2_leaf_reg=np.float64(3.241509528627272),
    model__learning_rate=np.float64(0.04135867955263894),
    model__random_state=36,
)
params_o = dict(
    model__depth=6,
    model__iterations=1151,
    model__l2_leaf_reg=np.float64(1.6536379754545745),
    model__learning_rate=np.float64(0.04123212300418546),
    model__random_state=61,
)

# Keyword arguments that are identical for all three training runs.
_train_common = dict(
    target='monthly_rent',
    model=CatBoostRegressor,
    save_to_gc=save_to_gc,
    log_target=True,
)

# Training order is unchanged: all rentals, then Oslo rentals, then Oslo co-living.
pipeline_rentals = t.train(df=df_a, params=params_rentals, data_name='rentals', **_train_common)
pipeline_rental_oslo = t.train(df=df_o, params=params_o, data_name='rentals_oslo', **_train_common)
pipeline_coliv = t.train(df=df_co, params=params_co, data_name='rentals_co-living', **_train_common)


2025-08-02 22:43:13,589 - trainRentals - INFO - 
 
TRAINING CatBoostRegressor model for RENTALS
2025-08-02 22:43:13,647 - trainRentals - INFO - Train set size: 100766, Test set size: 25192
2025-08-02 22:43:13,649 - trainRentals - INFO - Target: monthly_rent and log_target: True


0:	learn: 0.4052329	total: 7.52ms	remaining: 9.64s
1:	learn: 0.3974898	total: 14.6ms	remaining: 9.34s
2:	learn: 0.3897851	total: 20.2ms	remaining: 8.6s
3:	learn: 0.3825563	total: 25.9ms	remaining: 8.3s
4:	learn: 0.3755198	total: 31.3ms	remaining: 8.01s
5:	learn: 0.3688182	total: 36.6ms	remaining: 7.79s
6:	learn: 0.3623265	total: 42.1ms	remaining: 7.67s
7:	learn: 0.3562402	total: 47.7ms	remaining: 7.6s
8:	learn: 0.3503759	total: 53.3ms	remaining: 7.54s
9:	learn: 0.3446964	total: 59.1ms	remaining: 7.52s
10:	learn: 0.3391318	total: 64.6ms	remaining: 7.46s
11:	learn: 0.3339400	total: 70.4ms	remaining: 7.46s
12:	learn: 0.3289900	total: 75.9ms	remaining: 7.41s
13:	learn: 0.3242995	total: 82ms	remaining: 7.43s
14:	learn: 0.3197783	total: 87.9ms	remaining: 7.43s
15:	learn: 0.3153255	total: 93.3ms	remaining: 7.39s
16:	learn: 0.3110801	total: 99.6ms	remaining: 7.42s
17:	learn: 0.3069007	total: 105ms	remaining: 7.39s
18:	learn: 0.3029996	total: 111ms	remaining: 7.38s
19:	learn: 0.2992350	total: 1

2025-08-02 22:43:21,134 - trainRentals - INFO - MSE test: 6963916.341650302,r2 test: 0.8217171352659431, mse train: 6444093.565109789, r2 train 0.8364554720555463 for rentals with target monthly_rent and log_target True
2025-08-02 22:43:21,337 - trainRentals - INFO - Blob models.json downloaded to /tmp/models.json.
2025-08-02 22:43:21,346 - trainRentals - INFO - Oppdaterer eksisterende modell 'rentals' i manifestet.
2025-08-02 22:43:21,642 - trainRentals - INFO - File /tmp/models.json uploaded to models.json in bucket sibr-market.
2025-08-02 22:43:22,102 - trainRentals - INFO - File /tmp/tmp_file.pkl uploaded to models/CatBoostRegressor_rentals.pkl in bucket sibr-market.
2025-08-02 22:43:22,118 - trainRentals - INFO - 
 
TRAINING CatBoostRegressor model for RENTALS_OSLO
2025-08-02 22:43:22,135 - trainRentals - INFO - Train set size: 29136, Test set size: 7285
2025-08-02 22:43:22,135 - trainRentals - INFO - Target: monthly_rent and log_target: True


0:	learn: 0.3491959	total: 1.94ms	remaining: 2.23s
1:	learn: 0.3397986	total: 4.09ms	remaining: 2.35s
2:	learn: 0.3310105	total: 6.11ms	remaining: 2.34s
3:	learn: 0.3227150	total: 8.04ms	remaining: 2.31s
4:	learn: 0.3149172	total: 9.9ms	remaining: 2.27s
5:	learn: 0.3075617	total: 11.8ms	remaining: 2.25s
6:	learn: 0.3004788	total: 13.7ms	remaining: 2.23s
7:	learn: 0.2937654	total: 16.3ms	remaining: 2.32s
8:	learn: 0.2874371	total: 18.1ms	remaining: 2.29s
9:	learn: 0.2814010	total: 19.9ms	remaining: 2.27s
10:	learn: 0.2758461	total: 21.9ms	remaining: 2.26s
11:	learn: 0.2702860	total: 23.8ms	remaining: 2.25s
12:	learn: 0.2651721	total: 25.6ms	remaining: 2.24s
13:	learn: 0.2605789	total: 27.5ms	remaining: 2.24s
14:	learn: 0.2557669	total: 29.5ms	remaining: 2.23s
15:	learn: 0.2514837	total: 31.4ms	remaining: 2.22s
16:	learn: 0.2475217	total: 33.3ms	remaining: 2.22s
17:	learn: 0.2434788	total: 35.1ms	remaining: 2.21s
18:	learn: 0.2398144	total: 37ms	remaining: 2.2s
19:	learn: 0.2362390	total

2025-08-02 22:43:24,571 - trainRentals - INFO - MSE test: 7442482.694775097,r2 test: 0.8454576941707557, mse train: 5516937.157525392, r2 train 0.8825802459664778 for rentals_oslo with target monthly_rent and log_target True


1117:	learn: 0.1280964	total: 2.18s	remaining: 64.4ms
1118:	learn: 0.1280815	total: 2.18s	remaining: 62.5ms
1119:	learn: 0.1280590	total: 2.19s	remaining: 60.5ms
1120:	learn: 0.1280486	total: 2.19s	remaining: 58.6ms
1121:	learn: 0.1280320	total: 2.19s	remaining: 56.6ms
1122:	learn: 0.1280166	total: 2.19s	remaining: 54.6ms
1123:	learn: 0.1280014	total: 2.19s	remaining: 52.7ms
1124:	learn: 0.1279620	total: 2.19s	remaining: 50.7ms
1125:	learn: 0.1279493	total: 2.2s	remaining: 48.8ms
1126:	learn: 0.1279312	total: 2.2s	remaining: 46.8ms
1127:	learn: 0.1279206	total: 2.2s	remaining: 44.9ms
1128:	learn: 0.1278978	total: 2.2s	remaining: 43ms
1129:	learn: 0.1278820	total: 2.21s	remaining: 41ms
1130:	learn: 0.1278728	total: 2.21s	remaining: 39ms
1131:	learn: 0.1278380	total: 2.21s	remaining: 37.1ms
1132:	learn: 0.1278173	total: 2.21s	remaining: 35.2ms
1133:	learn: 0.1278041	total: 2.21s	remaining: 33.2ms
1134:	learn: 0.1277818	total: 2.22s	remaining: 31.2ms
1135:	learn: 0.1277612	total: 2.22s	re

2025-08-02 22:43:24,712 - trainRentals - INFO - Blob models.json downloaded to /tmp/models.json.
2025-08-02 22:43:24,718 - trainRentals - INFO - Oppdaterer eksisterende modell 'rentals_oslo' i manifestet.
2025-08-02 22:43:24,833 - trainRentals - INFO - File /tmp/models.json uploaded to models.json in bucket sibr-market.
2025-08-02 22:43:25,185 - trainRentals - INFO - File /tmp/tmp_file.pkl uploaded to models/CatBoostRegressor_rentals_oslo.pkl in bucket sibr-market.
2025-08-02 22:43:25,191 - trainRentals - INFO - 
 
TRAINING CatBoostRegressor model for RENTALS_CO-LIVING
2025-08-02 22:43:25,199 - trainRentals - INFO - Train set size: 7732, Test set size: 1934
2025-08-02 22:43:25,200 - trainRentals - INFO - Target: monthly_rent and log_target: True


0:	learn: 0.2113289	total: 6.95ms	remaining: 5.62s
1:	learn: 0.2097661	total: 13.2ms	remaining: 5.34s
2:	learn: 0.2085330	total: 19.2ms	remaining: 5.15s
3:	learn: 0.2073784	total: 24.7ms	remaining: 4.97s
4:	learn: 0.2061640	total: 30.2ms	remaining: 4.86s
5:	learn: 0.2050392	total: 35.7ms	remaining: 4.78s
6:	learn: 0.2041421	total: 41.2ms	remaining: 4.72s
7:	learn: 0.2031191	total: 46.6ms	remaining: 4.67s
8:	learn: 0.2020963	total: 52.1ms	remaining: 4.63s
9:	learn: 0.2013189	total: 57.4ms	remaining: 4.58s
10:	learn: 0.2003813	total: 62.6ms	remaining: 4.54s
11:	learn: 0.1995856	total: 68ms	remaining: 4.51s
12:	learn: 0.1989084	total: 73.3ms	remaining: 4.49s
13:	learn: 0.1982221	total: 78.5ms	remaining: 4.46s
14:	learn: 0.1972895	total: 83.7ms	remaining: 4.43s
15:	learn: 0.1964460	total: 89ms	remaining: 4.41s
16:	learn: 0.1956504	total: 94.3ms	remaining: 4.39s
17:	learn: 0.1948403	total: 99.6ms	remaining: 4.38s
18:	learn: 0.1942487	total: 105ms	remaining: 4.37s
19:	learn: 0.1934933	total:

2025-08-02 22:43:29,582 - trainRentals - INFO - MSE test: 2175009.0441982867,r2 test: 0.40425013747517735, mse train: 1374209.0270799177, r2 train 0.6318844188707662 for rentals_co-living with target monthly_rent and log_target True
2025-08-02 22:43:29,698 - trainRentals - INFO - Blob models.json downloaded to /tmp/models.json.
2025-08-02 22:43:29,703 - trainRentals - INFO - Oppdaterer eksisterende modell 'rentals_co-living' i manifestet.


803:	learn: 0.1202969	total: 4.23s	remaining: 26.3ms
804:	learn: 0.1202668	total: 4.24s	remaining: 21.1ms
805:	learn: 0.1202058	total: 4.24s	remaining: 15.8ms
806:	learn: 0.1201106	total: 4.25s	remaining: 10.5ms
807:	learn: 0.1200766	total: 4.25s	remaining: 5.26ms
808:	learn: 0.1200289	total: 4.26s	remaining: 0us


2025-08-02 22:43:29,872 - trainRentals - INFO - File /tmp/models.json uploaded to models.json in bucket sibr-market.
2025-08-02 22:43:30,425 - trainRentals - INFO - File /tmp/tmp_file.pkl uploaded to models/CatBoostRegressor_rentals_co-living.pkl in bucket sibr-market.
