In [1]:
import os
from prediction_commonfunctions import *
os.chdir("../..")
from sibr_module import BigQuery, Logger, CStorage
from src.sibr_market_training import Predict
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

In [2]:
# Notebook-wide configuration.
dataset = 'homes'
save_to_bq = True  # gate for the save cell further down

# One logger instance is shared by every client created in this cell.
logger = Logger('predict' + dataset.capitalize())
bq, cs = (
    BigQuery(logger=logger, dataset=dataset),
    CStorage(logger=logger, bucket_name='sibr-market'),
)
p = Predict(dataset=dataset, logger=logger)

2025-08-10 16:38:35,667 - predictHomes - INFO - Cloud Logging is disabled. Using local logging to /Users/sigvardbratlie/Documents/Projects/sibr_market_training/logfiles/predictHomes.log
2025-08-10 16:38:35,675 - predictHomes - INFO - BigQuery client initialized with project_id: sibr-market
2025-08-10 16:38:35,677 - predictHomes - INFO - Google Cloud Storage client initialized with bucket: sibr-market
2025-08-10 16:38:35,680 - predictHomes - INFO - BigQuery client initialized with project_id: sibr-market
2025-08-10 16:38:35,682 - predictHomes - INFO - Google Cloud Storage client initialized with bucket: sibr-market
2025-08-10 16:38:35,683 - predictHomes - DEBUG - Dataset: homes | | Replace: False


In [3]:
# Load the model registry and parse `created_at` from epoch milliseconds.
models = p.cs.download('models.json', read_in_file=True)
models = models.assign(created_at=pd.to_datetime(models['created_at'], unit='ms'))

2025-08-04 12:28:26,527 - predictHomes - INFO - Read in models.json


In [30]:
# Read every prediction dataset in one call and pick out the frames by name.
data = p.bq.read_homes(task = "predict")
df_a = data.get("homes_apartments")
df_h = data.get("homes_houses")
df_o = data.get("homes_apartments_oslo")
df_r = data.get("homes_rentals_oslo")

# Fail fast: `.get` returns None for an unknown key, which otherwise only
# surfaces later as "'NoneType' object has no attribute ..." deep inside the
# prediction helpers.
_missing = [
    name
    for name, frame in (
        ("homes_apartments", df_a),
        ("homes_houses", df_h),
        ("homes_apartments_oslo", df_o),
        ("homes_rentals_oslo", df_r),
    )
    if frame is None
]
if _missing:
    raise KeyError(
        f"read_homes(task='predict') returned no data for {_missing}; "
        f"available keys: {list(data)}"
    )

add_columns = [
    'power_True',
    'internet_True',
    'tv_True',
    'fiber_True',
    'hot_water_True',
    'heating_True',
    'parking_True'
]
# NOTE(review): this loop is currently a no-op -- 'eq_internet_True' is not one
# of the names in `add_columns`, so the condition never holds and `df_r` is left
# untouched. Left as-is because the intended behaviour is unclear; confirm
# whether these columns should be added to the rentals frame.
for col in add_columns:
    if col == 'eq_internet_True':
        df_r[col] = True

2025-08-03 14:41:57,252 - predictHomes - INFO - 7367 rows read from homes. Query: 
            SELECT
            a.*,
            CASE
                WHEN COALESCE(p.n, 0) < 3 THEN... (truncated)
2025-08-03 14:41:59,635 - predictHomes - INFO - 8265 rows read from homes. Query: 
            SELECT
            a.*,
            CASE
                WHEN COALESCE(p.n, 0) < 3 THEN... (truncated)
2025-08-03 14:42:01,915 - predictHomes - INFO - 1196 rows read from homes. Query: 
            WITH OsloHomes AS (
                SELECT
                h.*,
                go.BYDE... (truncated)
2025-08-03 14:42:04,059 - predictHomes - INFO - 1373 rows read from homes. Query: 
                        WITH OsloHomesRentals AS (
                        SELECT
                 ... (truncated)


In [90]:
def _first_model_row(dataset_name):
    """Return the first `models` row whose `dataset` equals `dataset_name`, as a dict."""
    matches = models[models['dataset'] == dataset_name]
    return matches.iloc[0].to_dict()


# Registry rows for the four models used in this notebook.
res_a, res_h, res_o, res_r = map(
    _first_model_row,
    ('homes_apartments', 'homes_houses', 'homes_apartments_oslo', 'rentals_oslo'),
)

# Download the object each registry row points at (same order as above).
m_a, m_h, m_o, m_r = (
    p.cs.download(row.get('filename'), read_in_file=True)
    for row in (res_a, res_h, res_o, res_r)
)

2025-08-04 14:10:19,030 - predictHomes - INFO - Read in XGBRegressor_homes_apartments.pkl
2025-08-04 14:10:19,301 - predictHomes - INFO - Read in CatBoostRegressor_homes_houses.pkl
2025-08-04 14:10:19,694 - predictHomes - INFO - Read in XGBRegressor_homes_apartments_oslo.pkl
2025-08-04 14:10:20,505 - predictHomes - INFO - Read in CatBoostRegressor_rentals_oslo.pkl


In [24]:
# Apartments (whole country and Oslo) go through the shared prediction helper.
y_pred_a = p.predict_data(dataframe = df_a,
                                pipeline = m_a,
                                model_results=res_a,
                                )
y_pred_o = p.predict_data(df_o,
                                pipeline = m_o,
                                model_results=res_o,
                                )

# The rentals model is called directly, so the frame is aligned with the
# training columns here first.
df_r = p.ensure_columns(df = df_r,
                            training_columns=list(res_r.get('training_columns').keys()),
                            data_name='rentals_oslo'
                            )
# A model trained on a log1p-transformed target predicts in log space; the
# inverse transform is expm1 (as run() does further down), not log1p.
_raw_pred_r = m_r.predict(df_r)
y_pred_r = np.expm1(_raw_pred_r) if res_r.get('log_target') else _raw_pred_r


AttributeError: 'NoneType' object has no attribute 'drop'

In [None]:
def _as_renovated(frame):
    """Copy the rows flagged as fixer-uppers and clear the flag on the copy."""
    subset = frame[frame['fixer_upper_True'] == True].copy()
    subset['fixer_upper_True'] = False
    return subset


# Re-predict fixer-uppers with the flag switched off (whole country).
df_a_rehab = _as_renovated(df_a)
y_pred_rehab = p.predict_data(dataframe=df_a_rehab, pipeline=m_a, model_results=res_a)

# Same for the Oslo apartments model.
df_o_rehab = _as_renovated(df_o)
y_pred_rehab_o = p.predict_data(dataframe=df_o_rehab, pipeline=m_o, model_results=res_o)

In [None]:
def _prediction_frame(source, values, label):
    """Pair the index of `source` with its predictions and tag them with `label`."""
    return pd.DataFrame({
        'item_id': source.index,
        'predicted_price': values,
        'model': label,
    })


pred_a = _prediction_frame(df_a, y_pred_a, 'apartments')
pred_o = _prediction_frame(df_o, y_pred_o, 'apartments_oslo')
pred_r = _prediction_frame(df_r, y_pred_r, 'rentals_oslo')
pred_a_rehab = _prediction_frame(df_a_rehab, y_pred_rehab, 'homes_rehab')
pred_o_rehab = _prediction_frame(df_o_rehab, y_pred_rehab_o, 'homes_rehab_oslo')

# Sale predictions are stacked into one frame; rentals stay separate.
pred = pd.concat([pred_a, pred_o, pred_a_rehab, pred_o_rehab])
pred['predict_date'] = pd.Timestamp.now()
pred_r['predict_date'] = pd.Timestamp.now()

2025-08-03 13:39:40,139 - predictHomes - INFO - 7367 rows read from homes. Query: 
            SELECT
            a.*,
            CASE
                WHEN COALESCE(p.n, 0) < 3 THEN... (truncated)
2025-08-03 13:39:42,373 - predictHomes - INFO - 8265 rows read from homes. Query: 
            SELECT
            a.*,
            CASE
                WHEN COALESCE(p.n, 0) < 3 THEN... (truncated)
2025-08-03 13:39:44,613 - predictHomes - INFO - 1196 rows read from homes. Query: 
            WITH OsloHomes AS (
                SELECT
                h.*,
                go.BYDE... (truncated)
2025-08-03 13:39:46,678 - predictHomes - INFO - 1373 rows read from homes. Query: 
                        WITH OsloHomesRentals AS (
                        SELECT
                 ... (truncated)


In [None]:
# Persist non-empty prediction frames, unless saving is switched off.
if not save_to_bq:
    p.logger.warning('No data saved to BQ as save_to_bq is set to False.')
else:
    for frame, table in ((pred, p.dataset), (pred_r, 'homes_rentals')):
        if not frame.empty:
            p.save_data(df=frame, table_name=table)

Unnamed: 0,item_id,bedrooms,floor,primary_area,sqm_pr_bedroom,day,month,year,property_type_enebolig,property_type_leilighet,property_type_rekkehus,property_type_tomannsbolig,ref_rent_pr_sqm,ref_rent_pr_bedroom,lat,lng,district_name
0,413072938,0,1,41,41,3,8,2025,False,True,False,False,5203.127630,172388.217277,59.917858,10.762587,Gamle Oslo
1,418182120,2,4,75,37,3,8,2025,False,True,False,False,3989.895404,132564.496758,59.914288,10.813805,Alna
2,388030969,2,3,86,43,3,8,2025,False,True,False,False,,,59.054120,10.124530,
3,409142115,1,3,67,67,3,8,2025,False,True,False,False,5655.046336,183018.102866,59.919815,10.716986,Frogner
4,417928179,2,2,84,42,3,8,2025,False,True,False,False,5139.469651,154541.888797,59.951590,10.735561,Nordre Aker
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,415345787,0,1,28,28,3,8,2025,False,True,False,False,5484.733923,165773.322431,59.929903,10.731984,St. Hanshaugen
1369,401839644,1,2,54,54,3,8,2025,False,True,False,False,5655.046336,183018.102866,59.926423,10.714139,Frogner
1370,409786264,2,2,89,44,3,8,2025,False,True,False,False,5484.733923,165773.322431,59.934265,10.744435,St. Hanshaugen
1371,418117769,2,3,81,40,3,8,2025,False,True,False,False,3989.895404,132564.496758,59.919919,10.815812,Alna


## CUSTOM PREDICTION

In [87]:
def get_api(postal_code) -> dict:
    """Fetch reference values for a postal code, falling back to its municipality.

    Queries ``api.homes_apartments_postal`` first; if that returns no rows,
    queries ``api.homes_apartments_municipality`` for the municipality that
    ``admin.geo_norge`` maps the postal code to.

    Args:
        postal_code: Four-digit postal code (leading zeros are significant).

    Returns:
        The first matching row as a dict.

    Raises:
        ValueError: If ``postal_code`` is not exactly four ASCII digits.
        Exception: If neither lookup returns any rows.
    """
    # The code is interpolated into SQL below, so only accept plain digits.
    code = str(postal_code).strip()
    if not (len(code) == 4 and code.isascii() and code.isdigit()):
        raise ValueError(f"Invalid postal code: {postal_code!r}")

    base = """
    SELECT
        price_pr_i_sqm AS ref_price_pr_i_sqm,
        monthly_common_cost_pr_sqm AS ref_monthly_common_cost_pr_sqm,
        salgstid AS ref_salgstid,
        * EXCEPT (price_pr_i_sqm,monthly_common_cost_pr_sqm,n,salgstid,internal_area, usable_area,price,price_pr_sqm)
    """

    query = base + f"""
    FROM api.homes_apartments_postal
    WHERE postal_code = '{code}'
    """
    data = bq.read_bq(query=query)
    if not data.empty:
        return data.iloc[0].to_dict()

    logger.info("No data from postal codes, trying municipality")
    query = base + f"""
    FROM api.homes_apartments_municipality
    WHERE municipality = (SELECT municipality FROM admin.geo_norge WHERE postal_code = '{code}')
    """
    data = bq.read_bq(query=query)
    if not data.empty:
        return data.iloc[0].to_dict()

    logger.error("No data from municipality")
    raise Exception("No data from municipality")

def get_ssb(postal_code) -> dict:
    """Fetch the ``admin.SSB_municipality`` row for the municipality of a postal code.

    The municipality is looked up in ``admin.geo_norge`` and matched
    case-insensitively against ``Kommune``.

    Args:
        postal_code: Four-digit postal code (leading zeros are significant).

    Returns:
        The first matching row as a dict, without the ``Kommunenr``,
        ``Kommune`` and ``År`` columns.

    Raises:
        ValueError: If ``postal_code`` is not exactly four ASCII digits.
        Exception: If the query returns no rows.
    """
    # The code is interpolated into SQL below, so only accept plain digits.
    code = str(postal_code).strip()
    if not (len(code) == 4 and code.isascii() and code.isdigit()):
        raise ValueError(f"Invalid postal code: {postal_code!r}")

    query = f"""SELECT * EXCEPT (Kommunenr,Kommune,`År`)
    FROM admin.SSB_municipality
    WHERE LOWER(Kommune) = LOWER((SELECT municipality FROM admin.geo_norge WHERE postal_code = '{code}'))
     """
    data = bq.read_bq(query=query)
    if data.empty:
        logger.error(f'No data from SSB on postal code {postal_code}')
        raise Exception("No data from SSB")
    return data.iloc[0].to_dict()

def get_date() -> dict:
    """Return today's local date as ``{"year", "month", "day"}`` integers.

    The clock is read once so the three parts always belong to the same
    instant (separate reads could straddle midnight and mix two dates).
    """
    now = pd.Timestamp.now()
    return {"year": now.year, "month": now.month, "day": now.day}

def get_base_apartments() -> dict:
    """Return the default boolean feature flags used for an apartment.

    Every flag is False except ``dealer_True``, ``eq_parking_tmp_True`` and
    ``ownership_type_eier `` (the trailing space is part of that key).
    """
    feature_names = (
        'dealer_True',
        'fixer_upper_True',
        'eq_parking_True',
        'eq_lift_True',
        'eq_fireplace_True',
        'eq_charging_possibility_True',
        'eq_aircondition_True',
        'eq_garden_True',
        'eq_parking_tmp_True',
        'eq_lift_tmp_True',
        'eq_west_facing_True',
        'ownership_type_andel',
        'ownership_type_annet',
        'ownership_type_eier ',
        'ownership_type_obligasjon',
    )
    enabled = {'dealer_True', 'eq_parking_tmp_True', 'ownership_type_eier '}
    return {name: name in enabled for name in feature_names}

def get_base_houses() -> dict:
    """Return the default boolean feature flags used for a house.

    Every flag is False except ``dealer_True``, ``eq_parking_tmp_True``,
    ``ownership_type_eier `` (the trailing space is part of that key) and
    ``property_type_enebolig``.
    """
    feature_names = (
        'dealer_True',
        'fixer_upper_True',
        'eq_parking_True',
        'eq_lift_True',
        'eq_fireplace_True',
        'eq_charging_possibility_True',
        'eq_aircondition_True',
        'eq_garden_True',
        'eq_parking_tmp_True',
        'eq_lift_tmp_True',
        'eq_west_facing_True',
        'ownership_type_andel',
        'ownership_type_annet',
        'ownership_type_eier ',
        'ownership_type_obligasjon',
        'property_type_bygård_flermannsbolig',
        'property_type_enebolig',
        'property_type_gårdsbruk_småbruk',
        'property_type_rekkehus',
        'property_type_tomannsbolig',
    )
    enabled = {
        'dealer_True',
        'eq_parking_tmp_True',
        'ownership_type_eier ',
        'property_type_enebolig',
    }
    return {name: name in enabled for name in feature_names}

def get_geo(address) -> dict:
    """Geocode an address with Nominatim and extract coordinates and postal code.

    Args:
        address: Free-text address passed to ``Nominatim.geocode``.

    Returns:
        A dict with ``lat`` and ``lng``, plus ``postal_code`` when a
        four-digit part is found in the result's ``display_name``.

    Raises:
        Exception: If the geocoder returns no result ("No data from address"),
            or whatever the geocoder itself raises. Errors are logged and then
            re-raised; previously they were swallowed and the function
            returned None despite its ``-> dict`` annotation.
    """
    def _is_postal_code(text: str) -> bool:
        # str checks instead of int(): int("Oslo") raises ValueError, which
        # used to abort the whole lookup for any 4-letter place name.
        return len(text) == 4 and text.isascii() and text.isdigit()

    geo = Nominatim(user_agent="predict_homes")
    try:
        coor = geo.geocode(address)
    except Exception as e:
        logger.error(e)
        raise
    if not coor:
        logger.error('No data from address')
        raise Exception("No data from address")

    data = {"lat" : coor.latitude,
            "lng" : coor.longitude,}

    parts = [part.strip() for part in (coor.raw.get("display_name") or "").split(",")]
    # The postal code is first looked for in the second-to-last part;
    # otherwise fall back to the last four-digit part anywhere in the name.
    if len(parts) >= 2 and _is_postal_code(parts[-2]):
        data["postal_code"] = parts[-2]
    else:
        candidates = [part for part in parts if _is_postal_code(part)]
        if candidates:
            data["postal_code"] = candidates[-1]
        else:
            logger.error(f"No postal code found in geocoder result for {address!r}")
    return data

In [88]:
def build_data(user_input : dict, base : dict = None) -> dict:
    """Assemble the raw feature dict for a single custom prediction.

    Merges, in increasing priority: geocoding result, base flags, today's
    date, SSB municipality data, API reference data, and finally the user's
    own input (which therefore overrides everything else).

    Args:
        user_input: Listing attributes; its ``address`` entry is geocoded.
        base: Default feature flags. Defaults to ``get_base_apartments()``.
            NOTE(review): the original called ``get_base()``, which is not
            defined in the visible notebook -- confirm it is not expected to
            come from ``prediction_commonfunctions``.

    Returns:
        The merged dict without the ``postal_code`` and ``address`` helper keys.

    Raises:
        ValueError: If the address could not be resolved to a postal code.
    """
    geo = get_geo(user_input.get("address"))
    if not geo or not geo.get("postal_code"):
        raise ValueError(
            f"Could not resolve a postal code for address {user_input.get('address')!r}"
        )
    postal_code = geo.get("postal_code")

    if base is None:
        base = get_base_apartments()
    date = get_date()
    ssb = get_ssb(postal_code)
    api = get_api(postal_code)

    data = geo | base | date | ssb | api | user_input
    # Helper keys only; tolerate their absence instead of raising KeyError.
    data.pop("postal_code", None)
    data.pop("address", None)
    return data
def prep_data(data : dict,columns: list) -> pd.DataFrame:
    """Derive the engineered features and return a one-row frame in training order.

    Derived values are computed from the full ``data`` dict and only then
    restricted to ``columns``. Helper inputs that are not training columns
    themselves (e.g. ``external_area``) therefore no longer cause a KeyError.

    Args:
        data: Raw feature values keyed by name.
        columns: Training column names, in the order the model expects.

    Returns:
        A single-row DataFrame with exactly ``columns``; names absent from
        ``data`` and not derived here are NaN.
    """
    features = pd.Series(data)

    usable_area = features.get("usable_area", np.nan)
    internal_area = features.get("internal_area", np.nan)
    external_area = features.get("external_area", np.nan)

    # Fill in whichever of the two areas is missing from the other one.
    if pd.isna(internal_area) and not pd.isna(usable_area):
        internal_area = usable_area - external_area
    elif pd.isna(usable_area) and not pd.isna(internal_area):
        usable_area = internal_area + external_area
    features["internal_area"] = internal_area
    features["usable_area"] = usable_area

    # Reference price is based on internal area, with usable area as fallback.
    price_area = usable_area if pd.isna(internal_area) else internal_area
    features["ref_price"] = features.get("ref_price_pr_i_sqm", np.nan) * price_area

    bedrooms = features.get("bedrooms", np.nan)
    # Zero bedrooms would divide by zero; use the full area instead, as the
    # rentals rows shown in this notebook do (bedrooms 0 -> sqm_pr_bedroom ==
    # area). NOTE(review): confirm the same convention holds for the sale
    # models, and whether the value should be floored -- the registry lists
    # sqm_pr_bedroom as Int64.
    is_studio = (not pd.isna(bedrooms)) and bedrooms == 0
    features["sqm_pr_bedroom"] = usable_area if is_studio else usable_area / bedrooms
    features["rooms"] = bedrooms + 1

    return features.reindex(columns).to_frame().T

def run(user_input : dict,model : "sklearn.pipeline.Pipeline",log_target = True, model_results : dict = None):
    """Predict a price for one user-described listing.

    Args:
        user_input: Listing attributes. Must contain ``address`` and at least
            one of ``usable_area`` / ``internal_area``.
        model: Fitted estimator exposing ``predict``. The annotation is a
            string so that defining this function does not require
            ``sklearn`` to be imported in the notebook.
        log_target: If True, the prediction is mapped back with ``np.expm1``.
        model_results: Registry row whose ``training_columns`` define the
            feature order. Defaults to ``res_a`` (the previous hard-coded
            behaviour); pass the row matching ``model`` for other models.

    Returns:
        The array returned by ``model.predict``, back-transformed when
        ``log_target`` is set.

    Raises:
        ValueError: If the required keys are missing from ``user_input``.
    """
    if "usable_area" not in user_input and "internal_area" not in user_input:
        raise ValueError("One of the following is required: Usable area or internal area")
    if "address" not in user_input:
        raise ValueError("Address is required")
    if model_results is None:
        model_results = res_a
    data = build_data(user_input)
    columns = list(model_results.get("training_columns").keys())
    data_ready = prep_data(data,columns)
    prediction = model.predict(data_ready)
    return np.expm1(prediction) if log_target else prediction

In [89]:
# Example listing; anything not given here is filled in by build_data().
user_input = dict(
    address="Teglverksfaret 14, 1405 Langhus",
    usable_area=97,
    bedrooms=4,
    floor=3,
    balcony=14,
    eq_lift_True=True,
    eq_parking_True=True,
    eq_west_facing_True=True,
    build_year=2021,
)
res = run(user_input, model=m_a)
res

2025-08-04 13:41:43,900 - predictHomes - INFO - 1 rows read from homes. Query: SELECT * EXCEPT (Kommunenr,Kommune,`År`)
    FROM admin.SSB_municipality
    WHERE LOWER(Kommune) = ... (truncated)
2025-08-04 13:41:46,056 - predictHomes - INFO - 1 rows read from homes. Query: 
    SELECT
        price_pr_i_sqm AS ref_price_pr_i_sqm,
        monthly_common_cost_pr_sqm AS ref_... (truncated)


array([4081675.], dtype=float32)

In [84]:
# Manual walk-through of the steps inside run(), to inspect the prepared row.
columns = [*res_a.get("training_columns").keys()]
data = build_data(user_input)
data_ready = prep_data(data, columns)

2025-08-04 13:38:48,797 - predictHomes - INFO - 1 rows read from homes. Query: SELECT * EXCEPT (Kommunenr,Kommune,`År`)
    FROM admin.SSB_municipality
    WHERE LOWER(Kommune) = ... (truncated)
2025-08-04 13:38:51,120 - predictHomes - INFO - 1 rows read from homes. Query: 
    SELECT
        price_pr_i_sqm AS ref_price_pr_i_sqm,
        monthly_common_cost_pr_sqm AS ref_... (truncated)


In [83]:
# Display the first (only) prepared feature row.
data_ready.iloc[0]

joint_debt                             578568.811475
monthly_common_cost                      3675.370998
collective_assets                       16459.833333
bedrooms                                           4
internal_area                              91.172932
usable_area                                       97
external_area                               5.827068
floor                                       2.221805
balcony                                    22.237443
build_year                               2001.656126
rooms                                              5
plot_size                               14677.823529
sqm_pr_bedroom                                 24.25
dealer_True                                     True
fixer_upper_True                               False
eq_parking_True                                False
eq_lift_True                                   False
eq_fireplace_True                              False
eq_charging_possibility_True                  

In [91]:
# Display the `training_columns` entry stored in the houses registry row.
res_h.get("training_columns")

{'bedrooms': ['Int64', 4],
 'internal_area': ['Int64', 197],
 'usable_area': ['Int64', 172],
 'build_year': ['Int64', 1957],
 'plot_size': ['Int64', 1004],
 'sqm_pr_bedroom': ['Int64', 44],
 'dealer_True': ['boolean', True],
 'fixer_upper_True': ['boolean', False],
 'eq_parking_True': ['boolean', True],
 'eq_lift_True': ['boolean', False],
 'eq_fireplace_True': ['boolean', False],
 'eq_charging_possibility_True': ['boolean', False],
 'eq_aircondition_True': ['boolean', False],
 'eq_garden_True': ['boolean', False],
 'eq_parking_tmp_True': ['boolean', False],
 'eq_lift_tmp_True': ['boolean', False],
 'eq_west_facing_True': ['boolean', False],
 'ownership_type_andel': ['boolean', False],
 'ownership_type_annet': ['boolean', False],
 'ownership_type_eier ': ['boolean', True],
 'ownership_type_obligasjon': ['boolean', False],
 'day': ['Int64', 29],
 'month': ['Int64', 5],
 'year': ['Int64', 2025],
 'property_type_bygård_flermannsbolig': ['boolean', False],
 'property_type_enebolig': ['bool

In [93]:
# Display the `r2_score` entry stored in the apartments registry row.
res_a.get("r2_score")

0.9255631566