# Training price models: apartments and houses

In [1]:
import os
import tempfile

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

from examples.help_modules import BigQuery, ConfigReader, Logger, CStorage

pd.set_option('display.max_columns', None)
os.chdir("../..")

In [2]:
# Look up the credentials path in the project config, then build the shared
# logger plus the BigQuery and storage helpers that the later cells rely on.
p = ConfigReader().get_config().get('bq_path')

logger = Logger('trainRentals', CREDENTIALS_PATH=p)
bq = BigQuery(logger, p)
cs = CStorage(
    bucket_name='sibr-market-bucket',
    logger=logger,
    CREDENTIALS_PATH=p,
)

# Echo the resolved path as the cell output.
p

2025-06-08 22:27:03,781 - trainRentals - INFO - Google Cloud Logging initialized with project: sibr-market
2025-06-08 22:27:03,782 - trainRentals - INFO - All loggs successfully initiated
2025-06-08 22:27:03,819 - trainRentals - INFO - BigQuery client initialized with project_id: sibr-market


'config/market-keys.json'

In [3]:
# Load the pre-processed apartment and house listings, each joined with its
# coordinates; the queries exclude rows whose latitude is 0.
sql_a = '''
SELECT a.*,c.lat,c.lng FROM `sibr-market.pre_processed.homes_apartments` a
JOIN admin.coordinates c ON c.item_id = a.item_id
WHERE c.lat != 0;
'''
sql_h = '''
SELECT h.*,c.lat,c.lng FROM `sibr-market.pre_processed.homes_houses` h
JOIN admin.coordinates c ON c.item_id = h.item_id
WHERE c.lat != 0;
'''

# Index both frames by `item_id` so the identifier is not used as a feature.
df_a = bq.read_bq(sql_a, read_type="pandas_gbq").set_index('item_id')
df_h = bq.read_bq(sql_h, read_type="pandas_gbq").set_index('item_id')

Downloading: 100%|[32m██████████[0m|

2025-06-08 22:27:15,296 - trainRentals - INFO - 16809 rader lest fra BigQuery



Downloading: 100%|[32m██████████[0m|

2025-06-08 22:27:19,366 - trainRentals - INFO - 12411 rader lest fra BigQuery





## Model Training with XGBoost | Apartments and Houses

In [15]:
def train(df: pd.DataFrame, params: dict, target: str, dataset_name: str, model) -> "Pipeline":
    """Fit an impute -> scale -> model pipeline on ``df`` and upload the fitted pipeline.

    The frame is split 80/20 (``random_state=42``) into train and test sets,
    the pipeline is fitted on the train part, and test MSE, test R2 and the
    train score from ``pipeline.score`` are logged. The fitted pipeline is
    pickled with joblib and handed to ``cs.upload`` under
    ``models/<ModelClass>_<dataset_name>.pkl``.

    Relies on the notebook-level ``logger`` and ``cs`` objects.

    Args:
        df: Feature columns plus the ``target`` column.
        params: Keyword arguments for ``Pipeline.set_params``; keys addressing
            the estimator use the ``model__<param>`` form.
        target: Name of the column in ``df`` to predict.
        dataset_name: Label used in log messages and in the uploaded file name.
        model: Zero-argument callable returning an unfitted estimator
            (e.g. the ``XGBRegressor`` class).

    Returns:
        The fitted ``Pipeline``.
    """
    # Build the estimator once; the same instance supplies the name used in
    # logs and file names and becomes the final pipeline step.
    estimator = model()
    model_name = type(estimator).__name__
    logger.info(f'TRAINING {model_name} model for {dataset_name.upper()} \n \n')

    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=[target]),
        df[target],
        test_size=0.2,
        random_state=42,
    )
    logger.info(f"Train set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

    pipeline = Pipeline([
        ('impute', SimpleImputer()),
        ('scaler', StandardScaler()),
        ('model', estimator),
    ])
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    train_score = pipeline.score(X_train, y_train)
    logger.info(f'MSE test: {mse},r2_score test: {r2}, trainscore: {train_score} for {dataset_name}')

    # A private temporary directory avoids clashes on a shared fixed path and
    # is removed even when dumping or uploading raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_filepath = os.path.join(tmp_dir, f'{model_name}_{dataset_name}.pkl')
        joblib.dump(pipeline, local_filepath)
        cs.upload(local_filepath, f'models/{model_name}_{dataset_name}.pkl')
    return pipeline

In [16]:
# Hyperparameters per dataset (presumably copied from an earlier search --
# TODO confirm provenance). Keys use the `model__<param>` form so that
# Pipeline.set_params routes them to the 'model' step.
params_a = {
    'model__learning_rate': 0.03317316825297632,
    'model__max_depth': 5,
    'model__n_estimators': 1482,
    'model__random_state': 69,
    'model__subsample': 0.8042989210310263,
}
params_h = {
    'model__learning_rate': 0.02102146663035812,
    'model__max_depth': 6,
    'model__n_estimators': 1454,
    'model__random_state': 91,
    'model__subsample': 0.7425191352307899,
}

pipeline_a = train(df_a, params_a, target='price', dataset_name='apartments', model=XGBRegressor)
# Alias kept so any code that uses the original misspelled name still works.
pipline_a = pipeline_a
pipeline_h = train(df_h, params_h, target='price', dataset_name='houses', model=XGBRegressor)

2025-06-08 22:40:53,959 - trainRentals - INFO - TRAINING XGBRegressor model for APARTMENTS 
 

2025-06-08 22:40:53,988 - trainRentals - INFO - Train set size: 13447, Test set size: 3362
2025-06-08 22:40:56,087 - trainRentals - INFO - MSE test: 539287194162.6983,r2_score test: 0.9269906946863538, trainscore: 0.9862155956466966 for apartments
2025-06-08 22:40:56,489 - trainRentals - INFO - File /tmp/XGBRegressor.pkl uploaded to models/XGBRegressor_apartments.pkl in bucket sibr-market-bucket.
2025-06-08 22:40:56,506 - trainRentals - INFO - TRAINING XGBRegressor model for HOUSES 
 

2025-06-08 22:40:56,517 - trainRentals - INFO - Train set size: 9928, Test set size: 2483
2025-06-08 22:40:58,802 - trainRentals - INFO - MSE test: 519836134179.7185,r2_score test: 0.9637003207581909, trainscore: 0.9949728756103037 for houses
2025-06-08 22:40:59,463 - trainRentals - INFO - File /tmp/XGBRegressor.pkl uploaded to models/XGBRegressor_houses.pkl in bucket sibr-market-bucket.


In [14]:
# Scratch check: the class name of a fresh XGBRegressor instance, i.e. the
# string `train` embeds in its log messages and file names.
type(XGBRegressor()).__name__

'XGBRegressor'