In [1]:
import os
# NOTE(review): `predict_data` and `save_data`, called in later cells, are not defined in this
# notebook, so they presumably come from this star import — confirm and import them by name.
from prediction_commonfunctions import *
# The path is relative, so every execution of this cell moves the working directory two
# levels further up; run it exactly once per kernel session.
os.chdir("../..")
# NOTE(review): imported after the chdir, presumably because `sibr_module` is resolved from
# the new working directory — confirm before reordering these lines.
from sibr_module import BigQuery, Logger, CStorage

import pandas as pd



In [2]:
# Session configuration: the handles created here are reused by the later cells.
dataset = 'cars'
logger_name = f'predict{dataset.capitalize()}'
bucket_name = 'sibr-market-bucket'

logger = Logger(logger_name)
bq = BigQuery(dataset=dataset, logger=logger)
cs = CStorage(bucket_name=bucket_name, logger=logger)

# Switch read by the final cell: when False, `save_data` is not called.
save_to_bq = True
logger.debug(f'Dataset: {dataset}')

2025-06-20 12:51:44,631 - predictCars - INFO - 
 Starting new session. All loggs successfully initiated
2025-06-20 12:51:44,636 - predictCars - INFO - BigQuery client initialized with project_id: sibr-market
2025-06-20 12:51:44,638 - predictCars - INFO - Google Cloud Storage client initialized with bucket: sibr-market-bucket
2025-06-20 12:51:44,639 - predictCars - DEBUG - Dataset: cars


In [None]:
# Download the `models.json` table and convert its `created_at` column, read as
# milliseconds (unit='ms'), into datetimes without mutating the downloaded frame.
models_raw = cs.download('models.json', read_in_file=True)
models = models_raw.assign(
    created_at=pd.to_datetime(models_raw['created_at'], unit='ms')
)

In [3]:
# Read the `cars_el` and `cars_fossil` preprocessed tables with the same options.
read_options = {'last_scrape_date': True, 'coordinates': False}
df_el = bq.read_preprocessed(table='cars_el', **read_options)
df_fossil = bq.read_preprocessed(table='cars_fossil', **read_options)

2025-06-20 12:52:15,429 - predictCars - INFO - 12337 rows read from cars. Query: SELECT a.* FROM `sibr-market.p... (truncated)


2025-06-20 12:52:30,373 - predictCars - INFO - 32225 rows read from cars. Query: SELECT a.* FROM `sibr-market.p... (truncated)


In [8]:
def _load_model(dataset_name):
    """Look up the model registered for ``dataset_name`` and download it.

    Parameters
    ----------
    dataset_name : str
        Value matched against the ``dataset`` column of ``models``.

    Returns
    -------
    tuple
        ``(metadata, pipeline)``: the first matching row of ``models`` as a dict,
        and whatever ``cs.download`` returns for that row's ``filename``.

    Raises
    ------
    IndexError
        If ``models`` contains no row for ``dataset_name``.
    """
    candidates = models[models['dataset'] == dataset_name]
    if candidates.empty:
        # Same exception type `.iloc[0]` would raise, but with an actionable message.
        raise IndexError(f'No model found in models.json for dataset {dataset_name!r}')
    # NOTE(review): the first matching row is used as-is; if models.json can hold several
    # models per dataset, confirm that row order (not `created_at`) is the intended choice.
    metadata = candidates.iloc[0].to_dict()
    # NOTE(review): the logged filenames end in .pkl, so this presumably unpickles the
    # file — only safe while the bucket contents are trusted.
    pipeline = cs.download(metadata.get('filename'), read_in_file=True)
    return metadata, pipeline


def _predict(frame, metadata, pipeline):
    """Run ``predict_data`` on ``frame`` using the settings stored in ``metadata``.

    Parameters
    ----------
    frame : DataFrame passed to ``predict_data`` as ``dataframe``.
    metadata : dict providing ``target``, ``log_target`` and ``training_columns``.
    pipeline : model object passed to ``predict_data`` as ``pipeline``.

    Returns
    -------
    Whatever ``predict_data`` returns.
    """
    return predict_data(
        dataframe=frame,
        pipeline=pipeline,
        target=metadata.get('target'),
        log_target=metadata.get('log_target'),
        training_columns=metadata.get('training_columns'),
    )


res_el, m_el = _load_model('cars_el')
res_fossil, m_fossil = _load_model('cars_fossil')

y_pred_el = _predict(df_el, res_el, m_el)
y_pred_fossil = _predict(df_fossil, res_fossil, m_fossil)

2025-06-20 13:00:09,523 - predictCars - INFO - Read in CatBoostRegressor_el.pkl
2025-06-20 13:00:11,484 - predictCars - INFO - Read in CatBoostRegressor_fossil.pkl


In [9]:
def _prediction_frame(source, predictions, model_label):
    """Build the output rows for one model.

    Parameters
    ----------
    source : DataFrame whose index supplies the ``item_id`` column.
    predictions : values stored in the ``predicted_price`` column.
    model_label : str written to every row of the ``model`` column.

    Returns
    -------
    pandas.DataFrame with columns ``item_id``, ``predicted_price`` and ``model``.
    """
    return pd.DataFrame({
        'item_id': source.index,
        'predicted_price': predictions,
        'model': model_label,
    })


pred_el = _prediction_frame(df_el, y_pred_el, 'el')
pred_fossil = _prediction_frame(df_fossil, y_pred_fossil, 'fossil')

# Row labels of both frames are kept (ignore_index=False), so labels may repeat in
# `pred`; `item_id` is the column that identifies a row.
pred = pd.concat([pred_el, pred_fossil], ignore_index=False)
# A single timestamp shared by every row; timezone-naive local time because no tz is given.
pred['predict_date'] = pd.Timestamp.now()

if not save_to_bq:
    logger.warning('No data saved to BQ as save_to_bq is set to False.')
elif pred.empty:
    # Previously this case was skipped silently; log it so a run that saved nothing is visible.
    logger.warning('No data saved to BQ as there are no predictions to save.')
else:
    save_data(df=pred, table_name=dataset)