In [1]:
import os
# NOTE(review): wildcard import. Helpers called later in this notebook without an
# explicit import (process_bool, ensure_num_types) presumably come from here — confirm.
from src.cleaning.common_functions import *
# Explicit import of mk_cat (redundant if the wildcard import above already exposes it).
from src.cleaning.common_functions import mk_cat

# Move the working directory two levels up; this runs before sibr_module is imported.
# NOTE(review): presumably so that later imports/paths resolve from the project root — confirm.
os.chdir("../..")
from sibr_module import BigQuery, Logger, CStorage

import pandas as pd
# Allow up to 500 columns when pandas displays a DataFrame.
pd.set_option('display.max_columns', 500)



In [2]:
# Run configuration and shared clients used by the cells below.
# replace is passed to bq.read_clean and selects if_exists='replace' vs 'merge' when saving.
replace  = True
# dataset is used for the logger name and as the BigQuery dataset argument.
dataset = 'rentals'
logger = Logger(f'pre_process{dataset.capitalize()}')
bq = BigQuery(logger=logger,dataset=dataset)
# NOTE(review): cs is not referenced by any other cell visible in this notebook.
cs = CStorage(logger=logger, bucket_name='sibr-market-bucket')
logger.debug(f'Replace: {replace} | Dataset: {dataset}')

2025-07-19 20:59:14,781 - pre_processRentals - INFO - Cloud Logging is disabled. Using local logging to /Users/sigvardbratlie/Documents/Projects/sibr_market_training/logfiles/pre_processRentals.log
2025-07-19 20:59:14,785 - pre_processRentals - INFO - BigQuery client initialized with project_id: sibr-market
2025-07-19 20:59:14,787 - pre_processRentals - INFO - Google Cloud Storage client initialized with bucket: sibr-market-bucket
2025-07-19 20:59:14,788 - pre_processRentals - DEBUG - Replace: True | Dataset: rentals


In [3]:
# Read the source data into df_; later cells derive df from it without modifying df_.
# NOTE(review): how the replace flag changes what read_clean returns is not visible here — confirm.
df_ = bq.read_clean(replace = replace)

2025-07-19 20:59:20,129 - pre_processRentals - INFO - Reading clean data from dataset: rentals
2025-07-19 20:59:25,911 - pre_processRentals - INFO - 134063 rows read from rentals. Query: 
            SELECT c.* FROM sibr-market.clean.rentals c
            ... (truncated)


## Set index and remove empty data

In [12]:
# Discard rows that lack an item_id, then promote item_id to the row index.
df = df_.dropna(subset='item_id').set_index('item_id')
logger.debug(f'Length of df: {len(df)} | after dropping NaN on item_id')

2025-07-19 21:01:20,893 - pre_processRentals - DEBUG - Length of df: 134063 | after dropping NaN on item_id


In [13]:
# Keep only rows that have both a monthly rent and a primary area.
# Every step returns a new frame, so nothing is mutated in place on a filtered slice.
df = df.dropna(subset=['monthly_rent', 'primary_area'])

# Exclusive bounds: rent strictly between 1000 and 50000, area strictly between 0 and 200.
rent_in_range = (df['monthly_rent'] > 1000) & (df['monthly_rent'] < 50000)
area_in_range = (df['primary_area'] > 0) & (df['primary_area'] < 200)
df = df[rent_in_range & area_in_range]

# Columns removed before further processing. Each name is listed once;
# errors='ignore' below skips any name that is not present in the frame.
drop = [
    'address', 'description', 'contact_person', 'phone', 'url', 'country', 'title',
    'facilities', 'deposit', 'includes', 'web', 'email',
    'rn', 'FIRST', 'LAST', 'postal_code', 'municipality', 'county', 'region',
    'last_updated', 'salgstid',
    'price_pr_bedroom', 'rent_pr_sqm', 'clean_date', 'energy_rating',
    'internal_area', 'gross_area', 'usable_area',
]

df = df.drop(columns=drop, errors='ignore')
# The message names the columns this cell actually filters on.
logger.debug(f'Length of df: {len(df)} | after dropping NaN and out-of-range monthly_rent and primary_area')

2025-07-19 21:01:21,103 - pre_processRentals - DEBUG - Length of df: 115212 | after dropping NaN on price, usable_area and bedrooms


## DUMMY VARIABLES

In [14]:
# Turn dealer into a boolean: False when the value is 'private' (case-insensitive), True otherwise.
# Each value must provide .lower(); a missing value (NaN/None) or an already-converted
# boolean raises AttributeError, so this cell cannot be run twice on the same frame.
df['dealer'] = df['dealer'].apply(lambda seller: seller.lower() != 'private')

In [15]:
# Property type labels handed to mk_cat for the property_type column.
prop_type = [
    'Enebolig',
    'Leilighet',
    'Rom i bofellesskap',
    'Tomannsbolig',
    'Hybel',
    'Andre',
    'Rekkehus',
]

# mk_cat runs first on property_type; process_bool then runs on its result.
df = process_bool(mk_cat(df, 'property_type', prop_type))

logger.debug(f'Length of df: {len(df)} | after dummy variables')

2025-07-19 21:01:21,667 - pre_processRentals - DEBUG - Length of df: 112909 | after dummy variables


## DATE COLUMNS

In [16]:
# Parse scrape_date once; values that cannot be parsed become NaT (errors='coerce').
scrape_ts = pd.to_datetime(df['scrape_date'], errors='coerce')

# Replace the raw scrape_date column with its day/month/year parts and stamp
# every row with the time this cell was executed.
df = df.drop(columns='scrape_date').assign(
    day=scrape_ts.dt.day,
    month=scrape_ts.dt.month,
    year=scrape_ts.dt.year,
    pre_processed_date=pd.Timestamp.now(),
)
logger.debug(f'Length of df: {len(df)} | after date columns')

2025-07-19 21:01:21,870 - pre_processRentals - DEBUG - Length of df: 112909 | after date columns


## ENSURE CORRECT DATA TYPES

In [17]:
# Per the section heading, this step is meant to enforce correct data types.
# NOTE(review): ensure_num_types is not defined in this notebook (presumably it comes
# from the wildcard import); which columns it converts is not visible here — confirm.
df = ensure_num_types(df)

## REMOVE OUTLIERS AND SPLIT DATA

In [None]:
def rm_rental_outliers(df):
    """Remove listings that report zero bedrooms but a monthly rent above 20000.

    Every row that shares an ``item_id`` with such a listing is removed.
    ``item_id`` is looked up as a column first and, failing that, as the index
    name, so the function also works on frames where ``item_id`` has been moved
    into the index with ``set_index('item_id')``. When no ``item_id`` is
    available at all, only the offending rows themselves are removed.

    Args:
        df: DataFrame of rental listings.

    Returns:
        The filtered DataFrame. ``df`` is returned unchanged when it has no
        ``bedrooms`` or no ``monthly_rent`` column.
    """
    if 'bedrooms' not in df.columns or 'monthly_rent' not in df.columns:
        return df

    # Positional boolean mask; missing comparisons count as "not an outlier",
    # matching how boolean indexing treats them.
    flagged = (df['bedrooms'] == 0) & (df['monthly_rent'] > 20000)
    is_outlier = flagged.fillna(False).to_numpy(dtype=bool)

    if 'item_id' in df.columns:
        item_ids = df['item_id']
    elif df.index.name == 'item_id':
        item_ids = df.index
    else:
        # No identifier to match on: drop just the flagged rows.
        return df[~is_outlier]

    # Drop every row whose identifier belongs to a flagged listing.
    flagged_ids = item_ids[is_outlier]
    return df[~item_ids.isin(flagged_ids)]

In [18]:
# Split the listings by property type:
#   df_r   - everything except 'rom i bofellesskap' and 'hybel'
#   df_k   - 'rom i bofellesskap' only
#   df_ord - everything except 'rom i bofellesskap'
is_coliving = df['property_type'] == 'rom i bofellesskap'
is_hybel = df['property_type'] == 'hybel'

df_r = df[~(is_coliving | is_hybel)]
df_k = df[is_coliving]
df_ord = df[~is_coliving]

# Outlier removal is applied to the two non-co-living subsets only.
df_r = rm_rental_outliers(df_r)
df_ord = rm_rental_outliers(df_ord)
logger.debug(f'Length of df_r: {len(df_r)} | Length of df_k: {len(df_k)} | Length of df_ord: {len(df_ord)}')

2025-07-19 21:01:22,770 - pre_processRentals - DEBUG - Length of df_r: 85514 | Length of df_k: 12581 | Length of df_ord: 100328


## ORDINARY RENTALS

In [19]:
# One-hot encode property_type; drop_first=True leaves out the column for the first category.
df_ord = pd.get_dummies(df_ord, columns=['property_type'], drop_first=True)

In [20]:
logger.debug(f'Length of df_ord: {len(df_ord)} | before saving to BQ. Replace is {replace}')

# With replace set the table is written using if_exists='replace';
# otherwise the rows are written using if_exists='merge' keyed on item_id.
write_opts = (
    {'if_exists': 'replace'}
    if replace
    else {'if_exists': 'merge', 'merge_on': ['item_id']}
)
bq.to_bq(df_ord, table_name='rentals', dataset_name='pre_processed', **write_opts)

2025-07-19 21:01:24,150 - pre_processRentals - DEBUG - Length of df_ord: 100328 | before saving to BQ. Replace is True
2025-07-19 21:01:28,222 - pre_processRentals - INFO - 100328 rader lagret i sibr-market.pre_processed.rentals


## HOUSING RENTALS

In [21]:
# Remove dealer_True from the housing subset, then one-hot encode property_type
# (drop_first=True leaves out the column for the first category).
drop_rentals = ['dealer_True']
df_r = pd.get_dummies(
    df_r.drop(columns=drop_rentals),
    columns=['property_type'],
    drop_first=True,
)

In [22]:
logger.debug(f'Length of df_r: {len(df_r)} | Before saving to BQ. Replace is {replace}')

# Choose the write mode from the replace flag: 'replace', or 'merge' keyed on item_id.
if replace:
    write_opts = {'if_exists': 'replace'}
else:
    write_opts = {'if_exists': 'merge', 'merge_on': ['item_id']}

bq.to_bq(df_r, table_name='rentals_homes', dataset_name='pre_processed', **write_opts)

2025-07-19 21:01:30,687 - pre_processRentals - DEBUG - Length of df_r: 85514 | Before saving to BQ. Replace is True
2025-07-19 21:01:33,519 - pre_processRentals - INFO - 85514 rader lagret i sibr-market.pre_processed.rentals_homes


## Co-living rentals

In [23]:
# Columns removed from the co-living subset; names that are absent are skipped (errors='ignore').
drop_coliving = [
    'dealer_True',
    'property_type',
    'bedrooms',
    'sqm_pr_bedroom',
    'primary_area',
]
df_k = df_k.drop(columns=drop_coliving, errors='ignore')

In [24]:
logger.debug(f'Length of df_k: {len(df_k)} | Before saving to BQ. Replace is {replace}')
table_name = 'rentals_co-living'

# Keyword arguments for the write: 'replace' when the flag is set, else 'merge' keyed on item_id.
write_opts = dict(if_exists='replace') if replace else dict(if_exists='merge', merge_on=['item_id'])
bq.to_bq(df_k, table_name=table_name, dataset_name='pre_processed', **write_opts)

2025-07-19 21:01:33,573 - pre_processRentals - DEBUG - Length of df_k: 12581 | Before saving to BQ. Replace is True
2025-07-19 21:01:38,494 - pre_processRentals - INFO - 12581 rader lagret i sibr-market.pre_processed.rentals_co-living
