## CLEAN RENTALS

In [1]:
# Project cleaning helpers are star-imported; the helper names used in later
# cells (e.g. transform_nan, mk_num) presumably come from here — TODO confirm
# and replace with explicit imports.
from common_functions import *
import numpy as np
import pandas_gbq as gbq
# NOTE(review): same module name as the star import above, reached via the
# package path; looks like a duplicate — confirm which one is needed.
from src.cleaning.common_functions import *
# `pd` is not imported explicitly in this cell; presumably it is provided by
# one of the star imports above — verify.
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
import os
# Relative change of working directory (two levels up). Because the path is
# relative, re-running this cell in the same kernel moves up two more levels.
os.chdir("../..")
from src.sibr_market_training import Clean
from sibr_module import BigQuery, Logger, CStorage



In [2]:
# Run configuration used by the cells below.
dataset = 'rentals'
replace = True  # passed to the read helpers and selects the write path in the save cell

# Logger first: both cloud clients take it as an argument.
logger = Logger(f'clean{dataset.capitalize()}')
bq = BigQuery(logger=logger, dataset=dataset)
cs = CStorage(logger=logger, bucket_name='sibr-market-bucket')

logger.debug(f'Replace: {replace} | Dataset: {dataset}')

2025-07-19 10:53:12,734 - cleanRentals - INFO - Cloud Logging is disabled. Using local logging to /Users/sigvardbratlie/Documents/Projects/sibr_market_training/logfiles/cleanRentals.log
2025-07-19 10:53:12,737 - cleanRentals - INFO - BigQuery client initialized with project_id: sibr-market
2025-07-19 10:53:12,738 - cleanRentals - INFO - Google Cloud Storage client initialized with bucket: sibr-market-bucket
2025-07-19 10:53:12,739 - cleanRentals - DEBUG - Replace: True | Dataset: rentals


In [3]:
# Load the three inputs for this run through the project's BigQuery wrapper.
# `df_` is only read afterwards; all cleaning happens on a separate `df`.
df_ = bq.read_raw(replace=replace)
# Lookup table that is later joined on 'postal_code'.
geo = bq.read_geonorge()
# Per-listing table that is later joined on 'item_id'.
salgstid = bq.read_salestime(replace = replace)

2025-07-19 10:53:13,343 - cleanRentals - INFO - Reading raw data from dataset: raw.rentals
2025-07-19 10:53:27,823 - cleanRentals - INFO - 137561 rows read from rentals. Query: 
                        SELECT
                            t.*
                        FROM
       ... (truncated)


Downloading: 100%|[32m██████████[0m|

2025-07-19 10:53:29,577 - cleanRentals - INFO - 5137 rows read from rentals. Query: SELECT * FROM admin.geo_norge... (truncated)
2025-07-19 10:53:29,578 - cleanRentals - INFO - Reading salestime data from dataset: rentals





2025-07-19 10:53:33,422 - cleanRentals - INFO - 43991 rows read from rentals. Query: 
            SELECT
              nd.item_id,
              PARSE_DATE('%Y-%m-%d', MIN(nd.scrape_dat... (truncated)


## Remove empty features and missing data

In [4]:
logger.debug(f'Length of df before cleaning: {len(df_)}')
# Normalise missing-value markers first, then drop the (almost) empty columns;
# the raw frame `df_` is passed in and the result is bound to a new name.
df = rm_empty_features(df=transform_nan(df_))
logger.debug(f'Length: {len(df)} | after removing columns with >90% missing values')

2025-07-19 10:53:35,673 - cleanRentals - DEBUG - Length of df before cleaning: 137561
2025-07-19 10:53:36,793 - cleanRentals - DEBUG - Length: 137561 | after removing columns with >90% missing values


In [5]:
# Left join keeps every listing, including those without a sales-time row.
df = df.merge(salgstid, how='left', on='item_id')
logger.debug(f'Length: {len(df)} | after merge with sales time')

2025-07-19 10:53:37,544 - cleanRentals - DEBUG - Length: 137561 | after merge with sales time


## Int and Float data

In [21]:
# Columns that should hold whole numbers. Each column is listed exactly once
# ('primary_area' used to appear twice).
int_cols = ['monthly_rent', 'deposit', 'bedrooms', 'floor',
            'primary_area', 'usable_area', 'internal_area', 'gross_area',
            'external_area'
            ]
df = mk_num(df, int_cols, type='int')

# Missing bedroom / floor values are treated as 0.
df.loc[:, 'bedrooms'] = df['bedrooms'].fillna(0)
df.loc[:, 'floor'] = df['floor'].fillna(0)

# Fall back to usable_area where primary_area is missing, then to 0.
if 'primary_area' in df.columns and 'usable_area' in df.columns:
    df.loc[:, 'primary_area'] = df['primary_area'].fillna(df['usable_area'])
df.loc[:, 'primary_area'] = df['primary_area'].fillna(0)

# NOTE(review): primary_area and bedrooms can be 0 at this point; how
# mk_fractions treats a zero denominator is not visible here — confirm.
df = mk_fractions(df, new_feat_name='rent_pr_sqm', numerator='monthly_rent', denominator='primary_area')
df = mk_fractions(df, new_feat_name='price_pr_bedroom', numerator='monthly_rent', denominator='bedrooms')
df = mk_fractions(df, new_feat_name='sqm_pr_bedroom', numerator='primary_area', denominator='bedrooms')
# The two rent-based ratios are scaled from monthly to yearly (x 12).
df['rent_pr_sqm'] = df['rent_pr_sqm'] * 12
df['price_pr_bedroom'] = df['price_pr_bedroom'] * 12

# Keep only rows inside the accepted ranges.
valid_rent = (df['monthly_rent'] > 1000) & (df['monthly_rent'] < 300000)
valid_area = (df['primary_area'] >= 0) & (df['primary_area'] < 1000)
valid_bedrooms = (df['bedrooms'] >= 0) & (df['bedrooms'] < 10)
valid_floor = (df['floor'] >= 0) & (df['floor'] < 100)

# .copy() gives later cells an independent frame to assign columns on, instead
# of a filtered slice of the previous one (avoids SettingWithCopyWarning).
df = df[valid_rent & valid_area & valid_bedrooms & valid_floor].copy()
# The message text says usable_area, but the filter above is on primary_area
# (plus rent, bedrooms and floor); the text is left unchanged for log continuity.
logger.debug(f'Length: {len(df)} | after filter price and usable_area')

2025-07-09 14:31:58,035 - cleanRentals - DEBUG - Length: 134063 | after filter price and usable_area


## Geographical data

In [22]:
# Derive the postal code from the address, then attach the geographic
# attributes for that code. Left join: listings without a match are kept.
geo_cols = ['postal_code', 'municipality', 'county', 'region']
df = (
    df.assign(postal_code=df['address'].apply(extract_postnummer))
      .merge(geo[geo_cols], how='left', on='postal_code')
)
logger.debug(f'Length: {len(df)} | after geo')

2025-07-09 14:31:58,224 - cleanRentals - DEBUG - Length: 134063 | after geo


## Categorical Data

In [23]:
# Drop a leading "boligtype" label (case-insensitive) from the property type.
stripped_type = df['property_type'].str.replace(r'^boligtype', '', case=False, regex=True)
df.loc[:, 'property_type'] = stripped_type
# Slashes become underscores; non-string entries (e.g. NaN) pass through untouched.
df['property_type'] = df['property_type'].map(
    lambda value: value.replace('/', "_") if isinstance(value, str) else value
)
# Listings without a dealer are labelled 'private'.
df.loc[:, 'dealer'] = df['dealer'].fillna('private')
cat_cols = ['dealer', 'municipality', 'county', 'region', 'property_type', 'country']
logger.debug(f'Length: {len(df)} | after categorical')

2025-07-09 14:31:58,328 - cleanRentals - DEBUG - Length: 134063 | after categorical


## Boolean data

In [25]:
def clean_includes(text):
    """Normalise one raw ``includes`` value.

    The value is lower-cased and trimmed, known "inkluderer<thing>" tokens are
    rewritten to a canonical keyword, and any remaining "inkluderer" is removed.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (e.g. NaN/None)
            is returned unchanged.

    Returns:
        The cleaned string without leading/trailing whitespace, or the
        original object when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Order matters: the specific (longer) tokens must be rewritten before the
    # generic "inkluderer" prefix is removed, otherwise they would never match.
    replacements = (
        ("inkludererbredbånd", "internett"),
        ("inkludererinternett", "internett"),
        ("inkludererstrøm", "strøm"),
        ("inkluderervarmtvann", "varmtvann"),
        ("inkluderer", ""),
    )

    cleaned = text.lower().strip()
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)

    # Removing the generic prefix can expose whitespace that the first strip
    # could not see (e.g. "inkluderer strøm" -> " strøm"), so trim again.
    return cleaned.strip()
# Normalise the free-text `includes` column before deriving flags from it.
df['includes'] = df['includes'].map(clean_includes)

# Target boolean column -> list of `includes` terms that should set it.
# NOTE(review): how mk_bool_features matches these terms (exact value vs.
# substring) is not visible here — short terms such as 'vann' or 'tv' behave
# very differently under the two; confirm.
bool_dict = {
    'eq_power': ['strøm', 'strøm og internett'],
    'eq_internet': [
        'internett', 'bredbånd', 'fiber', 'strøm og internett', 'wifi',
        'grunnpakke internett og grunnpakke tv er inkludert i leien.',
        'internet', 'tv og internett',
    ],
    'eq_tv': [
        'tv', 'kabel-tv', 'kabel-/digital-tv', 'kabel tv', 'kabeltv',
        'grunnpakke internett og grunnpakke tv er inkludert i leien.',
        'tv og internett',
    ],
    'eq_hot_water': ['varmtvann', 'varmt vann'],
    'eq_water': ['vann'],
    'eq_heating': ['oppvarming', 'oppvarming fra vannbåren varme'],
    'eq_parking': ['parkering', 'parkeringsplass'],
    'eq_household_appliances': ['hvitevarer', 'vaskemaskin', 'tørketrommel', 'oppvaskmaskin'],
    'eq_furniture': ['møbler', 'møblert'],
}

df = mk_bool_features(df=df, equipment_features=bool_dict, source_col='includes')
logger.debug(f'Length: {len(df)} | after bool features')

2025-07-09 14:31:58,912 - cleanRentals - DEBUG - Length: 134063 | after bool features


In [26]:
# Sanity check after feature engineering: column list, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134063 entries, 0 to 134062
Data columns (total 42 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   dealer                   134063 non-null  object 
 1   address                  134063 non-null  object 
 2   monthly_rent             134063 non-null  Int64  
 3   deposit                  108584 non-null  Int64  
 4   includes                 134063 non-null  object 
 5   property_type            134060 non-null  object 
 6   bedrooms                 134063 non-null  Int64  
 7   floor                    134063 non-null  Int64  
 8   usable_area              39343 non-null   Int64  
 9   internal_area            28621 non-null   Int64  
 10  gross_area               26091 non-null   Int64  
 11  facilities               133727 non-null  object 
 12  last_updated             133907 non-null  object 
 13  item_id                  134063 non-null  object 
 14  url 

## Ensure extra columns and dtypes

In [27]:
# Columns the output is expected to carry even when the raw data lacks them.
# NOTE(review): presumably add_missing_features creates any of these that are
# absent; the fill value and dtype it uses are not visible here — confirm.
extra_cols = ['email', 'web', 'energy_rating']
df = add_missing_features(df,extra_cols)

## Save to BQ

In [28]:
logger.debug(f'Length: {len(df)} | before saving to BQ. Replace is {replace}')
# The target table is derived from the `dataset` setting of the config cell
# instead of a second hard-coded 'rentals', so the read and write sides cannot
# drift apart. For dataset = 'rentals' the target is unchanged: clean.rentals.
if replace:
    # Full refresh: overwrite the clean table with the frame built above.
    gbq.to_gbq(dataframe=df,
               destination_table=f'clean.{dataset}',
               project_id='sibr-market',
               if_exists='replace')
else:
    # Incremental run: merge into the existing clean table, keyed on item_id.
    bq.to_bq(df,
             table_name=dataset,
             dataset_name='clean',
             if_exists='merge',
             merge_on=['item_id'])

2025-07-09 14:32:05,866 - cleanRentals - DEBUG - Length: 134063 | before saving to BQ. Replace is True
100%|██████████| 1/1 [00:00<00:00, 11008.67it/s]


## EDA

In [None]:
# Frequency of each cleaned property type (missing values are excluded by default).
df['property_type'].value_counts()

In [None]:
# Distinct property-type values as a plain list, to spot labels that still need cleaning.
df['property_type'].unique().tolist()
