## CLEAN RENTALS

In [23]:
import re
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq as gbq
from google.cloud import bigquery
pd.set_option('display.max_columns', None)
replace  = False

In [24]:
if replace:
    sql = '''
    SELECT
    t.*
FROM
  (SELECT
        *,
        ROW_NUMBER() OVER(PARTITION BY item_id ORDER BY scrape_date DESC) as rn
      FROM sibr-market.raw.rentals) t
WHERE t.rn = 1
    '''
else:
    sql = '''
    SELECT
    t.*
FROM
  (SELECT
        *,
        ROW_NUMBER() OVER(PARTITION BY item_id ORDER BY scrape_date DESC) as rn
      FROM sibr-market.raw.rentals) t
LEFT JOIN sibr-market.clean.rentals h ON t.item_id = h.item_id
WHERE t.rn = 1
AND h.item_id IS NULL
    '''
try:
    client = bigquery.Client()
    job = client.query(sql)
    df_ = job.to_dataframe()
    if len(df_) == 0:
        raise ValueError('No results for query')
    print(f'length of df: {len(df_)}')
except:
    raise ValueError(f'Could not initiate data!')

length of df: 125065


In [25]:
%%bigquery geo
SELECT * FROM admin.geo_norge

Query is running:   0%|          |

Downloading:   0%|          |

In [26]:
if replace:
    sql = '''
        SELECT
          nd.item_id,
          PARSE_DATE('%Y-%m-%d', MIN(nd.scrape_date)) AS FIRST,
          PARSE_DATE('%Y-%m-%d', MAX(nd.scrape_date)) AS LAST,
          NULLIF( DATE_DIFF( PARSE_DATE('%Y-%m-%d', MAX(nd.scrape_date)), PARSE_DATE('%Y-%m-%d', MIN(nd.scrape_date)), DAY), 0) AS salgstid
        FROM
          `sibr-market.raw.rentals` AS nd
        WHERE
          nd.item_id NOT IN
          (
          SELECT item_id
              FROM `sibr-market.raw.rentals`
              WHERE
                scrape_date = (SELECT MIN(scrape_date) FROM`sibr-market.raw.rentals`)
                OR
                scrape_date = (SELECT MAX(scrape_date) FROM`sibr-market.raw.rentals` )
                )
        GROUP BY
          1;
  '''
else:
    sql = '''
        SELECT
          nd.item_id,
          PARSE_DATE('%Y-%m-%d', MIN(nd.scrape_date)) AS FIRST,
          PARSE_DATE('%Y-%m-%d', MAX(nd.scrape_date)) AS LAST,
          NULLIF( DATE_DIFF( PARSE_DATE('%Y-%m-%d', MAX(nd.scrape_date)), PARSE_DATE('%Y-%m-%d', MIN(nd.scrape_date)), DAY), 0) AS salgstid
        FROM
          `sibr-market.raw.rentals` AS nd
          --LEFT JOIN sibr-market.clean.rentals c ON c.item_id = nd.item_id
        WHERE
          nd.item_id NOT IN
          (
          SELECT item_id
              FROM `sibr-market.raw.rentals`
              WHERE
                scrape_date = (SELECT MIN(scrape_date) FROM`sibr-market.raw.rentals`)
                OR
                scrape_date = (SELECT MAX(scrape_date) FROM`sibr-market.raw.rentals` )
                )
        --AND c.item_id IS NULL
        GROUP BY
          1;
  '''

try:
    if not client:
        client = bigquery.Client()
    job_salgstid = client.query(sql)
    salgstid = job_salgstid.to_dataframe()
    if len(df_) == 0:
        raise ValueError('No results for query')
    print(f'length of salgstid: {len(df_)}')
except:
    raise ValueError(f'Could not initiate data!')

length of salgstid: 125065


## Remove empty feature and missing data

In [27]:
df = df_.drop_duplicates(subset='item_id')
null_val = ['nan','None','','null','NULL','NA','np.nan','<NA>','NaN','NAType',np.nan]
df = df.replace(null_val,np.nan)

for col in df.columns:
  if df[col].isna().sum() / len(df) > 0.9:
    df.drop(col,axis=1,inplace=True)

  df = df.replace(null_val,np.nan)


In [28]:
df = pd.merge(df, salgstid, how='left', on='item_id')

## Int and Float data

In [29]:
def extract_int(x: str) -> int | None:
    """
    Trekker ut det første tallet fra en streng og konverterer det til et heltall.
    Håndterer desimaltall korrekt (f.eks. "2.0" blir 2).
    """
    # Finner den første sekvensen som ser ut som et tall (inkludert punktum)
    treff = re.search(r'[\d.]+', x)

    if treff:
        nummer_str = treff.group(0)
        try:
            # 1. Konverter til float for å håndtere desimaler
            nummer_float = float(nummer_str)
            # 2. Konverter floaten til et heltall (kutter desimalene)
            return int(nummer_float)
        except ValueError:
            # Håndterer tilfeller der strengen ikke er et gyldig tall (f.eks. "1.2.3")
            return None
            
    return None

In [30]:
int_cols = ['monthly_rent','deposit','bedrooms','floor',
            'primary_area',
            'usable_area','internal_area','gross_area','primary_area',
            'external_area'
            ]
for col in int_cols:
    if col in df.columns:
        df.loc[:,col] = df[col].apply(lambda x: extract_int(x) if isinstance(x, str) else x)

df = df[(df['monthly_rent']>1000) & (df['monthly_rent']<300000) &
      ( (df['usable_area']>0) & (df['usable_area']<1000))
        ]

df.loc[:,'bedrooms'] = df['bedrooms'].fillna(0)
df.loc[:, 'primary_area'] = df['primary_area'].fillna(df['usable_area']).fillna(df['internal_area'])

df.loc[:,'rent_pr_sqm'] = df.apply(lambda x: x['monthly_rent']*12 / x['usable_area'] if x['usable_area'] > 0 else 0, axis=1)
df.loc[:,'price_pr_bedroom'] = df.apply(lambda x: x['monthly_rent']*12 / x['bedrooms'] if x['bedrooms'] > 0 else 0, axis=1)
df.loc[:,'sqm_pr_bedroom'] = df.apply(lambda x: x['usable_area'] / x['bedrooms'] if x['bedrooms'] > 0 else 0, axis=1)

  df.loc[:,'bedrooms'] = df['bedrooms'].fillna(0)
  df.loc[:, 'primary_area'] = df['primary_area'].fillna(df['usable_area']).fillna(df['internal_area'])


# Datetime Data

In [31]:
df.loc[:,'scrape_date'] = pd.to_datetime(df['scrape_date'],errors='coerce')

## Geographical data

In [32]:
def extract_postnummer(x):
  if not isinstance(x, str):
        return x

  else:
    match_ = re.search(r'\d{4}',x)
    if match_:
      return match_.group()

df.loc[:,'postal_code'] = df['address'].apply(extract_postnummer)
df = pd.merge(df,geo[['postal_code','municipality','county','region']],how='left',on='postal_code')

## Categorical Data

In [33]:
df.loc[:,'property_type'] = df['property_type'].str.replace(r'^boligtype', '', case=False, regex=True)
df.loc[:,'property_type'] = df['property_type'].str.replace(r'\s*/småbruk', '', case=False, regex=True)
df.loc[:,'property_type'] = df['property_type'].str.replace(r'/$', '', regex=True)
df.loc[:,'dealer'] = df['dealer'].fillna('private')
cat_cols = ['dealer','municipality','county','region','property_type','country']
for col in cat_cols:
  df.loc[:,col] = df[col].astype('category')

## Boolean data

## Save to BQ

In [34]:
if replace:
    gbq.to_gbq(dataframe = df,
               destination_table='clean.rentals',
               project_id = 'sibr-market',
               if_exists='replace')
else:
    gbq.to_gbq(dataframe = df,
               destination_table='clean.rentals',
               project_id = 'sibr-market',
               if_exists='append')

100%|██████████| 1/1 [00:00<00:00, 8035.07it/s]


## EDA