In [6]:
import re
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq as gbq
from google.cloud import bigquery
from sqlglot.transforms import eliminate_semi_and_anti_joins
# Show up to 500 columns when displaying wide frames.
pd.set_option('display.max_columns', 500)
# Run mode used by later cells: True reloads all of clean.rentals and overwrites
# the destination tables; False only loads rows not yet in pre_processed.rentals
# and appends them.
replace = False

In [7]:


# Pick the extraction query: a full reload of clean.rentals when `replace` is
# set, otherwise an anti-join that keeps only items not yet present in
# pre_processed.rentals.
if replace:
    sql = '''
    SELECT c.* FROM sibr-market.clean.rentals c
    '''
else:
    sql = '''
    SELECT c.* FROM sibr-market.clean.rentals c
    LEFT JOIN sibr-market.pre_processed.rentals p USING(item_id)
    WHERE p.item_id IS NULL
    '''

# Only the BigQuery round-trip is guarded. The original cause is chained so
# auth/SQL/network failures stay visible in the traceback, and catching
# Exception (not a bare except) lets KeyboardInterrupt pass through.
try:
    client = bigquery.Client()
    job = client.query(sql)
    df_ = job.to_dataframe()
except Exception as err:
    raise ValueError('Could not initiate data!') from err

# Checked outside the try block so an empty result is reported with its own
# message instead of being rewrapped as a generic load failure.
if len(df_) == 0:
    raise ValueError('No results for query')
print(f'length of df: {len(df_)}')

length of df: 14700


## Set index and remove empty data

In [8]:
# Discard rows that have no item_id, then use item_id as the row index.
df = df_.dropna(subset='item_id').set_index('item_id')

In [9]:
# Inspect dtypes and non-null counts of the indexed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14700 entries, 357864085.0 to 367836067
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   dealer            14700 non-null  object        
 1   address           14700 non-null  object        
 2   monthly_rent      14700 non-null  float64       
 3   deposit           9579 non-null   float64       
 4   includes          10874 non-null  object        
 5   property_type     14700 non-null  object        
 6   bedrooms          14700 non-null  float64       
 7   floor             11771 non-null  float64       
 8   usable_area       14700 non-null  float64       
 9   internal_area     5894 non-null   float64       
 10  gross_area        5948 non-null   float64       
 11  facilities        14586 non-null  object        
 12  last_updated      14654 non-null  object        
 13  url               14700 non-null  object        
 14  description  

In [10]:
# Keep only rows that have both a monthly rent and a usable area.
df = df.dropna(subset=['monthly_rent', 'usable_area'])

# Columns that are not carried into the pre-processed tables. Each name is
# listed once; names missing from the frame are skipped via errors='ignore'.
drop = ['address', 'description', 'contact_person', 'phone', 'url', 'country', 'title',
        'facilities', 'deposit', 'includes', 'web', 'email',
        'rn', 'FIRST', 'LAST', 'postal_code', 'municipality', 'county', 'region',
        'last_updated', 'salgstid',
        'price_pr_bedroom', 'rent_pr_sqm']

# Rebind instead of mutating in place so the cell has no hidden side effects.
df = df.drop(columns=drop, errors='ignore')

## DUMMY VARIABLES

In [11]:
# Turn `dealer` into a boolean flag: False when the value is 'private'
# (case-insensitive), True for anything else.
df['dealer'] = df['dealer'].apply(lambda name: name.lower() != 'private')

In [12]:
# Columns to one-hot encode.
dummy_cols = ['dealer', 'property_type']

# Remove rows whose category accounts for less than 1% of the frame. Columns
# are handled one after another, so shares for a later column are computed on
# the frame already filtered by the earlier ones.
for column in dummy_cols:
    shares = df[column].value_counts(normalize=True)
    rare = shares[shares < 0.01].index
    df = df[~df[column].isin(rare)]

# NOTE(review): the dummy column names depend on the categories present in
# this batch (and drop_first removes the first one); later cells reference
# specific dummy columns such as 'dealer_True' — confirm they always exist.
df = pd.get_dummies(df, columns=dummy_cols, drop_first=True)

## DATE COLUMNS

In [13]:
# Split scrape_date into day / month / year columns, then discard the
# original timestamp column.
for part in ('day', 'month', 'year'):
    df[part] = getattr(df['scrape_date'].dt, part)
df = df.drop(columns='scrape_date')

## ORDINARY RENTALS

In [14]:
# Write the pre-processed frame to BigQuery: overwrite the table on a full
# reload (`replace` truthy), otherwise append the new rows.
# NOTE(review): item_id is the DataFrame index at this point and the
# incremental query joins on pre_processed.rentals.item_id — confirm that
# to_gbq actually writes the index to the table.
gbq.to_gbq(dataframe=df,
           destination_table='pre_processed.rentals',
           project_id='sibr-market',
           if_exists='replace' if replace else 'append')

100%|██████████| 1/1 [00:00<00:00, 8035.07it/s]


## HOUSING RENTALS

In [15]:
# Columns left out of the housing-only table.
drop_rentals = ['internal_area', 'gross_area', 'primary_area', 'dealer_True']
# Dummy columns marking the property types excluded from the housing table.
room_cols = ['property_type_Rom i bofellesskap', 'property_type_Hybel']

df_r = df.drop(columns=drop_rentals)
# Exclude every row flagged in either dummy column, then drop those columns,
# which carry no information for the remaining rows.
is_room = df_r[room_cols].eq(1).any(axis=1)
df_r = df_r.loc[~is_room].drop(columns=room_cols)

In [16]:
# Write the housing-only frame to BigQuery: overwrite the table on a full
# reload (`replace` truthy), otherwise append the new rows.
gbq.to_gbq(dataframe=df_r,
           destination_table='pre_processed.rentals_homes',
           project_id='sibr-market',
           if_exists='replace' if replace else 'append')

100%|██████████| 1/1 [00:00<00:00, 6213.78it/s]
