## PRE-PROCESS HOMES

In [3]:
import re
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq as gbq
from google.cloud import bigquery
pd.set_option('display.max_columns', 500)
replace = False

In [4]:
if replace:
    sql = '''
            SELECT m.* FROM sibr-market.clean.homes m
    '''
else:
    sql = '''
            SELECT m.* FROM sibr-market.clean.homes m
            LEFT JOIN sibr-market.pre_processed.homes_rentals a USING(item_id)
            WHERE a.item_id IS NULL
    '''

try:
    client = bigquery.Client()
    job = client.query(sql)
    df_ = job.to_dataframe()
    if len(df_) == 0:
        raise ValueError('No results for query')
    print(f'length of df: {len(df_)}')
except:
    raise ValueError(f'Could not initiate data!')

length of df: 52551


## Set index and remove empty data

In [5]:
df = df_.dropna(subset='item_id')
df.set_index('item_id',inplace=True)

In [6]:
df.dropna(subset=['price','usable_area','bedrooms'],inplace=True)

drop = ['district','address','title','sold',
        'description','email','contact_person','phone','url','new','country',
        'facilities','energy_rating','internal_area',
        'rn','FIRST','LAST','postal_code','municipality','county','region','last_updated','salgstid','facilities','tax_value',
        'total_price','price_pr_bedroom','price_pr_sqm','web','cadastral_num','unit_num','section_num']

df.drop(columns=drop,inplace=True)

## DUMMY VARIABLES

In [7]:
df['dealer'] = df['dealer'].apply(lambda x: False if x.lower() == 'private' else True)

In [8]:
#df['energy_letter'] = df['energy_rating'].apply(lambda x: x.split(" - ")[0] if isinstance(x,str) and x.split(" - ") else x)
#df['energy_color'] = df['energy_rating'].apply(lambda x: x.split(" - ")[1] if isinstance(x,str) and len(x.split(" - "))>1 else x)
#df.drop('energy_rating',axis=1,inplace=True)

In [9]:
dummy_cols = ['dealer','fixer-upper',
              #'energy_letter','energy_color',
              'ownership_type','property_type']

for col in dummy_cols:
    for cat, num in df[col].value_counts(normalize = True).items():
        if num < 0.01:
            df = df[df[col] != cat]
dummy_cols.remove('property_type')
df = pd.get_dummies(df,columns=dummy_cols,drop_first=True)

## DATE COLUMNS

In [10]:
df['day'] = df['scrape_date'].dt.day
df['month'] = df['scrape_date'].dt.month
df['year'] = df['scrape_date'].dt.year
df.drop('scrape_date',axis=1,inplace=True)

## SPLIT INTO APARTMENTS AND HOUSES AND RENTALS

In [11]:
df_a = df[df['property_type'] == 'Leilighet']
df_h = df[df['property_type'] != 'Leilighet']
rental_cols = ['property_type','bedrooms','floor','usable_area','day','month','year','sqm_pr_bedroom']
df_r = df[rental_cols]

## APARTMENTS

In [12]:
df_a = pd.get_dummies(df_a,columns=['property_type'],drop_first=True)
df_a.loc[:, 'joint_debt'] = df_a['joint_debt'].fillna(0)
df_a.loc[:, 'collective_assets'] = df_a['collective_assets'].fillna(0)
df_a.loc[:, 'balcony'] = df_a['balcony'].fillna(0)
df_a.loc[:, 'floor'] = df_a['floor'].fillna(0)
df_a.loc[:, 'rooms'] = df_a['rooms'].fillna(df_a['bedrooms']+1)
df_a.loc[:, 'external_area'] = df_a['external_area'].fillna(0)
df_a.loc[:, 'monthly_common_cost'] = df_a['monthly_common_cost'].fillna(0)

In [13]:
if replace:
    gbq.to_gbq(dataframe = df_a,
               destination_table='pre_processed.homes_apartments',
               project_id = 'sibr-market',
               if_exists='replace')
else:
    gbq.to_gbq(dataframe = df_a,
               destination_table='pre_processed.homes_apartments',
               project_id = 'sibr-market',
                if_exists='append')

100%|██████████| 1/1 [00:00<00:00, 9962.72it/s]


## HOUSES

In [14]:
drop = ['collective_assets','joint_debt','balcony','floor',
        'monthly_common_cost','rooms','external_area']
df_h.drop(columns=drop,inplace=True)
df_h = pd.get_dummies(df_h,columns=['property_type'],drop_first=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h.drop(columns=drop,inplace=True)


In [15]:
if replace:
    gbq.to_gbq(dataframe = df_h,
               destination_table='pre_processed.homes_houses',
               project_id = 'sibr-market',
               if_exists='replace')
else:
    gbq.to_gbq(dataframe = df_h,
               destination_table='pre_processed.homes_houses',
               project_id = 'sibr-market',
                if_exists='append')

100%|██████████| 1/1 [00:00<00:00, 11275.01it/s]


## RENTAL PREDICTION

In [16]:
df_r = pd.get_dummies(df_r,columns=['property_type'],drop_first=True)

In [17]:
if replace:
    gbq.to_gbq(dataframe = df_r,
           destination_table='pre_processed.homes_rentals',
           project_id = 'sibr-market',
           if_exists='replace')
else:
    gbq.to_gbq(dataframe = df_r,
               destination_table='pre_processed.homes_rentals',
               project_id = 'sibr-market',
                if_exists='append')

100%|██████████| 1/1 [00:00<00:00, 2592.28it/s]
