In [1]:
import re
import os
import yaml
import json
import pandas as pd
from google.cloud import bigquery
import google.api_core.exceptions as google_exceptions

In [2]:
config  = yaml.safe_load(open('config.yaml', 'r'))

In [3]:
query_dir = 'queries'

In [4]:
queries = {}
for query_file in (os.listdir(query_dir)):
    with open(os.path.join(query_dir, query_file), 'r') as query:
        queries[query_file] = query.read()

In [5]:
query_client = bigquery.Client()

In [6]:
def exists_table(table_reference, client):
    try:
        client.get_table(table_reference)
        return True
    except google_exceptions.NotFound:
        return False

In [7]:
if exists_table(f'{config["project_id"]}.{config["dataset"]}.building_attributes', query_client):
    request = queries['all_buildings_that_have_not_been_processed.sql']
else:
    request = queries['all_buildings.sql']

In [9]:
real_estate_raw = query_client.query(request).to_dataframe()

In [10]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 4 columns):
ad_id             466 non-null int64
new_building      466 non-null bool
property_attrs    466 non-null object
address           465 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 11.5+ KB


In [11]:
real_estate_raw.set_index(keys='ad_id', drop=True, inplace=True)

In [12]:
real_estate_raw.head()

Unnamed: 0_level_0,new_building,property_attrs,address
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
78653360,True,"{""Omkostninger"": ""20\u00a0406 \u2013 22\u00a01...","Strandvegen 1, 2005 Rælingen"
78866228,True,"{""Fellesgjeld"": ""0 kr"", ""Areal"": ""70\u00a0-\u0...","FLÅTESTADVEIEN 3, 1415 Oppegård"
78866228,True,"{""Fellesgjeld"": ""0 kr"", ""Areal"": ""70\u00a0-\u0...","FLÅTESTADVEIEN 3, 1415 Oppegård"
82173385,True,"{""Fellesgjeld"": ""3\u00a0200\u00a0000 \u2013 4\...","Røakollen - Aslakveien 20 - hus B, 0753 Oslo"
84047772,True,"{""Fellesgjeld"": ""3\u00a0575\u00a0000 \u2013 4\...","Røakollen - Aslakveien 20 - Hus A, 0753 Oslo"


In [13]:
real_estate_raw['property_attrs'] = [json.loads(cell) for cell in real_estate_raw['property_attrs']]

In [14]:
all_attributes = ['Boligtype', 'Eieform', 'Soverom', 'Bruksareal',
                  'Primærrom', 'Totalpris', 'Omkostninger',
                  'Byggeår', 'Tomteareal', 'Bruttoareal',
                  'Formuesverdi', 'Energimerking', 'Felleskost/mnd.',
                  'Etasje', 'Rom', 'Fellesformue',
                  'Fellesgjeld', 'Eierskifte-forsikring']

In [15]:
all_attributes_map = {'Boligtype': 'property_type',
                      'Eieform': 'ownership_type',
                      'Soverom': 'num_bedrooms',
                      'Bruksareal': 'usable_area',
                      'Primærrom': 'primary_area',
                      'Totalpris': 'price',
                      'Omkostninger': 'brokerage_expenses',
                      'Byggeår': 'construction_year',
                      'Tomteareal': 'plot_area',
                      'Bruttoareal': 'total_size',
                      'Formuesverdi': 'wealth_value',
                      'Energimerking': 'energy_character',
                      'Felleskost/mnd.': 'common_expenses',
                      'Etasje': 'floor',
                      'Rom': 'num_rooms',
                      'Fellesformue': 'common_wealth',
                      'Fellesgjeld': 'common_debt',
                      'Eierskifte-forsikring': 'ownership_change_insurance'}

In [16]:
numeric_columns = ['num_bedrooms', 'usable_area', 'primary_area',
                   'price', 'brokerage_expenses', 'construction_year',
                   'plot_area', 'total_size', 'wealth_value',
                   'common_expenses', 'floor', 'num_rooms', 'common_wealth',
                   'common_debt']

In [17]:
def extract_property_attributes(data):
    to_append = []
    for ad_id, row in data.property_attrs.items():
        all_properties = []
        all_properties.append(ad_id)
        for key in all_attributes:
            all_properties.append(row.get(key))
        to_append.append(all_properties)
    extracted_data = pd.DataFrame(to_append, columns=['ad_id']+all_attributes)
    extracted_data.rename(columns=all_attributes_map, inplace=True)
    return extracted_data

In [18]:
def get_int_from_str(string):
    if string:
        try:
            concat_string = ''.join(re.findall(r'[\d.]', string))
            if concat_string != '':
                return float(concat_string)
        except Exception as e:
            print(e, string)
            pass
    else:
        return None

In [19]:
building_attributes = extract_property_attributes(real_estate_raw)

In [20]:
building_attributes = building_attributes.merge(real_estate_raw[['new_building']], how='left', left_on='ad_id', right_index=True)

In [21]:
for col in ['num_bedrooms', 'usable_area', 'primary_area',
            'price', 'brokerage_expenses', 'construction_year',
            'total_size', 'common_expenses', 'floor',
            'num_rooms', 'common_wealth',
            'ownership_change_insurance']:
    building_attributes[col] = [None if new_building is True else value
                                for new_building, value in zip(
                                    building_attributes['new_building'],
                                    building_attributes[col])]

In [22]:
building_attributes.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance,new_building
0,78653360,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,,True
1,78866228,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,0 kr,,True
1,78866228,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,0 kr,,True
2,78866228,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,0 kr,,True
2,78866228,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,0 kr,,True


In [23]:
building_attributes_parsed = building_attributes.copy()
for col in numeric_columns:
    building_attributes_parsed[col] = [get_int_from_str(s) for s in building_attributes_parsed[col]]

In [24]:
building_attributes_parsed['ownership_change_insurance'] = [(lambda x: True if x == 'Ja' else False)(v)
                                                            for v
                                                            in building_attributes_parsed['ownership_change_insurance']]

In [25]:
building_attributes_parsed.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance,new_building
0,78653360,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,False,True
1,78866228,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,0.0,False,True
1,78866228,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,0.0,False,True
2,78866228,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,0.0,False,True
2,78866228,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,0.0,False,True


In [26]:
building_attributes_parsed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 562 entries, 0 to 465
Data columns (total 20 columns):
ad_id                         562 non-null int64
property_type                 561 non-null object
ownership_type                561 non-null object
num_bedrooms                  391 non-null float64
usable_area                   399 non-null float64
primary_area                  403 non-null float64
price                         389 non-null float64
brokerage_expenses            388 non-null float64
construction_year             406 non-null float64
plot_area                     387 non-null float64
total_size                    371 non-null float64
wealth_value                  308 non-null float64
energy_character              349 non-null object
common_expenses               287 non-null float64
floor                         276 non-null float64
num_rooms                     257 non-null float64
common_wealth                 179 non-null float64
common_debt                   241

In [27]:
building_attributes_parsed.drop_duplicates(inplace=True)

In [28]:
building_attributes_parsed.shape

(418, 20)

In [29]:
building_attributes_parsed.to_gbq(destination_table='housing_data.building_attributes',
                                  project_id='hde-test-clean',
                                  if_exists='append')