In [1]:
import re
import os
import yaml
import json
import pandas as pd
from google.cloud import bigquery
import google.api_core.exceptions as google_exceptions

In [2]:
config  = yaml.safe_load(open('config.yaml', 'r'))

In [3]:
query_dir = 'queries'

In [4]:
queries = {}
for query_file in (os.listdir(query_dir)):
    with open(os.path.join(query_dir, query_file), 'r') as query:
        queries[query_file] = query.read()

In [5]:
query_client = bigquery.Client()

In [6]:
def exists_table(table_reference, client):
    try:
        client.get_table(table_reference)
        return True
    except google_exceptions.NotFound:
        return False

In [7]:
if exists_table(f'{config["project_id"]}.{config["dataset"]}.building_attributes', query_client):
    request = queries['all_buildings_that_have_not_been_processed.sql']
else:
    request = queries['all_buildings.sql']

In [8]:
real_estate_raw = query_client.query(request).to_dataframe()

In [9]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 791 entries, 0 to 790
Data columns (total 4 columns):
ad_id             791 non-null int64
new_building      791 non-null bool
property_attrs    791 non-null object
address           789 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 19.4+ KB


In [10]:
real_estate_raw.set_index(keys='ad_id', drop=True, inplace=True)

In [11]:
real_estate_raw.head()

Unnamed: 0_level_0,new_building,property_attrs,address
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
71962400,True,"{""Areal"": ""129 m\u00b2"", ""Soverom"": ""3-3"", ""Bo...","Fjellhamar Torg, 1472 Fjellhamar"
76674446,True,"{""Fellesgjeld"": ""2\u00a0500\u00a0000 \u2013 3\...","Landstadsgate 13, 2004 Lillestrøm"
77962792,True,"{""Omkostninger"": ""22\u00a0665 kr"", ""Totalpris""...","Gamle Lommedalsvei 123, 1348 Rykkinn"
83008842,True,"{""Areal"": ""68 m\u00b2"", ""Soverom"": ""3-3"", ""Bol...","Mørtelverksbakken 27, 0580 Oslo"
87990816,True,"{""Omkostninger"": ""22\u00a0650 kr"", ""Totalpris""...","Røakollen - Aslakveien 20 - hus C, 0753 Oslo"


In [12]:
real_estate_raw['property_attrs'] = [json.loads(cell) for cell in real_estate_raw['property_attrs']]

In [13]:
all_attributes = ['Boligtype', 'Eieform', 'Soverom', 'Bruksareal',
                  'Primærrom', 'Totalpris', 'Omkostninger',
                  'Byggeår', 'Tomteareal', 'Bruttoareal',
                  'Formuesverdi', 'Energimerking', 'Felleskost/mnd.',
                  'Etasje', 'Rom', 'Fellesformue',
                  'Fellesgjeld', 'Eierskifte-forsikring']

In [14]:
all_attributes_map = {'Boligtype': 'property_type',
                      'Eieform': 'ownership_type',
                      'Soverom': 'num_bedrooms',
                      'Bruksareal': 'usable_area',
                      'Primærrom': 'primary_area',
                      'Totalpris': 'price',
                      'Omkostninger': 'brokerage_expenses',
                      'Byggeår': 'construction_year',
                      'Tomteareal': 'plot_area',
                      'Bruttoareal': 'total_size',
                      'Formuesverdi': 'wealth_value',
                      'Energimerking': 'energy_character',
                      'Felleskost/mnd.': 'common_expenses',
                      'Etasje': 'floor',
                      'Rom': 'num_rooms',
                      'Fellesformue': 'common_wealth',
                      'Fellesgjeld': 'common_debt',
                      'Eierskifte-forsikring': 'ownership_change_insurance'}

In [15]:
numeric_columns = ['num_bedrooms', 'usable_area', 'primary_area',
                   'price', 'brokerage_expenses', 'construction_year',
                   'plot_area', 'total_size', 'wealth_value',
                   'common_expenses', 'floor', 'num_rooms', 'common_wealth',
                   'common_debt']

In [16]:
def extract_property_attributes(data):
    to_append = []
    for ad_id, row in data.property_attrs.items():
        all_properties = []
        all_properties.append(ad_id)
        for key in all_attributes:
            all_properties.append(row.get(key))
        to_append.append(all_properties)
    extracted_data = pd.DataFrame(to_append, columns=['ad_id']+all_attributes)
    extracted_data.rename(columns=all_attributes_map, inplace=True)
    return extracted_data

In [17]:
def get_int_from_str(string):
    if string:
        try:
            concat_string = ''.join(re.findall(r'[\d.]', string))
            if concat_string != '':
                return float(concat_string)
        except Exception as e:
            print(e, string)
            pass
    else:
        return None

In [18]:
building_attributes = extract_property_attributes(real_estate_raw)

In [19]:
building_attributes = building_attributes.merge(real_estate_raw[['new_building']], how='left', left_on='ad_id', right_index=True)

In [20]:
for col in ['num_bedrooms', 'usable_area', 'primary_area',
            'price', 'brokerage_expenses', 'construction_year',
            'total_size', 'common_expenses', 'floor',
            'num_rooms', 'common_wealth',
            'ownership_change_insurance']:
    building_attributes[col] = [None if new_building is True else value
                                for new_building, value in zip(
                                    building_attributes['new_building'],
                                    building_attributes[col])]

In [21]:
building_attributes.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance,new_building
0,71962400,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,,True
1,76674446,Leilighet,Andel,,,,,,,,,,,,,,,2 500 000 – 3 450 000 kr,,True
2,77962792,Leilighet,Eier (Selveier),,,,,,,,,,B -rød,,,,,,,True
3,83008842,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,,True
4,87990816,Leilighet,Eier (Selveier),,,,,,,,,,C -lysegrønn,,,,,,,True


In [22]:
building_attributes_parsed = building_attributes.copy()
for col in numeric_columns:
    building_attributes_parsed[col] = [get_int_from_str(s) for s in building_attributes_parsed[col]]

In [23]:
building_attributes_parsed['ownership_change_insurance'] = [(lambda x: True if x == 'Ja' else False)(v)
                                                            for v
                                                            in building_attributes_parsed['ownership_change_insurance']]

In [24]:
building_attributes_parsed.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance,new_building
0,71962400,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,False,True
1,76674446,Leilighet,Andel,,,,,,,,,,,,,,,25000000000000.0,False,True
2,77962792,Leilighet,Eier (Selveier),,,,,,,,,,B -rød,,,,,,False,True
3,83008842,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,False,True
4,87990816,Leilighet,Eier (Selveier),,,,,,,,,,C -lysegrønn,,,,,,False,True


In [25]:
building_attributes_parsed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 865 entries, 0 to 790
Data columns (total 20 columns):
ad_id                         865 non-null int64
property_type                 863 non-null object
ownership_type                863 non-null object
num_bedrooms                  682 non-null float64
usable_area                   706 non-null float64
primary_area                  718 non-null float64
price                         687 non-null float64
brokerage_expenses            687 non-null float64
construction_year             714 non-null float64
plot_area                     685 non-null float64
total_size                    640 non-null float64
wealth_value                  575 non-null float64
energy_character              629 non-null object
common_expenses               644 non-null float64
floor                         612 non-null float64
num_rooms                     554 non-null float64
common_wealth                 462 non-null float64
common_debt                   453

In [26]:
building_attributes_parsed.drop_duplicates(inplace=True)

In [27]:
building_attributes_parsed.shape

(761, 20)

In [28]:
building_attributes_parsed.to_gbq(destination_table='housing_data.building_attributes',
                                  project_id='hde-test-clean',
                                  if_exists='append')