In [359]:
import re
import os
import yaml
import json
import pandas as pd
from google.cloud import bigquery
import google.api_core.exceptions as google_exceptions

In [360]:
# Load the notebook configuration. The context manager closes the file
# handle as soon as the YAML has been parsed instead of leaving it open
# until garbage collection.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [361]:
# Directory (relative to the working directory) whose files are read into
# the `queries` dict; the notebook looks up '*.sql' entries from it.
query_dir = 'queries'

In [362]:
# Read every query file into memory, keyed by its file name
# (e.g. 'all_buildings.sql').
queries = {}
for query_file in sorted(os.listdir(query_dir)):
    query_path = os.path.join(query_dir, query_file)
    # os.listdir also returns sub-directories (for instance the
    # '.ipynb_checkpoints' folder Jupyter may create); trying to open one
    # would raise, so only regular files are loaded.
    if not os.path.isfile(query_path):
        continue
    with open(query_path, 'r') as query:
        queries[query_file] = query.read()

In [364]:
# BigQuery client built with no explicit arguments; it is reused below for
# the table-existence check and for running the selected query.
query_client = bigquery.Client()

In [365]:
def exists_table(table_reference, client):
    """Report whether `table_reference` can be fetched through `client`.

    Parameters
    ----------
    table_reference
        Table identifier passed unchanged to ``client.get_table``.
    client
        Object exposing ``get_table`` (a BigQuery client in this notebook).

    Returns
    -------
    bool
        ``False`` when the lookup raises ``NotFound``, ``True`` when it
        completes. Any other exception propagates to the caller.
    """
    try:
        client.get_table(table_reference)
    except google_exceptions.NotFound:
        return False
    else:
        return True

In [366]:
# Pick the query to run: when the building_attributes table already exists
# only the not-yet-processed buildings are requested, otherwise all of them.
attributes_table = f'{config["project_id"]}.{config["dataset"]}.building_attributes'
query_name = ('all_buildings_that_have_not_been_processed.sql'
              if exists_table(attributes_table, query_client)
              else 'all_buildings.sql')
request = queries[query_name]

In [368]:
# Run the selected query and materialise the result as a pandas DataFrame.
real_estate_raw = query_client.query(request).to_dataframe()

In [369]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
ad_id             50 non-null int64
new_building      50 non-null bool
property_attrs    50 non-null object
address           50 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 1.3+ KB


In [370]:
# Index the raw frame by ad_id. The guard keeps the cell re-runnable: once
# 'ad_id' has become the index it is no longer a column, and calling
# set_index a second time would raise KeyError.
if real_estate_raw.index.name != 'ad_id':
    real_estate_raw = real_estate_raw.set_index(keys='ad_id', drop=True)

In [371]:
real_estate_raw.head()

Unnamed: 0_level_0,new_building,property_attrs,address
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
116691931,True,"{""Omkostninger"": ""9\u00a0238 \u2013 11\u00a046...","MARKVEGEN 1, 2052 Jessheim"
118944563,True,"{""Areal"": ""118\u00a0-\u00a0141 m\u00b2"", ""Sove...","Grøntveien 14, 1555 Son"
132238709,True,"{""Omkostninger"": ""87\u00a0472 \u2013 118\u00a0...","Svaleveien 6, 1404 Siggerud"
132521410,False,"{""Omkostninger"": ""47\u00a0400 kr"", ""Totalpris""...","Jon Smørs vei 13D, 1397 Nesøya"
134510449,True,"{""Omkostninger"": ""20\u00a0374 \u2013 33\u00a06...","Sydblokka, Torshovhøyden, 0477 Oslo"


In [373]:
# Decode the JSON-encoded attribute strings into Python objects. Cells that
# are already decoded (not str) are kept as they are, so re-running this
# cell does not fail with "the JSON object must be str, bytes or bytearray".
real_estate_raw['property_attrs'] = [
    json.loads(cell) if isinstance(cell, str) else cell
    for cell in real_estate_raw['property_attrs']
]

In [374]:
# Keys looked up in each ad's decoded property_attrs dict by
# extract_property_attributes; the order here is the column order of the
# extracted frame (after the leading 'ad_id' column).
all_attributes = ['Boligtype', 'Eieform', 'Soverom', 'Bruksareal',
                  'Primærrom', 'Totalpris', 'Omkostninger',
                  'Byggeår', 'Tomteareal', 'Bruttoareal',
                  'Formuesverdi', 'Energimerking', 'Felleskost/mnd.',
                  'Etasje', 'Rom', 'Fellesformue',
                  'Fellesgjeld', 'Eierskifte-forsikring']

In [375]:
# Maps each source attribute key (as it appears in property_attrs) to the
# English column name used in the extracted frame; applied via
# DataFrame.rename in extract_property_attributes.
all_attributes_map = {'Boligtype': 'property_type',
                      'Eieform': 'ownership_type',
                      'Soverom': 'num_bedrooms',
                      'Bruksareal': 'usable_area',
                      'Primærrom': 'primary_area',
                      'Totalpris': 'price',
                      'Omkostninger': 'brokerage_expenses',
                      'Byggeår': 'construction_year',
                      'Tomteareal': 'plot_area',
                      'Bruttoareal': 'total_size',
                      'Formuesverdi': 'wealth_value',
                      'Energimerking': 'energy_character',
                      'Felleskost/mnd.': 'common_expenses',
                      'Etasje': 'floor',
                      'Rom': 'num_rooms',
                      'Fellesformue': 'common_wealth',
                      'Fellesgjeld': 'common_debt',
                      'Eierskifte-forsikring': 'ownership_change_insurance'}

In [376]:
# Renamed columns whose raw text values are converted to floats with
# get_int_from_str when building building_attributes_parsed.
numeric_columns = ['num_bedrooms', 'usable_area', 'primary_area',
                   'price', 'brokerage_expenses', 'construction_year',
                   'plot_area', 'total_size', 'wealth_value',
                   'common_expenses', 'floor', 'num_rooms', 'common_wealth',
                   'common_debt']

In [377]:
def extract_property_attributes(data, attributes=None, attribute_map=None):
    """Flatten the per-ad attribute dicts into one column per attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``property_attrs`` column holding dict-like objects
        (or ``None``); the frame's index values are emitted as ``ad_id``.
    attributes : sequence of str, optional
        Keys to pull out of every dict, in output column order. Defaults
        to the notebook-level ``all_attributes``.
    attribute_map : dict, optional
        Mapping from source key to output column name. Defaults to the
        notebook-level ``all_attributes_map``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with an ``ad_id`` column followed by one
        column per requested attribute (renamed through ``attribute_map``).
        Attributes missing from a row's dict are ``None``.
    """
    # Resolve defaults at call time so later edits to the notebook globals
    # are picked up without redefining this function.
    if attributes is None:
        attributes = all_attributes
    if attribute_map is None:
        attribute_map = all_attributes_map

    records = []
    for ad_id, row in data.property_attrs.items():
        # A row with no attribute dict yields an all-None record rather
        # than failing on `None.get`.
        attrs = {} if row is None else row
        records.append([ad_id] + [attrs.get(key) for key in attributes])

    extracted_data = pd.DataFrame(records, columns=['ad_id'] + list(attributes))
    return extracted_data.rename(columns=attribute_map)

In [378]:
def get_int_from_str(string):
    """Extract a number from free text by keeping only digits and dots.

    Despite the name, the result is a ``float``.

    Parameters
    ----------
    string : str or number or None
        Raw value such as ``'6 037 400 kr'``. Values that are already
        numeric are passed through as floats.

    Returns
    -------
    float or None
        The parsed number, or ``None`` when the input is missing (``None``,
        NaN, empty string), contains no digits, is of an unsupported type,
        or the collected characters do not form a valid float.

    Notes
    -----
    All digit/dot characters are concatenated, so a text containing several
    numbers (for example a range) collapses into a single number.
    """
    if string is None or isinstance(string, bool):
        return None
    if isinstance(string, (int, float)):
        # NaN is the only float that differs from itself; treat it as missing.
        return None if string != string else float(string)
    if not isinstance(string, str):
        return None

    concat_string = ''.join(re.findall(r'[\d.]', string))
    if concat_string == '':
        return None
    try:
        return float(concat_string)
    except ValueError as e:
        # E.g. several dots in the text; report it and treat as missing.
        print(e, string)
        return None

In [379]:
# Flatten the decoded property_attrs dicts into one column per attribute.
building_attributes = extract_property_attributes(real_estate_raw)

In [380]:
# Attach the new_building flag from the raw frame (indexed by ad_id).
# Dropping any previously merged 'new_building' column first keeps the cell
# re-runnable; otherwise a second run would produce 'new_building_x' /
# 'new_building_y' and the next cell's building_attributes['new_building']
# lookup would fail.
building_attributes = (
    building_attributes
    .drop(columns='new_building', errors='ignore')
    .merge(real_estate_raw[['new_building']], how='left', left_on='ad_id', right_index=True)
)

In [381]:
# For rows flagged as new buildings, blank out every listed attribute
# column (set it to None); all other rows keep their value.
new_building_flags = [flag is True for flag in building_attributes['new_building']]
columns_to_blank = ['num_bedrooms', 'usable_area', 'primary_area',
                    'price', 'brokerage_expenses', 'construction_year',
                    'plot_area', 'total_size', 'wealth_value',
                    'energy_character', 'common_expenses', 'floor',
                    'num_rooms', 'common_wealth', 'common_debt',
                    'ownership_change_insurance']
for column in columns_to_blank:
    current_values = building_attributes[column]
    building_attributes[column] = [
        None if is_new else value
        for is_new, value in zip(new_building_flags, current_values)
    ]

In [382]:
building_attributes.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance,new_building
0,116691931,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,,True
1,118944563,Enebolig,Eier (Selveier),,,,,,,,,,,,,,,,,True
2,132238709,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,,True
3,132521410,Enebolig,Eier (Selveier),2.0,80 m²,74 m²,6 037 400 kr,47 400 kr,2019.0,900 m² (eiet),,,C -oransje,,,,,,,False
4,134510449,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,,True


In [383]:
# Work on a copy so building_attributes keeps the raw text values, then
# convert every numeric column element-wise with get_int_from_str.
building_attributes_parsed = building_attributes.copy()
for column in numeric_columns:
    raw_values = building_attributes_parsed[column]
    building_attributes_parsed[column] = list(map(get_int_from_str, raw_values))

In [384]:
# Turn the textual answer into a boolean: True only for the exact value
# 'Ja', False for everything else (including missing values).
building_attributes_parsed['ownership_change_insurance'] = [
    bool(answer == 'Ja')
    for answer in building_attributes_parsed['ownership_change_insurance']
]

In [385]:
building_attributes_parsed.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance,new_building
0,116691931,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,False,True
1,118944563,Enebolig,Eier (Selveier),,,,,,,,,,,,,,,,False,True
2,132238709,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,False,True
3,132521410,Enebolig,Eier (Selveier),2.0,80.0,74.0,6037400.0,47400.0,2019.0,900.0,,,C -oransje,,,,,,False,False
4,134510449,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,False,True


In [386]:
building_attributes_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 20 columns):
ad_id                         50 non-null int64
property_type                 50 non-null object
ownership_type                50 non-null object
num_bedrooms                  36 non-null float64
usable_area                   35 non-null float64
primary_area                  36 non-null float64
price                         35 non-null float64
brokerage_expenses            35 non-null float64
construction_year             36 non-null float64
plot_area                     34 non-null float64
total_size                    32 non-null float64
wealth_value                  29 non-null float64
energy_character              25 non-null object
common_expenses               25 non-null float64
floor                         26 non-null float64
num_rooms                     23 non-null float64
common_wealth                 12 non-null float64
common_debt                   20 non-null float64
ow

In [387]:
# Remove fully identical rows before the upload.
building_attributes_parsed = building_attributes_parsed.drop_duplicates()

In [388]:
building_attributes_parsed.shape

(50, 20)

In [389]:
# Append the parsed attributes to the same table whose existence decided
# which query was run above. Project and dataset come from config.yaml
# instead of being hard-coded, so the existence check and the write cannot
# drift apart.
# NOTE(review): if_exists='append' means re-running this cell uploads the
# same rows again.
building_attributes_parsed.to_gbq(destination_table=f'{config["dataset"]}.building_attributes',
                                  project_id=config["project_id"],
                                  if_exists='append')