In [295]:
import re
import yaml
import json
import pandas as pd
from google.cloud import bigquery

In [296]:
config  = yaml.safe_load(open('config.yaml', 'r'))

In [297]:
query_client = bigquery.Client()

In [299]:
request = """
SELECT
    * 
FROM
    `hde-test-clean.housing_data.real_estate_raw`
"""
real_estate_raw = query_client.query(request).to_dataframe()

In [300]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 11 columns):
ad_id                1470 non-null int64
ad_url               1470 non-null object
new_building         1470 non-null bool
short_description    1470 non-null object
full_description     1470 non-null object
property_table       1470 non-null object
main_price           1470 non-null object
property_attrs       1470 non-null object
address              1470 non-null object
datetime_viewed      1470 non-null datetime64[ns]
datetime_offset      1470 non-null object
dtypes: bool(1), datetime64[ns](1), int64(1), object(8)
memory usage: 116.4+ KB


In [301]:
real_estate_raw.set_index(keys='ad_id', drop=True, inplace=True)

In [302]:
real_estate_raw.head()

Unnamed: 0_level_0,ad_url,new_building,short_description,full_description,property_table,main_price,property_attrs,address,datetime_viewed,datetime_offset
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
150295217,https://www.finn.no/realestate/homes/ad.html?f...,False,Nydelig villa fra 2010 - Herlige terrasser og ...,"Lys, romslig og moderne enebolig over tre plan...",{},18 000 000 kr,"{""Omkostninger"": ""465\u00a0222 kr"", ""Totalpris...","Heimlibakken 4, 0198 Oslo",2019-06-17 08:30:38,UTC
147528692,https://www.finn.no/realestate/homes/ad.html?f...,False,Romslig 3-roms hjørneleilighet - Moderne bygg ...,Proaktiv Properties ved Remi Bangsund har gled...,{},4 600 000 kr,"{""Omkostninger"": ""12\u00a0520 kr"", ""Totalpris""...","Paal Bergs vei 6, 0692 Oslo",2019-06-17 08:31:10,UTC
129682033,https://www.finn.no/realestate/newbuildings/ad...,True,Vestby/Solehøyden: 12 SOLGT! Unike selveierlei...,Om prosjektetSolehøyden er et nytt og spennend...,"{""Bolig"": [""01"", ""02"", ""03"", ""04"", ""05"", ""07"",...",3 200 000 kr6 100 000 kr,"{""Areal"": ""57\u00a0-\u00a093 m\u00b2"", ""Sovero...","SOLEHØYDEN, 1540 Vestby",2019-06-17 08:30:51,UTC
109234698,https://www.finn.no/realestate/newbuildings/ad...,True,Sentrumshagen - Ferdigstilte selveierleilighet...,OppdragsansvarligPrivatMegleren Nyeboliger AS ...,"{""Bolig"": [""C103"", ""C304"", ""C404"", ""C504"", ""D1...",4 125 000 kr5 365 000 kr,"{""Omkostninger"": ""23\u00a0259 \u2013 27\u00a03...","Sentrumshagen-Jessheim, 2050 Jessheim",2019-06-17 08:30:52,UTC
150241968,https://www.finn.no/realestate/homes/ad.html?f...,False,Innholdsrik villa m/arkitektur - 5 soverom - A...,Velkommen til Fridtjof Nansens vei 8 - Børre G...,{},16 900 000 kr,"{""Omkostninger"": ""439\u00a0722 kr"", ""Totalpris...","Fridtjof Nansens vei 8, 1366 Lysaker",2019-06-17 08:30:58,UTC


In [303]:
for col in ['property_table', 'property_attrs']:
    real_estate_raw[col] = [json.loads(cell) for cell in real_estate_raw[col]]

In [304]:
all_attributes = ['Boligtype', 'Eieform', 'Soverom', 'Bruksareal',
                  'Primærrom', 'Totalpris', 'Omkostninger',
                  'Byggeår', 'Tomteareal', 'Bruttoareal',
                  'Formuesverdi', 'Energimerking', 'Felleskost/mnd.',
                  'Etasje', 'Rom', 'Fellesformue',
                  'Fellesgjeld', 'Eierskifte-forsikring']

In [305]:
all_attributes_map = {'Boligtype': 'property_type',
                      'Eieform': 'ownership_type',
                      'Soverom': 'num_bedrooms',
                      'Bruksareal': 'usable_area',
                      'Primærrom': 'primary_area',
                      'Totalpris': 'price',
                      'Omkostninger': 'brokerage_expenses',
                      'Byggeår': 'construction_year',
                      'Tomteareal': 'plot_area',
                      'Bruttoareal': 'total_size',
                      'Formuesverdi': 'wealth_value',
                      'Energimerking': 'energy_character',
                      'Felleskost/mnd.': 'common_expenses',
                      'Etasje': 'floor',
                      'Rom': 'num_rooms',
                      'Fellesformue': 'common_wealth',
                      'Fellesgjeld': 'common_debt',
                      'Eierskifte-forsikring': 'ownership_change_insurance'}

In [306]:
numeric_columns = ['num_bedrooms', 'usable_area', 'primary_area',
                   'price', 'brokerage_expenses', 'construction_year',
                   'plot_area', 'total_size', 'wealth_value',
                   'common_expenses', 'floor', 'num_rooms', 'common_wealth',
                   'common_debt']

In [307]:
def extract_property_attributes(data):
    to_append = []
    for ad_id, row in data.property_attrs.items():
        all_properties = []
        all_properties.append(ad_id)
        for key in all_attributes:
            all_properties.append(row.get(key))
        to_append.append(all_properties)
    extracted_data = pd.DataFrame(to_append, columns=['ad_id']+all_attributes)
    extracted_data.rename(columns=all_attributes_map, inplace=True)
    return extracted_data

In [308]:
def get_int_from_str(string):
    if string:
        try:
            concat_string = ''.join(re.findall(r'[\d.]', string))
            if concat_string != '':
                return float(concat_string)
        except Exception as e:
            print(e, string)
            pass
    else:
        return None

In [309]:
building_attributes = extract_property_attributes(real_estate_raw)

In [310]:
building_attributes = building_attributes.merge(real_estate_raw[['new_building']], how='left', left_on='ad_id', right_index=True)

In [312]:
for col in ['num_bedrooms', 'usable_area', 'primary_area',
            'price', 'brokerage_expenses', 'construction_year',
            'plot_area', 'total_size', 'wealth_value',
            'energy_character', 'common_expenses', 'floor',
            'num_rooms', 'common_wealth', 'common_debt',
            'ownership_change_insurance']:
    building_attributes[col] = [None if new_building is True else value
                                for new_building, value in zip(
                                    building_attributes['new_building'],
                                    building_attributes[col])]

In [313]:
building_attributes.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance,new_building
0,150295217,Enebolig,Eier (Selveier),4.0,200 m²,196 m²,18 465 222 kr,465 222 kr,2010.0,731 m² (eiet),220 m²,3 143 130 kr,C -oransje,,,6.0,,,,False
1,147528692,Leilighet,Andel,2.0,84 m²,75 m²,4 612 520 kr,12 520 kr,2004.0,5842 m² (eiet),91 m²,1 135 669 kr,E -mørkegrønn,3 244 kr,2.0,,,,,False
2,129682033,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,,True
3,109234698,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,,True
4,150241968,Enebolig,Eier (Selveier),5.0,268 m²,268 m²,17 339 722 kr,439 722 kr,1981.0,2011 m² (eiet),295 m²,2 754 958 kr,F -oransje,,2.0,8.0,,,,False


In [314]:
building_attributes_parsed = building_attributes.copy()
for col in numeric_columns:
    building_attributes_parsed[col] = [get_int_from_str(s) for s in building_attributes_parsed[col]]

In [315]:
building_attributes_parsed['ownership_change_insurance'] = [(lambda x: True if x == 'Ja' else False)(v)
                                                            for v
                                                            in building_attributes_parsed['ownership_change_insurance']]

In [316]:
building_attributes_parsed.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance,new_building
0,150295217,Enebolig,Eier (Selveier),4.0,200.0,196.0,18465222.0,465222.0,2010.0,731.0,220.0,3143130.0,C -oransje,,,6.0,,,False,False
1,147528692,Leilighet,Andel,2.0,84.0,75.0,4612520.0,12520.0,2004.0,5842.0,91.0,1135669.0,E -mørkegrønn,3244.0,2.0,,,,False,False
2,129682033,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,False,True
3,109234698,Leilighet,Eier (Selveier),,,,,,,,,,,,,,,,False,True
4,150241968,Enebolig,Eier (Selveier),5.0,268.0,268.0,17339722.0,439722.0,1981.0,2011.0,295.0,2754958.0,F -oransje,,2.0,8.0,,,False,False


In [317]:
building_attributes_parsed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3110 entries, 0 to 1469
Data columns (total 20 columns):
ad_id                         3110 non-null int64
property_type                 3110 non-null object
ownership_type                3110 non-null object
num_bedrooms                  2755 non-null float64
usable_area                   2828 non-null float64
primary_area                  2826 non-null float64
price                         2651 non-null float64
brokerage_expenses            2638 non-null float64
construction_year             2827 non-null float64
plot_area                     2771 non-null float64
total_size                    2585 non-null float64
wealth_value                  2514 non-null float64
energy_character              2327 non-null object
common_expenses               2081 non-null float64
floor                         2043 non-null float64
num_rooms                     1878 non-null float64
common_wealth                 1583 non-null float64
common_debt   

In [318]:
building_attributes_parsed.drop_duplicates(inplace=True)

In [319]:
building_attributes_parsed.shape

(867, 20)

In [261]:
building_attributes_parsed.to_gbq(destination_table='housing_data.building_attributes',
                                  project_id='hde-test-clean',
                                  if_exists='append')