In [262]:
import re
import yaml
import json
import pandas as pd
from google.cloud import bigquery

In [264]:
# Read the notebook settings from config.yaml. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it open for the
# lifetime of the kernel.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [235]:
# BigQuery client used by the query cell below to run SQL.
# NOTE(review): no project or credentials are passed here, so they are
# presumably resolved from the environment — confirm.
query_client = bigquery.Client()

In [236]:
# Select every column of every row (no filter) from the raw real-estate table
# and materialise the result as a DataFrame.
request = """
SELECT
    *
FROM
    `hde-test-clean.housing_data.real_estate_raw`
"""
real_estate_raw = query_client.query(request).to_dataframe()

In [237]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1281 entries, 0 to 1280
Data columns (total 11 columns):
ad_id                1281 non-null int64
ad_url               1281 non-null object
new_building         1281 non-null bool
short_description    1281 non-null object
full_description     1281 non-null object
property_table       1281 non-null object
main_price           1281 non-null object
property_attrs       1281 non-null object
address              1281 non-null object
datetime_viewed      1281 non-null datetime64[ns]
datetime_offset      1281 non-null object
dtypes: bool(1), datetime64[ns](1), int64(1), object(8)
memory usage: 101.4+ KB


In [238]:
# Index the frame by ad_id. The guard plus the non-mutating call make this
# cell safe to re-run: after the first execution 'ad_id' is the index and no
# longer a column, so an unguarded set_index would raise KeyError.
if real_estate_raw.index.name != 'ad_id':
    real_estate_raw = real_estate_raw.set_index(keys='ad_id', drop=True)

In [239]:
real_estate_raw.head()

Unnamed: 0_level_0,ad_url,new_building,short_description,full_description,property_table,main_price,property_attrs,address,datetime_viewed,datetime_offset
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
130898040,https://www.finn.no/realestate/homes/ad.html?f...,False,Sentral og moderne 3-roms selveierleilighet me...,Kort om leiligheten:- MODERNE LEILIGHET FRA 20...,{},5 200 000 kr,"{""Omkostninger"": ""138\u00a0722 kr"", ""Totalpris...","Fredtunveien 8, 1386 Asker",2019-06-09 10:36:02,UTC
126578832,https://www.finn.no/realestate/newbuildings/ad...,True,SENNERUDTOPPEN - ARBEIDENE ER I GANG! 87 lyse ...,FremdriftArbeidene er i gang! Ta kontakt for t...,"{""Bolig"": [""1-207"", ""1-307"", ""1-308"", ""1-407"",...",2 790 000 kr7 890 000 kr,"{""Areal"": ""45\u00a0-\u00a0125 m\u00b2"", ""Sover...","Sennerudtoppen Bolig AS, 1920 Sørumsand",2019-06-09 10:30:23,UTC
135914307,https://www.finn.no/realestate/newbuildings/ad...,True,Dovrekvartalet - Spennende boligprosjekt for v...,Prosjektets beskrivelseUtbyggers visjonFor å s...,"{""Bolig"": [""101"", ""102"", ""103"", ""104"", ""105"", ...",1 600 000 kr4 900 000 kr,"{""Fellesgjeld"": ""1\u00a0600\u00a0000 \u2013 4\...","Landstadsgate 13 - D5, 2000 Lillestrøm",2019-06-09 10:39:41,UTC
142183291,https://www.finn.no/realestate/homes/ad.html?f...,False,Tiltalende enebolig fra 2015. Solrik terrasse ...,Tiltalende og moderne enebolig med svært etter...,{},14 775 000 kr,"{""Omkostninger"": ""384\u00a0520 kr"", ""Totalpris...","Bjerkelundsveien 4A, 1358 Jar",2019-06-09 10:41:33,UTC
149524358,https://www.finn.no/realestate/homes/ad.html?f...,False,Stor og påkostet enebolig over 2 plan fra 2013...,DNB Eiendom v/Christer Langstrand har gleden a...,{},6 390 000 kr,"{""Omkostninger"": ""160\u00a0972 kr"", ""Totalpris...","Kantarellvegen 14, 2016 Frogner",2019-06-09 10:32:42,UTC


In [240]:
# Decode the JSON-encoded columns into Python objects. Only str cells are
# passed to json.loads, so re-running this cell after the columns already hold
# decoded objects leaves them untouched instead of raising TypeError.
for col in ['property_table', 'property_attrs']:
    real_estate_raw[col] = [json.loads(cell) if isinstance(cell, str) else cell
                            for cell in real_estate_raw[col]]

In [241]:
# Attribute labels (as they appear in the ads) that are looked up in each
# ad's property_attrs dict; the order here is the column order of the result.
all_attributes = [
    'Boligtype',
    'Eieform',
    'Soverom',
    'Bruksareal',
    'Primærrom',
    'Totalpris',
    'Omkostninger',
    'Byggeår',
    'Tomteareal',
    'Bruttoareal',
    'Formuesverdi',
    'Energimerking',
    'Felleskost/mnd.',
    'Etasje',
    'Rom',
    'Fellesformue',
    'Fellesgjeld',
    'Eierskifte-forsikring',
]

In [242]:
# Source attribute label -> English snake_case column name used after extraction.
all_attributes_map = {
    'Boligtype': 'property_type',
    'Eieform': 'ownership_type',
    'Soverom': 'num_bedrooms',
    'Bruksareal': 'usable_area',
    'Primærrom': 'primary_area',
    'Totalpris': 'price',
    'Omkostninger': 'brokerage_expenses',
    'Byggeår': 'construction_year',
    'Tomteareal': 'plot_area',
    'Bruttoareal': 'total_size',
    'Formuesverdi': 'wealth_value',
    'Energimerking': 'energy_character',
    'Felleskost/mnd.': 'common_expenses',
    'Etasje': 'floor',
    'Rom': 'num_rooms',
    'Fellesformue': 'common_wealth',
    'Fellesgjeld': 'common_debt',
    'Eierskifte-forsikring': 'ownership_change_insurance',
}

In [243]:
# Renamed columns whose raw text values are converted to numbers downstream.
numeric_columns = [
    'num_bedrooms',
    'usable_area',
    'primary_area',
    'price',
    'brokerage_expenses',
    'construction_year',
    'plot_area',
    'total_size',
    'wealth_value',
    'common_expenses',
    'floor',
    'num_rooms',
    'common_wealth',
    'common_debt',
]

In [244]:
def extract_property_attributes(data):
    """Flatten the ``property_attrs`` dicts of ``data`` into a tabular frame.

    One output row is built per (index label, attribute dict) pair of
    ``data.property_attrs``: the index label is stored under ``ad_id`` and
    every label in the module-level ``all_attributes`` list is looked up with
    ``dict.get`` (absent labels become ``None``).

    Returns a new DataFrame whose columns are renamed according to the
    module-level ``all_attributes_map``.
    """
    rows = [
        [ad_id] + [attrs.get(label) for label in all_attributes]
        for ad_id, attrs in data.property_attrs.items()
    ]
    flattened = pd.DataFrame(rows, columns=['ad_id', *all_attributes])
    return flattened.rename(columns=all_attributes_map)

In [245]:
def get_int_from_str(string):
    """Extract the number embedded in a text value and return it as a float.

    All decimal digits and dots in ``string`` are concatenated (in order) and
    the result is converted with ``float``; every other character (spaces,
    currency, units, signs) is ignored, e.g. ``'5 338 722 kr' -> 5338722.0``.

    Parameters
    ----------
    string : str, int, float or None
        Raw attribute value. Real numbers (not bool) are passed through as
        floats instead of being discarded.

    Returns
    -------
    float or None
        The parsed number, or ``None`` when the value is missing, NaN,
        contains no digits, is of an unsupported type, or does not form a
        valid number (e.g. ``'1.2.3'``).
    """
    if string is None:
        return None

    # Already-numeric values: keep them rather than failing in the regex.
    # bool is excluded because it is an int subclass but not a measurement.
    if isinstance(string, (int, float)) and not isinstance(string, bool):
        number = float(string)
        # NaN is the only float that is not equal to itself; treat as missing.
        return None if number != number else number

    if not isinstance(string, str):
        # Best effort: report the odd value and treat it as missing.
        print('unsupported value type', string)
        return None

    concat_string = ''.join(re.findall(r'[\d.]', string))
    if concat_string == '':
        return None

    try:
        return float(concat_string)
    except ValueError as e:
        # Digits/dots that do not form a number (e.g. '1.2.3' or a lone '.').
        print(e, string)
        return None

In [246]:
# Flatten the attributes of the listings whose new_building flag is False.
building_attributes = extract_property_attributes(
    real_estate_raw.loc[real_estate_raw['new_building'].eq(False)]
)

In [247]:
building_attributes.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance
0,130898040,Leilighet,Eier (Selveier),2,80 m²,77 m²,5 338 722 kr,138 722 kr,2008,2975 m² (eiet),86 m²,1 008 866 kr,D -rød,3 284 kr,2.0,3.0,55 319 kr,,
1,142183291,Enebolig,Eier (Selveier),6,296 m²,252 m²,15 159 520 kr,384 520 kr,2015,542 m² (eiet),335 m²,3 734 398 kr,B -rød,,,,,,Ja
2,149524358,Enebolig,Eier (Selveier),5,194 m²,194 m²,6 550 972 kr,160 972 kr,2013,835 m² (eiet),210 m²,1 474 695 kr,B -oransje,,2.0,,,,
3,149593602,Tomannsbolig,Eier (Selveier),5,235 m²,167 m²,7 190 150 kr,190 150 kr,1963,1497 m² (eiet),264 m²,1 644 817 kr,G -oransje,,,,,,
4,149473690,Enebolig,Eier (Selveier),3,146 m²,146 m²,10 456 222 kr,256 222 kr,1911,621 m² (eiet),165 m²,2 055 161 kr,G -rød,,3.0,4.0,,,


In [248]:
# Derive the parsed frame in one expression: each column named in
# numeric_columns is replaced by its get_int_from_str conversion, every other
# column is carried over unchanged, and building_attributes itself is not modified.
building_attributes_parsed = building_attributes.assign(**{
    col: [get_int_from_str(raw) for raw in building_attributes[col]]
    for col in numeric_columns
})

In [249]:
# True only where the raw attribute is exactly 'Ja'; any other value,
# including a missing one, becomes False.
# The flag is computed from the unparsed building_attributes frame rather than
# from the column being overwritten, so re-running this cell gives the same
# result instead of turning every True into False.
building_attributes_parsed['ownership_change_insurance'] = (
    building_attributes['ownership_change_insurance'].eq('Ja')
)

In [250]:
building_attributes_parsed.head()

Unnamed: 0,ad_id,property_type,ownership_type,num_bedrooms,usable_area,primary_area,price,brokerage_expenses,construction_year,plot_area,total_size,wealth_value,energy_character,common_expenses,floor,num_rooms,common_wealth,common_debt,ownership_change_insurance
0,130898040,Leilighet,Eier (Selveier),2.0,80.0,77.0,5338722.0,138722.0,2008.0,2975.0,86.0,1008866.0,D -rød,3284.0,2.0,3.0,55319.0,,False
1,142183291,Enebolig,Eier (Selveier),6.0,296.0,252.0,15159520.0,384520.0,2015.0,542.0,335.0,3734398.0,B -rød,,,,,,True
2,149524358,Enebolig,Eier (Selveier),5.0,194.0,194.0,6550972.0,160972.0,2013.0,835.0,210.0,1474695.0,B -oransje,,2.0,,,,False
3,149593602,Tomannsbolig,Eier (Selveier),5.0,235.0,167.0,7190150.0,190150.0,1963.0,1497.0,264.0,1644817.0,G -oransje,,,,,,False
4,149473690,Enebolig,Eier (Selveier),3.0,146.0,146.0,10456222.0,256222.0,1911.0,621.0,165.0,2055161.0,G -rød,,3.0,4.0,,,False


In [251]:
building_attributes_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1172 entries, 0 to 1171
Data columns (total 19 columns):
ad_id                         1172 non-null int64
property_type                 1172 non-null object
ownership_type                1172 non-null object
num_bedrooms                  1132 non-null float64
usable_area                   1166 non-null float64
primary_area                  1164 non-null float64
price                         1093 non-null float64
brokerage_expenses            1088 non-null float64
construction_year             1165 non-null float64
plot_area                     1140 non-null float64
total_size                    1066 non-null float64
wealth_value                  1034 non-null float64
energy_character              959 non-null object
common_expenses               850 non-null float64
floor                         826 non-null float64
num_rooms                     770 non-null float64
common_wealth                 639 non-null float64
common_debt        

In [252]:
# Remove rows that are identical across all columns and rebind the name to
# the de-duplicated frame.
building_attributes_parsed = building_attributes_parsed.drop_duplicates()

In [253]:
building_attributes_parsed.shape

(614, 19)

In [261]:
# Upload the parsed attributes to the BigQuery table
# housing_data.building_attributes in project hde-test-clean.
# NOTE(review): if_exists='append' adds rows on every execution, so re-running
# this cell (or the whole notebook) presumably duplicates rows in the
# destination table — confirm this is intended.
building_attributes_parsed.to_gbq(destination_table='housing_data.building_attributes',
                                  project_id='hde-test-clean',
                                  if_exists='append')