In [1]:
import re
import json
import pandas as pd
from google.cloud import bigquery

In [2]:
%load_ext google.cloud.bigquery

In [56]:
%%bigquery
SELECT
    count(*)
FROM `hde-test-clean.housing_data.real_estate_raw`

Unnamed: 0,f0_
0,1245


In [4]:
query_client = bigquery.Client()

In [5]:
request = """
SELECT
    *
FROM
    `hde-test-clean.housing_data.real_estate_raw`
"""
real_estate_raw = query_client.query(request).to_dataframe()

In [6]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1245 entries, 0 to 1244
Data columns (total 11 columns):
ad_id                1245 non-null int64
ad_url               1245 non-null object
new_building         1245 non-null bool
short_description    1245 non-null object
full_description     1245 non-null object
property_table       1245 non-null object
main_price           1245 non-null object
property_attrs       1245 non-null object
address              1245 non-null object
datetime_viewed      1245 non-null datetime64[ns]
datetime_offset      1245 non-null object
dtypes: bool(1), datetime64[ns](1), int64(1), object(8)
memory usage: 98.6+ KB


In [7]:
real_estate_raw.set_index(keys='ad_id', drop=True, inplace=True)

In [8]:
real_estate_raw.head()

Unnamed: 0_level_0,ad_url,new_building,short_description,full_description,property_table,main_price,property_attrs,address,datetime_viewed,datetime_offset
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
149269674,https://www.finn.no/realestate/homes/ad.html?f...,False,Ny og delikat 3-roms selveierleil. midt i hjer...,Velkommen til Kantorveien 4 B - moderne og fun...,{},7 850 000 kr,"{""Omkostninger"": ""27\u00a0042 kr"", ""Totalpris""...","Kantorveien 4B, 1410 Kolbotn",2019-06-11 12:38:54,UTC
149797548,https://www.finn.no/realestate/homes/ad.html?f...,False,Arealeffektiv og sentral 2-roms selveier leili...,Aktiv Eiendomsmegling ved Celine B. S. Holm ha...,{},1 690 000 kr,"{""Fellesgjeld"": ""32\u00a0000 kr"", ""Omkostninge...","Herkulesvegen 77D, 2165 Hvam",2019-06-11 12:41:27,UTC
149814042,https://www.finn.no/realestate/homes/ad.html?f...,False,Gjennomgående 3-roms med balkong l Idyllisk ut...,Velkommen til Ammerudveien 51! Leiligheten lig...,{},2 600 000 kr,"{""Fellesgjeld"": ""172\u00a0000 kr"", ""Omkostning...","Ammerudveien 51, 0958 Oslo",2019-06-11 12:40:53,UTC
149828158,https://www.finn.no/realestate/homes/ad.html?f...,False,Stor og vakker leilighet - to balkonger - stor...,Velkommen til denne fantastiske leiligheten!De...,{},8 000 000 kr,"{""Fellesgjeld"": ""99\u00a0566 kr"", ""Omkostninge...","Waldemar Thranes gate 66 D, 0173 Oslo",2019-06-11 12:40:32,UTC
147199162,https://www.finn.no/realestate/homes/ad.html?f...,False,Unikt renoveringsobjekt. 7-roms ene- el. 2-man...,Huset har til nå fylt rollen som generasjonsbo...,{},8 700 000 kr,"{""Omkostninger"": ""229\u00a0822 kr"", ""Totalpris...","Lalienveien 23, 1453 Bjørnemyr",2019-06-11 12:39:13,UTC


In [9]:
for col in ['property_table', 'property_attrs']:
    real_estate_raw[col] = [json.loads(cell) for cell in real_estate_raw[col]]

In [17]:
def expanand_property_tables(new_buildings):
    cols = new_buildings.iloc[0]['property_table'].keys()
    col_map = {'Bolig': 'apt_id',
               'Soverom': 'num_bedrooms',
               'Etg': 'floor',
               'P-rom': 'primary_size',
               'BRA': 'total_size',
               'Totalpris': 'price'}
    property_tables = pd.DataFrame(columns=cols)
    for code, row in new_buildings['property_table'].iteritems():
        all_properties = [arr for arr in row.values()]
        to_append = dict(zip(cols, all_properties))
        frame = pd.DataFrame(to_append, columns=cols)
        frame['ad_id'] = code
        property_tables = property_tables.append(frame, ignore_index=True, sort=False)
    property_tables.rename(columns=col_map, inplace=True)
    return property_tables

In [42]:
sub_properties = expanand_property_tables(real_estate_raw[(real_estate_raw.new_building == True) & (real_estate_raw.property_table)])

In [43]:
sub_properties.head()

Unnamed: 0,apt_id,num_bedrooms,floor,primary_size,total_size,price,ad_id
0,B1,3,1.0,140 m²,143 m²,15 290 922 kr,126757425.0
1,H1,3,1.0,156 m²,156 m²,22 102 622 kr,126757425.0
2,H2,3,2.0,159 m²,159 m²,23 104 572 kr,126757425.0
3,03,3,1.0,178 m²,192 m²,11 475 000 kr,96916306.0
4,04,4,2.0,170 m²,180 m²,11 375 000 kr,96916306.0


In [44]:
def get_int_from_str(string):
    try:
        concat_string = ''.join(re.findall(r'[\d.]', string))
        if concat_string != '':
            return float(concat_string)
    except Exception as e:
        print(e, string)
        pass

In [45]:
sub_properties_parsed = sub_properties.copy()
for col in ['primary_size', 'floor', 'price', 'num_bedrooms', 'total_size']:
    sub_properties_parsed[col] = [get_int_from_str(s) for s in sub_properties_parsed[col]]

expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None


In [46]:
sub_properties_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1091 entries, 0 to 1090
Data columns (total 7 columns):
apt_id          1091 non-null object
num_bedrooms    1089 non-null float64
floor           974 non-null float64
primary_size    791 non-null float64
total_size      1088 non-null float64
price           1086 non-null float64
ad_id           1091 non-null float64
dtypes: float64(6), object(1)
memory usage: 59.7+ KB


In [47]:
sub_properties_parsed.head()

Unnamed: 0,apt_id,num_bedrooms,floor,primary_size,total_size,price,ad_id
0,B1,3.0,1.0,140.0,143.0,15290922.0,126757425.0
1,H1,3.0,1.0,156.0,156.0,22102622.0,126757425.0
2,H2,3.0,2.0,159.0,159.0,23104572.0,126757425.0
3,03,3.0,1.0,178.0,192.0,11475000.0,96916306.0
4,04,4.0,2.0,170.0,180.0,11375000.0,96916306.0


In [48]:
sub_properties_parsed.drop_duplicates(subset=['apt_id', 'num_bedrooms', 'floor', 'primary_size', 'total_size', 'price'], inplace=True)

In [49]:
sub_properties_parsed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 613 entries, 0 to 1088
Data columns (total 7 columns):
apt_id          613 non-null object
num_bedrooms    611 non-null float64
floor           538 non-null float64
primary_size    469 non-null float64
total_size      612 non-null float64
price           610 non-null float64
ad_id           613 non-null float64
dtypes: float64(6), object(1)
memory usage: 38.3+ KB


In [58]:
expanded_sub_properties_schema = [{'name': 'apt_id', 'type': 'STRING'},
                                  {'name': 'num_bedrooms', 'type': 'INTEGER'},
                                  {'name': 'floor', 'type': 'INTEGER'},
                                  {'name': 'primary_size', 'type': 'FLOAT'},
                                  {'name': 'total_size', 'type': 'FLOAT'},
                                  {'name': 'price', 'type': 'FLOAT'},
                                  {'name': 'ad_id', 'type': 'INTEGER'},]

In [59]:
sub_properties_parsed.to_gbq(destination_table='housing_data.expanded_sub_properties',
                             project_id='hde-test-clean',
                             table_schema=expanded_sub_properties_schema,
                             if_exists='append')