In [1]:
import re
import os
import yaml
import json
import pandas as pd
from google.cloud import bigquery
import google.api_core.exceptions as google_exceptions

In [2]:
config  = yaml.safe_load(open('config.yaml', 'r'))

In [3]:
query_dir = 'queries'

In [4]:
queries = {}
for query_file in (os.listdir(query_dir)):
    with open(os.path.join(query_dir, query_file), 'r') as query:
        queries[query_file] = query.read()

In [5]:
query_client = bigquery.Client()

In [6]:
def exists_table(table_reference, client):
    try:
        client.get_table(table_reference)
        return True
    except google_exceptions.NotFound:
        return False

In [7]:
if exists_table(f'{config["project_id"]}.{config["dataset"]}.expanded_sub_properties', query_client):
    request = queries['all_new_buildings_that_have_not_been_expanded.sql']
else:
    request = queries['all_new_buildings.sql']

In [8]:
real_estate_raw = query_client.query(request).to_dataframe()

In [9]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 5 columns):
ad_id             132 non-null int64
new_building      132 non-null bool
property_table    132 non-null object
property_attrs    132 non-null object
address           132 non-null object
dtypes: bool(1), int64(1), object(3)
memory usage: 4.3+ KB


In [10]:
real_estate_raw.set_index(keys='ad_id', drop=True, inplace=True)

In [11]:
real_estate_raw.head()

Unnamed: 0_level_0,new_building,property_table,property_attrs,address
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
78653360,True,"{""Bolig"": [""A04-1"", ""A05-1"", ""B05-1"", ""C04-1"",...","{""Omkostninger"": ""20\u00a0406 \u2013 22\u00a01...","Strandvegen 1, 2005 Rælingen"
78866228,True,"{""Bolig"": [""C2201"", ""D1202"", ""D1301"", ""D2401"",...","{""Fellesgjeld"": ""0 kr"", ""Areal"": ""70\u00a0-\u0...","FLÅTESTADVEIEN 3, 1415 Oppegård"
78866228,True,"{""Bolig"": [""C2201"", ""D1202"", ""D1301"", ""D2401"",...","{""Fellesgjeld"": ""0 kr"", ""Areal"": ""70\u00a0-\u0...","FLÅTESTADVEIEN 3, 1415 Oppegård"
82173385,True,"{""Bolig"": [""2012"", ""2015"", ""2016"", ""2017"", ""30...","{""Fellesgjeld"": ""3\u00a0200\u00a0000 \u2013 4\...","Røakollen - Aslakveien 20 - hus B, 0753 Oslo"
84047772,True,"{""Bolig"": [""1001"", ""3006"", ""4006"", ""4008"", ""50...","{""Fellesgjeld"": ""3\u00a0575\u00a0000 \u2013 4\...","Røakollen - Aslakveien 20 - Hus A, 0753 Oslo"


In [12]:
for col in ['property_table', 'property_attrs']:
    real_estate_raw[col] = [json.loads(cell) for cell in real_estate_raw[col]]

In [13]:
def expanand_property_tables(new_buildings):
    cols = new_buildings.iloc[0]['property_table'].keys()
    col_map = {'Bolig': 'apt_id',
               'Soverom': 'num_bedrooms',
               'Etg': 'floor',
               'P-rom': 'primary_size',
               'BRA': 'total_size',
               'Totalpris': 'price',
               'Pris': 'price'}
    property_tables = pd.DataFrame(columns=cols)
    for code, row in new_buildings['property_table'].iteritems():
        all_properties = [arr for arr in row.values()]
        to_append = dict(zip(cols, all_properties))
        frame = pd.DataFrame(to_append, columns=cols)
        frame['ad_id'] = code
        property_tables = property_tables.append(frame, ignore_index=True, sort=False)
    property_tables.rename(columns=col_map, inplace=True)
    return property_tables

In [14]:
sub_properties = expanand_property_tables(real_estate_raw[(real_estate_raw.new_building == True) & (real_estate_raw.property_table)])

In [15]:
sub_properties.head()

Unnamed: 0,apt_id,num_bedrooms,floor,primary_size,total_size,price,ad_id
0,A04-1,3,4.0,83 m²,88 m²,5 810 406 kr,78653360.0
1,A05-1,3,5.0,83 m²,88 m²,5 910 406 kr,78653360.0
2,B05-1,3,5.0,92 m²,96 m²,5 912 150 kr,78653360.0
3,C04-1,3,4.0,91 m²,95 m²,5 811 932 kr,78653360.0
4,C05-1,3,5.0,91 m²,95 m²,5 911 932 kr,78653360.0


In [16]:
def get_int_from_str(string):
    try:
        concat_string = ''.join(re.findall(r'[\d.]', string))
        if concat_string != '':
            return float(concat_string)
    except Exception as e:
        print(e, string)
        pass

In [17]:
sub_properties_parsed = sub_properties.copy()
for col in ['primary_size', 'floor', 'price', 'num_bedrooms', 'total_size']:
    sub_properties_parsed[col] = [get_int_from_str(s) for s in sub_properties_parsed[col]]

expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like object None
expected string or bytes-like obje

In [18]:
sub_properties_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1238 entries, 0 to 1237
Data columns (total 7 columns):
apt_id          1238 non-null object
num_bedrooms    1167 non-null float64
floor           1028 non-null float64
primary_size    1130 non-null float64
total_size      1194 non-null float64
price           1194 non-null float64
ad_id           1238 non-null float64
dtypes: float64(6), object(1)
memory usage: 67.8+ KB


In [19]:
sub_properties_parsed.head()

Unnamed: 0,apt_id,num_bedrooms,floor,primary_size,total_size,price,ad_id
0,A04-1,3.0,4.0,83.0,88.0,5810406.0,78653360.0
1,A05-1,3.0,5.0,83.0,88.0,5910406.0,78653360.0
2,B05-1,3.0,5.0,92.0,96.0,5912150.0,78653360.0
3,C04-1,3.0,4.0,91.0,95.0,5811932.0,78653360.0
4,C05-1,3.0,5.0,91.0,95.0,5911932.0,78653360.0


In [20]:
sub_properties_parsed.drop_duplicates(inplace=True)

In [21]:
sub_properties_parsed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1138 entries, 0 to 1237
Data columns (total 7 columns):
apt_id          1138 non-null object
num_bedrooms    1068 non-null float64
floor           950 non-null float64
primary_size    1031 non-null float64
total_size      1094 non-null float64
price           1094 non-null float64
ad_id           1138 non-null float64
dtypes: float64(6), object(1)
memory usage: 71.1+ KB


In [22]:
sub_properties_parsed.to_gbq(destination_table='housing_data.expanded_sub_properties',
                             project_id='hde-test-clean',
                             if_exists='append')