In [1]:
import re
import json
import pandas as pd
from google.cloud import bigquery

In [2]:
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [3]:
%%bigquery
SELECT
    count(*)
FROM `hde-test-clean.housing_data.real_estate_raw`

Unnamed: 0,f0_
0,849


In [4]:
query_client = bigquery.Client()

In [5]:
request = """
SELECT
    *
FROM
    `hde-test-clean.housing_data.real_estate_raw`
"""
real_estate_raw = query_client.query(request).to_dataframe()

In [6]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 849 entries, 0 to 848
Data columns (total 11 columns):
ad_id                849 non-null int64
ad_url               849 non-null object
new_building         849 non-null bool
short_description    849 non-null object
full_description     849 non-null object
property_table       849 non-null object
main_price           849 non-null object
property_attrs       849 non-null object
address              849 non-null object
datetime_viewed      849 non-null datetime64[ns]
datetime_offset      849 non-null object
dtypes: bool(1), datetime64[ns](1), int64(1), object(8)
memory usage: 67.2+ KB


In [7]:
real_estate_raw.set_index(keys='ad_id', drop=True, inplace=True)

In [8]:
real_estate_raw.head()

Unnamed: 0_level_0,ad_url,new_building,short_description,full_description,property_table,main_price,property_attrs,address,datetime_viewed,datetime_offset
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
130898040,https://www.finn.no/realestate/homes/ad.html?f...,False,Sentral og moderne 3-roms selveierleilighet me...,Kort om leiligheten:- MODERNE LEILIGHET FRA 20...,{},5 200 000 kr,"{""Omkostninger"": ""138\u00a0722 kr"", ""Totalpris...","Fredtunveien 8, 1386 Asker",2019-06-09 10:36:02,UTC
126578832,https://www.finn.no/realestate/newbuildings/ad...,True,SENNERUDTOPPEN - ARBEIDENE ER I GANG! 87 lyse ...,FremdriftArbeidene er i gang! Ta kontakt for t...,"{""Bolig"": [""1-207"", ""1-307"", ""1-308"", ""1-407"",...",2 790 000 kr7 890 000 kr,"{""Areal"": ""45\u00a0-\u00a0125 m\u00b2"", ""Sover...","Sennerudtoppen Bolig AS, 1920 Sørumsand",2019-06-09 10:30:23,UTC
135914307,https://www.finn.no/realestate/newbuildings/ad...,True,Dovrekvartalet - Spennende boligprosjekt for v...,Prosjektets beskrivelseUtbyggers visjonFor å s...,"{""Bolig"": [""101"", ""102"", ""103"", ""104"", ""105"", ...",1 600 000 kr4 900 000 kr,"{""Fellesgjeld"": ""1\u00a0600\u00a0000 \u2013 4\...","Landstadsgate 13 - D5, 2000 Lillestrøm",2019-06-09 10:39:41,UTC
142183291,https://www.finn.no/realestate/homes/ad.html?f...,False,Tiltalende enebolig fra 2015. Solrik terrasse ...,Tiltalende og moderne enebolig med svært etter...,{},14 775 000 kr,"{""Omkostninger"": ""384\u00a0520 kr"", ""Totalpris...","Bjerkelundsveien 4A, 1358 Jar",2019-06-09 10:41:33,UTC
149524358,https://www.finn.no/realestate/homes/ad.html?f...,False,Stor og påkostet enebolig over 2 plan fra 2013...,DNB Eiendom v/Christer Langstrand har gleden a...,{},6 390 000 kr,"{""Omkostninger"": ""160\u00a0972 kr"", ""Totalpris...","Kantarellvegen 14, 2016 Frogner",2019-06-09 10:32:42,UTC


In [9]:
for col in ['property_table', 'property_attrs']:
    real_estate_raw[col] = [json.loads(cell) for cell in real_estate_raw[col]]

In [10]:
def expanand_property_tables(new_buildings):
    cols = new_buildings.iloc[0]['property_table'].keys()
    col_map = {'Bolig': 'apt_id',
               'Soverom': 'num_bedrooms',
               'Etg': 'floor',
               'P-rom': 'primary_size',
               'BRA': 'total_size',
               'Pris': 'price'}
    property_tables = pd.DataFrame(columns=cols)
    for code, row in new_buildings['property_table'].iteritems():
        all_properties = [arr for arr in row.values()]
        to_append = dict(zip(cols, all_properties))
        frame = pd.DataFrame(to_append, columns=cols)
        frame['ad_id'] = code
        property_tables = property_tables.append(frame, ignore_index=True, sort=False)
    property_tables.rename(columns=col_map, inplace=True)
    return property_tables

In [11]:
sub_properties = expanand_property_tables(real_estate_raw[(real_estate_raw.new_building == True) & (real_estate_raw.property_table)])

In [12]:
sub_properties.head()

Unnamed: 0,num_bedrooms,primary_size,floor,total_size,price,apt_id,ad_id
0,2,87 m²,2.0,91 m²,4 990 000 kr,1-207,126578832.0
1,2,87 m²,3.0,91 m²,5 090 000 kr,1-307,126578832.0
2,3,70 m²,3.0,74 m²,4 290 000 kr,1-308,126578832.0
3,2,87 m²,3.0,91 m²,5 190 000 kr,1-407,126578832.0
4,2,56 m²,5.0,60 m²,3 490 000 kr,1-506,126578832.0


In [13]:
def get_int_from_str(string):
    try:
        concat_string = ''.join(re.findall(r'\d', string))
        if concat_string != '':
            return int(concat_string)
    except Exception as e:
        print(e)
        pass

In [14]:
sub_properties_parsed = sub_properties.copy()
for col in ['primary_size', 'floor', 'price', 'num_bedrooms', 'total_size']:
    sub_properties_parsed[col] = [get_int_from_str(s) for s in sub_properties_parsed[col]]

expected string or bytes-like object


In [15]:
sub_properties_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519 entries, 0 to 518
Data columns (total 7 columns):
num_bedrooms    519 non-null int64
primary_size    443 non-null float64
floor           451 non-null float64
total_size      516 non-null float64
price           518 non-null float64
apt_id          519 non-null object
ad_id           519 non-null float64
dtypes: float64(5), int64(1), object(1)
memory usage: 28.5+ KB


In [16]:
sub_properties_parsed.head()

Unnamed: 0,num_bedrooms,primary_size,floor,total_size,price,apt_id,ad_id
0,2,87.0,2.0,91.0,4990000.0,1-207,126578832.0
1,2,87.0,3.0,91.0,5090000.0,1-307,126578832.0
2,3,70.0,3.0,74.0,4290000.0,1-308,126578832.0
3,2,87.0,3.0,91.0,5190000.0,1-407,126578832.0
4,2,56.0,5.0,60.0,3490000.0,1-506,126578832.0
