In [24]:
import re
import json
import pandas as pd
from google.cloud import bigquery
from sklearn.preprocessing import MinMaxScaler

In [2]:
%load_ext google.cloud.bigquery

In [3]:
%%bigquery
SELECT
    count(*)
FROM `hde-test-clean.housing_data.real_estate_raw`

Unnamed: 0,f0_
0,1281


In [4]:
query_client = bigquery.Client()

In [5]:
request = """
SELECT
    *
FROM
    `hde-test-clean.housing_data.real_estate_raw`
"""
real_estate_raw = query_client.query(request).to_dataframe()

In [6]:
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1281 entries, 0 to 1280
Data columns (total 11 columns):
ad_id                1281 non-null int64
ad_url               1281 non-null object
new_building         1281 non-null bool
short_description    1281 non-null object
full_description     1281 non-null object
property_table       1281 non-null object
main_price           1281 non-null object
property_attrs       1281 non-null object
address              1281 non-null object
datetime_viewed      1281 non-null datetime64[ns]
datetime_offset      1281 non-null object
dtypes: bool(1), datetime64[ns](1), int64(1), object(8)
memory usage: 101.4+ KB


In [7]:
real_estate_raw.set_index(keys='ad_id', drop=True, inplace=True)

In [8]:
real_estate_raw.head()

Unnamed: 0_level_0,ad_url,new_building,short_description,full_description,property_table,main_price,property_attrs,address,datetime_viewed,datetime_offset
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
130898040,https://www.finn.no/realestate/homes/ad.html?f...,False,Sentral og moderne 3-roms selveierleilighet me...,Kort om leiligheten:- MODERNE LEILIGHET FRA 20...,{},5 200 000 kr,"{""Omkostninger"": ""138\u00a0722 kr"", ""Totalpris...","Fredtunveien 8, 1386 Asker",2019-06-09 10:36:02,UTC
126578832,https://www.finn.no/realestate/newbuildings/ad...,True,SENNERUDTOPPEN - ARBEIDENE ER I GANG! 87 lyse ...,FremdriftArbeidene er i gang! Ta kontakt for t...,"{""Bolig"": [""1-207"", ""1-307"", ""1-308"", ""1-407"",...",2 790 000 kr7 890 000 kr,"{""Areal"": ""45\u00a0-\u00a0125 m\u00b2"", ""Sover...","Sennerudtoppen Bolig AS, 1920 Sørumsand",2019-06-09 10:30:23,UTC
135914307,https://www.finn.no/realestate/newbuildings/ad...,True,Dovrekvartalet - Spennende boligprosjekt for v...,Prosjektets beskrivelseUtbyggers visjonFor å s...,"{""Bolig"": [""101"", ""102"", ""103"", ""104"", ""105"", ...",1 600 000 kr4 900 000 kr,"{""Fellesgjeld"": ""1\u00a0600\u00a0000 \u2013 4\...","Landstadsgate 13 - D5, 2000 Lillestrøm",2019-06-09 10:39:41,UTC
142183291,https://www.finn.no/realestate/homes/ad.html?f...,False,Tiltalende enebolig fra 2015. Solrik terrasse ...,Tiltalende og moderne enebolig med svært etter...,{},14 775 000 kr,"{""Omkostninger"": ""384\u00a0520 kr"", ""Totalpris...","Bjerkelundsveien 4A, 1358 Jar",2019-06-09 10:41:33,UTC
149524358,https://www.finn.no/realestate/homes/ad.html?f...,False,Stor og påkostet enebolig over 2 plan fra 2013...,DNB Eiendom v/Christer Langstrand har gleden a...,{},6 390 000 kr,"{""Omkostninger"": ""160\u00a0972 kr"", ""Totalpris...","Kantarellvegen 14, 2016 Frogner",2019-06-09 10:32:42,UTC


In [9]:
for col in ['property_table', 'property_attrs']:
    real_estate_raw[col] = [json.loads(cell) for cell in real_estate_raw[col]]

In [10]:
headers = []
for id, row in real_estate_raw.property_attrs.items():
    for k in row.keys():
        headers.append(k)

In [11]:
headers = list(set(headers))

In [12]:
to_append = []
for code, row in real_estate_raw.property_attrs.items():
    all_properties = []
    for key in headers:
        all_properties.append(row.get(key))
    to_append.append(all_properties)
building_attributes = pd.DataFrame(to_append, columns=headers)

In [17]:
building_attributes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1281 entries, 0 to 1280
Data columns (total 42 columns):
Eierskifte-forsikring                       420 non-null object
Verditakst                                  31 non-null object
Sikringsordning                             4 non-null object
Omkostninger                                1165 non-null object
Formuesverdi                                1034 non-null object
Felleskost/mnd. etter avdragsfri periode    4 non-null object
Rom                                         782 non-null object
Ant etasjer                                 26 non-null object
Kommunale avg.                              326 non-null object
Utendørsareal                               1 non-null object
Lånetakst                                   10 non-null object
Fellesformue                                639 non-null object
Telefon                                     9 non-null object
Ferdigstillelse                             3 non-null object
FINN-kod

In [22]:
non_null_counts = building_attributes.notna().sum(axis=0)

In [42]:
scaler = MinMaxScaler()
non_null_counts_scaled = pd.Series(scaler.fit_transform(non_null_counts.values.reshape(-1, 1))[:,0], index=non_null_counts.index)

In [46]:
non_null_counts_scaled.sort_values(ascending=False, inplace=True)

In [56]:
non_null_counts_scaled

Boligtype                                   1.000000
Eieform                                     1.000000
FINN-kode                                   1.000000
Sist endret                                 1.000000
Referanse                                   0.982031
Soverom                                     0.968750
Bruksareal                                  0.927344
Primærrom                                   0.925781
Totalpris                                   0.913281
Omkostninger                                0.909375
Byggeår                                     0.909375
Tomteareal                                  0.889844
Bruttoareal                                 0.832031
Formuesverdi                                0.807031
Energimerking                               0.761719
Felleskost/mnd.                             0.675000
Etasje                                      0.650781
Rom                                         0.610156
Fellesformue                                0.

In [53]:
present_data_thresh = 0.5
top_n = 10

In [54]:
non_null_counts_scaled[non_null_counts_scaled > present_data_thresh]

Boligtype          1.000000
Eieform            1.000000
FINN-kode          1.000000
Sist endret        1.000000
Referanse          0.982031
Soverom            0.968750
Bruksareal         0.927344
Primærrom          0.925781
Totalpris          0.913281
Omkostninger       0.909375
Byggeår            0.909375
Tomteareal         0.889844
Bruttoareal        0.832031
Formuesverdi       0.807031
Energimerking      0.761719
Felleskost/mnd.    0.675000
Etasje             0.650781
Rom                0.610156
dtype: float64

In [55]:
non_null_counts_scaled[:top_n]

Boligtype       1.000000
Eieform         1.000000
FINN-kode       1.000000
Sist endret     1.000000
Referanse       0.982031
Soverom         0.968750
Bruksareal      0.927344
Primærrom       0.925781
Totalpris       0.913281
Omkostninger    0.909375
dtype: float64

In [57]:
non_null_counts_scaled[non_null_counts_scaled > 0.3].index

Index(['Boligtype', 'Eieform', 'FINN-kode', 'Sist endret', 'Referanse',
       'Soverom', 'Bruksareal', 'Primærrom', 'Totalpris', 'Omkostninger',
       'Byggeår', 'Tomteareal', 'Bruttoareal', 'Formuesverdi', 'Energimerking',
       'Felleskost/mnd.', 'Etasje', 'Rom', 'Fellesformue', 'Fellesgjeld',
       'Eierskifte-forsikring'],
      dtype='object')