In [22]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium import plugins

In [25]:
from sklearn.preprocessing import MinMaxScaler

In [2]:
#set the visual style
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 10000)
from IPython.display import display, HTML
%matplotlib inline
sns.set(style='dark')
plt.rcParams['figure.figsize'] = [14, 10]

In [3]:
# Directory (relative to the working directory) whose files are read as queries below.
query_dir = 'queries'

In [4]:
# Read every file in `query_dir` into a dict: file name -> file contents.
queries = {}
# sorted() makes the load order deterministic; os.listdir order is arbitrary.
for query_file in sorted(os.listdir(query_dir)):
    query_path = os.path.join(query_dir, query_file)
    # os.listdir also returns sub-directories (e.g. `.ipynb_checkpoints`);
    # open() on a directory raises, so only regular files are read.
    if not os.path.isfile(query_path):
        continue
    # Explicit encoding so non-ASCII characters decode the same on every platform.
    with open(query_path, 'r', encoding='utf-8') as query:
        queries[query_file] = query.read()

In [5]:
# BigQuery client created with no arguments, so project and credentials
# come from the library's defaults for the current environment.
query_client = bigquery.Client()

In [6]:
# Text of the query used for this analysis; raises KeyError if that file was not loaded.
request = queries['combined_analysis_data.sql']

In [7]:
# Run the query and load its result into a pandas DataFrame.
data = query_client.query(request).to_dataframe()

In [8]:
energy_cols = ['energy_letter', 'energy_color']


def _split_energy_label(label):
    """Split an energy label on its first '-' into [first_part, second_part].

    Presumably labels look like '<letter> - <color>' (inferred from the target
    column names; confirm against the query output).

    Returns [None, None] for anything that is not a string (None, NaN), and
    [label, None] when the label contains no '-'. Both parts are stripped of
    surrounding whitespace.
    """
    if not isinstance(label, str):
        return [None, None]
    # Split once only, so a '-' inside the second part cannot create a third column.
    parts = [part.strip() for part in label.split('-', 1)]
    return parts + [None] * (len(energy_cols) - len(parts))


energy_res = [_split_energy_label(char) for char in data.energy_character]

# Reuse data's index: assignment aligns on index labels, so a fresh RangeIndex
# would silently misplace values if `data` ever has a non-default index.
energy_frame = pd.DataFrame(energy_res, columns=energy_cols, index=data.index)
for col in energy_cols:
    data[col] = energy_frame[col]

In [26]:
# Price per square metre of primary size.
# A zero primary_size would give +/-inf, which is not null (so it passes the
# notnull() filter used for the map) and is rejected by MinMaxScaler.
# Non-positive sizes are therefore masked to NaN before dividing.
data['price_per_sq_m'] = data.price / data.primary_size.where(data.primary_size > 0)

In [33]:
# Min-max scale price per square metre into a new column.
# The scaler takes 2-D input, hence the single-column frame selection.
scaler = MinMaxScaler()
data['price_per_sq_m_scaled'] = scaler.fit_transform(data[['price_per_sq_m']])

In [13]:
# data.set_index(['ad_id', 'apt_id'], drop=True)

In [34]:
# Preview the first rows, including the derived columns added above.
data.head()

Unnamed: 0,ad_id,apt_id,new_building,num_bedrooms,floor,primary_size,total_size,price,property_type,ownership_type,construction_year,energy_character,common_expenses,common_wealth,common_debt,time_s,lat,lng,address,short_description,full_description,energy_letter,energy_color,price_per_sq_m,price_per_sq_m_scaled
0,85287718,120,True,2.0,1.0,48.0,59.0,4150000.0,Leilighet,Eier (Selveier),2019.0,,,,,1262.0,59.916908,10.801887,"Innspurten 6, 06...",Hovinenga Hus C ...,Ansvarlig megler...,,,86458.333333,0.462035
1,85287718,143,True,2.0,2.0,56.0,61.0,4350000.0,Leilighet,Eier (Selveier),2019.0,,,,,1262.0,59.916908,10.801887,"Innspurten 6, 06...",Hovinenga Hus C ...,Ansvarlig megler...,,,77678.571429,0.40765
2,85287718,183,True,2.0,5.0,51.0,59.0,4350000.0,Leilighet,Eier (Selveier),2019.0,,,,,1262.0,59.916908,10.801887,"Innspurten 6, 06...",Hovinenga Hus C ...,Ansvarlig megler...,,,85294.117647,0.454823
3,85287718,187,True,2.0,5.0,48.0,59.0,4350000.0,Leilighet,Eier (Selveier),2019.0,,,,,1262.0,59.916908,10.801887,"Innspurten 6, 06...",Hovinenga Hus C ...,Ansvarlig megler...,,,90625.0,0.487844
4,85287718,204,True,3.0,6.0,86.0,92.0,8495000.0,Leilighet,Eier (Selveier),2019.0,,,,,1262.0,59.916908,10.801887,"Innspurten 6, 06...",Hovinenga Hus C ...,Ansvarlig megler...,,,98779.069767,0.538354


# Geographic Distribution

In [36]:
sample_size = 1300
sample_seed = 42  # fixed seed so the same rows, and therefore the same map, come out on every run

# A row needs both coordinates and a price per square metre to be drawn.
plottable = data[data.lat.notnull() & data.lng.notnull() & data.price_per_sq_m.notnull()]
# DataFrame.sample raises when asked for more rows than exist, so cap the request.
to_plot = plottable.sample(n=min(sample_size, len(plottable)), random_state=sample_seed)

m = folium.Map([59.9116, 10.7545], zoom_start=11)
# mark each sampled property as a small point
for lat, lng in zip(to_plot['lat'], to_plot['lng']):
    folium.CircleMarker([lat, lng],
                        radius=1,
                        # readable text instead of a raw (lat, lng) tuple
                        popup='{:.5f}, {:.5f}'.format(lat, lng),
                        fill_color="#3db7e4",
                       ).add_to(m)
# (n, 3) array of [lat, lng, weight] rows; the scaled price is the heatmap weight
properties_array = to_plot[['lat', 'lng', 'price_per_sq_m_scaled']].values

# overlay the heatmap on the same map, then display it as the cell output
m.add_child(plugins.HeatMap(properties_array, radius=15))
m