In [None]:
import numpy as np 
import pandas as pd

import geopandas as gpd

import folium
from folium import Choropleth, Circle, Marker, Icon, Map, GeoJson
from folium.plugins import HeatMap, MarkerCluster

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)

We'll import three files:
* 'address' contains latitude and longitude values for the original houses_Madrid file.
* 'data' is the original houses_Madrid file.
* 'houses' contains the cleaned version of houses_Madrid file.

In [None]:
# Load the three input tables; in each one the first CSV column becomes the index.
address = pd.read_csv(
    '../input/address3/address3.csv',
    sep=',',
    header=0,
    index_col=0,
)
data = pd.read_csv(
    '../input/madrid-real-estate-market/houses_Madrid.csv',
    sep=',',
    header=0,
    index_col=0,
)
houses = pd.read_csv(
    '../input/housesclean/madrid_houses_clean.csv',
    sep=',',
    header=0,
    index_col=0,
)

In [None]:
# Quick sanity check: (rows, columns) of each of the three loaded tables.
print(address.shape, data.shape, houses.shape)

# Preparing the data

We merge 'address' and 'houses' by their 'id' columns keeping only the rows from 'address'.

In [None]:
# Left join on 'id': every row of 'address' is kept and the matching
# 'houses' columns are attached to it.
address_map = address.merge(houses, how='left', on='id')
address_map.head()

In [None]:
# Give the '_x'-suffixed coordinate columns back their plain names.
coordinate_names = {'latitude_x': 'latitude', 'longitude_x': 'longitude'}
address_map = address_map.rename(columns=coordinate_names)

Now, we merge the resulting file, 'address_map', with the original one, 'data' by their attribute 'id', keeping only the rows in 'address_map'. 

When a column is in both files, we assign a suffix '\_y' to the column in 'data'. 

In [None]:
# Left join on 'id': columns that exist in both frames keep their name on the
# 'address_map' side and get a '_y' suffix on the 'data' side.
geo_data = address_map.merge(
    data,
    how='left',
    on='id',
    suffixes=('', '_y'),
)
geo_data.head()

We eliminate the duplicated columns that we marked with '\_y'.

The final file, geo_data, has the rows with latitude and longitude coordinates from 'address', the cleaned columns from 'houses' and the rest of the columns from 'data'.

In [None]:
# Collect every column whose name ends in '_y' and remove them in one go.
duplicated_cols = geo_data.columns[geo_data.columns.str.contains('_y$')]
geo_data.drop(columns=duplicated_cols, inplace=True)
geo_data.shape

However, we have to do some additional cleaning of the columns from 'data', as we did in the notebook 'Madrid houses'.

First, we drop all the columns with only NaN.

In [None]:
# Discard the columns in which every single value is missing.
geo_data = geo_data.dropna(axis='columns', how='all')

In [None]:
# Look at the first raw 'neighborhood_id' value to see the text we have to parse.
geo_data.neighborhood_id.iloc[0]

The column 'neighborhood_id' contains several interesting pieces of data. In the previous notebook we extracted the neighborhood's number and district.

Here, we want the mean price by neighborhood to be able to compare it with houses' prices.

In [None]:
# Pull the decimal number that precedes ' €' out of the 'neighborhood_id' text
# and store it as a float column.
# The pattern is a raw string so that \d, \. and \s reach the regex engine
# untouched (in a normal string they are invalid escape sequences and trigger
# a warning on recent Python versions). expand=False returns a Series, which
# can be cast and assigned directly.
geo_data['mean_buy_price'] = (
    geo_data.neighborhood_id
    .str.extract(r'(\d+\.\d+)\s€', expand=False)
    .astype('float64')
)

Let's see the columns.

In [None]:
# Column names, dtypes and non-null counts of the merged table.
geo_data.info()

There are several columns we won't need.

In [None]:
# Columns that are not used by any of the maps below.
unused_columns = [
    'sq_mt_useful',
    'is_exact_address_hidden',
    'street_name',
    'street_number',
    'is_floor_under',
    'neighborhood_id',
    'operation',
    'is_rent_price_known',
    'is_buy_price_known',
    'house_type_id',
    'is_accessible',
    'is_parking_included_in_price',
    'parking_price',
]
geo_data.drop(columns=unused_columns, inplace=True)

In [None]:
# Number of non-missing values in each column.
geo_data.count()

We had eliminated several of these columns because they have too many missing values to fill. But here, we may use them to show houses' attributes.

In [None]:
# Preview the first rows of the table that feeds the maps.
geo_data.head()

# Creating the maps

<a href="https://python-visualization.github.io/folium/">Folium</a> is a library that allows us to manipulate data in python and visualize it on a map with the javascript library *leaflet.js*

We can select different tiles (Openstreetmap is by default), different types of markers, colors, several types of maps,...

In [None]:
# Empty base map (CartoDB Positron tiles) that the next cells add layers to.
map_1 = Map(
    location=[40.32, -3.69],
    tiles='cartodbpositron',
    zoom_start=10,
)

When we don't have many rows, we can show the markers directly. Let's see an example with houses that have a garden.

In [None]:
# Keep only the rows where 'has_garden' is filled in.
garden_known = geo_data['has_garden'].notna()
garden_houses = geo_data.loc[garden_known]
garden_houses.shape

We can show the address by clicking on the marker. We can change the color and add other features.

For example, let's divide this group between those with garden and pool and those without a pool.

In [None]:
# One marker per garden house: green 'ok' icon when it also has a pool,
# red 'exclamation' icon otherwise. Clicking a marker pops up the address.
for _, house in garden_houses.iterrows():
    # Explicit comparison with True so that missing values count as "no pool".
    if house.has_pool == True:
        marker_icon = Icon(color='green', icon='ok-sign')
    else:
        marker_icon = Icon(color='red', icon='exclamation-sign')
    Marker(
        [house.latitude, house.longitude],
        popup=house.raw_address,
        icon=marker_icon,
        tooltip='Click me!',
    ).add_to(map_1)
    

In [None]:
# Render the map with the garden-house markers added in the previous cell.
map_1

The ones with garden and pool are in green, while the ones without a pool are in red. Let's see if these houses are the detached ones.

In [None]:
# Rows whose 'house_type' code is 2.
casas = geo_data.loc[geo_data['house_type'].eq(2)]
casas.shape

In [None]:
# Overlay a heat map of the 'casas' coordinates on top of the garden markers.
casas_coords = casas[['latitude', 'longitude']]
heat_layer = HeatMap(data=casas_coords, radius=10)
heat_layer.add_to(map_1)

map_1

Yes! With a few exceptions these houses are the detached ones.

When we have many points to place in a map, it may be useful to cluster them together.

Let's select a group of houses under 180000€ that don't need any kind of repairs.

In [None]:
# Houses priced under 180000 that are explicitly flagged as not needing renewal.
under_budget = geo_data.buy_price < 180000
no_renewal_needed = geo_data.is_renewal_needed == False
cheapest_flats = geo_data[under_budget & no_renewal_needed]
cheapest_flats.shape[0]

We'll mark houses whose energy certificate value is over 3 with a green icon, and the rest with a red icon.

In [None]:
map_2 = Map(location=[40.32, -3.69], tiles='openstreetmap', zoom_start=13)

mc = MarkerCluster()

# One clustered marker per cheap flat: green when the energy certificate value
# is over 3, red otherwise (a missing value also ends up red). Hovering shows
# the price.
for _, flat in cheapest_flats.iterrows():
    icon_color = 'green' if flat.energy_certificate > 3 else 'red'
    flat_marker = Marker(
        [flat['latitude'], flat['longitude']],
        tooltip="Price: " + str(flat['buy_price']) + "€",
        icon=Icon(color=icon_color),
    )
    flat_marker.add_to(mc)

mc.add_to(map_2)

map_2

Let's see which of these houses have a price per area at or above the mean price of their neighborhood (green) and which are below it (red).

In [None]:
# Base map for the bubbles that compare each flat with its neighborhood mean.
# NOTE(review): recent folium releases appear to no longer ship the
# 'Stamen Toner' tileset under this name — confirm it still renders with the
# installed folium version.
map_3 = folium.Map(location=[40.32,-3.69], tiles='Stamen Toner', zoom_start=10)

def color_producer(mean_zone, mean_flat):
    """Pick the bubble color for a flat compared with its zone.

    Returns 'forestgreen' when ``mean_zone`` is less than or equal to
    ``mean_flat`` and 'darkred' in every other case (including when the
    comparison is false because a value is NaN).
    """
    return 'forestgreen' if mean_zone <= mean_flat else 'darkred'

# Add a bubble map to the base map
for _, flat in cheapest_flats.iterrows():
    zone_mean = flat['mean_buy_price']
    flat_price = flat['buy_price_by_area']
    # Difference between the flat and its neighborhood mean, as a percentage of
    # the mean: positive when the flat is above the mean, negative when below.
    percentage = round((flat_price - zone_mean) / zone_mean * 100, 2)
    bubble = Circle(
        location=[flat['latitude'], flat['longitude']],
        popup="id: {} {}%".format(int(flat.id), percentage),
        # The bubble grows with the size of the difference, whatever its sign.
        radius=abs(percentage) * 2.5,
        color=color_producer(zone_mean, flat_price),
    )
    bubble.add_to(map_3)

map_3


Clicking on a bubble shows the percentage difference from the neighborhood mean: a positive value means the house is over the mean, a negative value means it is under the mean.

The size of the bubble indicates how big the percentage is.

Let's see how houses are distributed all over Madrid. Color shows the type.

In [None]:
# Base map for the circles colored by house type.
# NOTE(review): recent folium releases appear to no longer ship the
# 'Stamen Toner' tileset under this name — confirm it still renders with the
# installed folium version.
map_4 = folium.Map(location=[40.32,-3.69], tiles='Stamen Toner', zoom_start=10)

def color_producer(house_type):
    """Return the circle color assigned to a house type code.

    Codes 1 to 4 have their own color; any other value (including a missing
    one) falls back to 'Fuchsia'.
    """
    colors_by_type = {
        1: 'forestgreen',  # flats
        2: 'darkred',      # houses
        3: 'darkblue',     # studios
        4: 'yellow',       # duplex
    }
    # Everything else is drawn as a top floor.
    return colors_by_type.get(house_type, 'Fuchsia')

# One circle per house: the color encodes the house type, the radius grows
# with the price, and the popup shows built size and price.
# (The per-row 'percentage' that used to be computed here was never used in
# this map, so it has been removed.)
for idx, row in geo_data.iterrows():
    Circle(
        location=[row['latitude'], row['longitude']],
        popup="size: {} price: {}".format(int(row.sq_mt_built), row.buy_price),
        radius=row.buy_price/10000,
        color=color_producer(row.house_type),
    ).add_to(map_4)

map_4


We can see that houses are to the north of Madrid, while flats cover the rest.

Size of the circles shows price, with houses and top floors as the most expensive.

# Adding more data

Let's say that we are moving to Madrid and we have children. There are many schools in Madrid with bilingual programs (normally Spanish/English). We can place those schools on a map to find houses near them.

We'll get the data from the official 'Comunidad de Madrid' website <a href="https://www.comunidad.madrid/gobierno/datos-abiertos">Datos abiertos</a>.

In [None]:
# The schools file is ';'-separated and Latin-1 encoded.
schools = pd.read_csv(
    '../input/opcioneslinguisticas/opciones_linguisticas.csv',
    sep=';',
    encoding='latin1',
)
schools.head()

Let's focus on high schools with code 42.

In [None]:
# Keep the rows with centre type code 42 and only the columns needed for the map.
is_high_school = schools.centro_tipo_codigo == 42
map_columns = [
    'centro_codigo',
    'centro_nombre',
    'direccion_coor_x',
    'direccion_coor_y',
    'bilingue_idioma',
]
schools = schools.loc[is_high_school, map_columns]
schools.shape

In [None]:
# Number of missing values in each remaining column.
schools.isnull().sum()

In [None]:
# Drop the schools with no value in 'bilingue_idioma' and renumber the rows.
# The result is reassigned instead of mutating in place: 'schools' was produced
# by filtering another frame, and in-place changes on such a frame can raise
# pandas' SettingWithCopyWarning.
schools = (
    schools
    .dropna(subset=['bilingue_idioma'])
    .reset_index(drop=True)
)
schools.shape

In [None]:
# Build point geometries from the x/y coordinate columns and declare their
# reference system as EPSG:32630.
# The CRS is passed as an authority string to the constructor; the older
# {'init': 'epsg:...'} dictionary form is deprecated in current
# geopandas/pyproj and emits a warning.
schools_map = gpd.GeoDataFrame(
    schools,
    geometry=gpd.points_from_xy(schools.direccion_coor_x, schools.direccion_coor_y),
    crs='EPSG:32630',
)
schools_map.head()

We'll create a 2 km radius around each high school.

In [None]:
# Buffer distance is expressed in the units of the frame's CRS
# (assumed to be metres here, so this is 2 km — confirm against the CRS).
buffer_radius = 2 * 1000
two_km_buffer = schools_map.geometry.buffer(buffer_radius)
two_km_buffer.head()

We change the reference and we calculate the coordinates in the new system.

In [None]:
# Reproject the school points to EPSG:4326 so they can be placed on the map.
schools_map2 = schools_map.to_crs('EPSG:4326')
schools_map2.head()

In [None]:
map_5 = Map(location=[40.32, -3.69], tiles='openstreetmap', zoom_start=10)

mc = MarkerCluster()

# Schools: green info markers, the school name shows on hover.
for _, school in schools_map2.iterrows():
    school_marker = Marker(
        [school.geometry.y, school.geometry.x],
        tooltip=school.centro_nombre,
        icon=Icon(color='green', icon='info-sign'),
    )
    mc.add_child(school_marker)

# Cheap flats: default markers, details show in the popup when clicked.
for _, flat in cheapest_flats.iterrows():
    flat_details = ("Price: {} Size: {} m² Rooms: {}").format(
        flat['buy_price'], flat['sq_mt_built'], flat['n_rooms']
    )
    mc.add_child(Marker([flat['latitude'], flat['longitude']], popup=flat_details))

mc.add_to(map_5)

# Draw the buffers around the schools, reprojected to EPSG:4326 like the points.
buffers_wgs84 = two_km_buffer.to_crs(epsg=4326)
GeoJson(buffers_wgs84).add_to(map_5)

map_5

We can do the same thing with covid19 statistics, plotting the cumulative incidence rate per municipality or district together with our houses.

In [None]:
# Read the shapefile and index it by the 'nombre' column.
madrid_map = gpd.read_file(
    '../input/municipios/municipios_y_distritos_madrid.shp',
    encoding='latin1',
).set_index('nombre')
madrid_map.head()

In [None]:
# ';'-separated file; its second column (position 1) becomes the index.
covid = pd.read_csv(
    '../input/covid19/covid19_2.csv',
    sep=';',
    header=0,
    index_col=1,
)
covid.head()

And, finally, we can plot all our houses on a covid19 map of Madrid and check how the incidence rate was spread as of March 9, 2021.

In [None]:
map_6 = Map(location=[40.32, -3.69], tiles='cartodbpositron', zoom_start=10)

# Shade each area of 'madrid_map' by its cumulative incidence rate; areas and
# rates are matched through the feature id.
incidence_layer = Choropleth(
    geo_data=madrid_map.geometry,
    data=covid['tasa_incidencia_acumulada_total'],
    key_on="feature.id",
    fill_color='BuPu',
    legend_name='Cumulative incidence rate in Madrid',
)
incidence_layer.add_to(map_6)

# Small green circle for every house; clicking it shows the address.
for _, house in geo_data.iterrows():
    house_circle = Circle(
        [house.latitude, house.longitude],
        popup=house.raw_address,
        radius=20,
        color='green',
    )
    house_circle.add_to(map_6)

map_6