In [None]:
import datetime as dt
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.plotting import plot_confusion_matrix
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px

In [None]:
# Load the apartment rental offers CSV into a data frame
# Source: https://www.kaggle.com/corrieaar/apartment-rental-offers-in-germany
immo_data = pd.read_csv("../input/apartment-rental-offers-in-germany/immo_data.csv")

In [None]:
# Preview the first 10 rows of the raw data
immo_data.head(10)

In [None]:
# Overview of every column in the data set together with its count of non-null values
immo_data.info()

In [None]:
# Distinct values of the regio1 column, i.e. the German states present in the data set
immo_data["regio1"].unique()

In [None]:
# Give the three regional columns descriptive names, then order the rows by
# state, city and locality and renumber them from 0
immo_data = (
    immo_data
    .rename(columns = {"regio1": "state", "regio2": "city", "regio3": "locality_district"})
    .sort_values(by = ["state", "city", "locality_district"])
    .reset_index(drop = True)
)

# Preview of the most relevant columns for the first 20 rows
immo_data.head(20)[["state", "city", "locality_district", "totalRent", "livingSpace","typeOfFlat", "noRooms", "floor"]]

In [None]:
# Keep only the offers whose state is Berlin (rows stay in their sorted order)
# and renumber them from 0
immo_data_berlin = immo_data.loc[immo_data["state"].eq("Berlin")].reset_index(drop = True)

immo_data_berlin.loc[:, ["city", "locality_district", "totalRent", "livingSpace","typeOfFlat", "noRooms", "floor"]]

In [None]:
# Discard the offers that have no totalRent value and renumber the remaining rows from 0
immo_data_berlin = immo_data_berlin.dropna(subset = ["totalRent"]).reset_index(drop = True)

immo_data_berlin.loc[:, ["city", "totalRent"]]

In [None]:
# Graphing living space against total rent per locality_district in Berlin with seaborn.
# Both outlier cut-offs (livingSpace < 200, totalRent < 2500) are applied to the same
# rows up front, so x, y and hue all come from one consistently filtered frame instead
# of relying on index alignment between two differently masked Series.
berlin_without_outliers = immo_data_berlin[
    (immo_data_berlin.livingSpace < 200) & (immo_data_berlin.totalRent < 2500)
]

plt.figure(figsize = (14,10))

sns.scatterplot(data = berlin_without_outliers,
                x = "livingSpace",
                y = "totalRent",
                hue = "locality_district")

plt.title("Total rent against living space per locality in Berlin")

# Setting the legend with the localities to the right of the axes
# (trailing semicolon hides the Legend repr in the cell output)
plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad = 0.);

# This graph doesn't provide much relevant info

In [None]:
# One-column data frame holding the mean totalRent of every locality_district,
# ordered from the lowest to the highest mean
immo_data_berlin_mean = (
    immo_data_berlin
    .groupby("locality_district")["totalRent"]
    .mean()
    .to_frame()
    .sort_values(by = "totalRent")
)

immo_data_berlin_mean

In [None]:
# Bar chart (seaborn) of the mean total rent of each locality_district,
# in the ascending order of immo_data_berlin_mean
plt.figure(figsize = (14, 10))

sns.barplot(data = immo_data_berlin_mean.reset_index(), x = "locality_district", y = "totalRent")

# Locality names are long, so the tick labels are drawn vertically
plt.xticks(rotation = -90)

In [None]:
# Distinct locality_district values among the Berlin offers (79 according to the original note).
# Each value has the form <locality>_<district>; for some of them the _district suffix is incorrect.
immo_data_berlin.locality_district.unique()

In [None]:
# Associating each locality with its respective district in Berlin for geojson mapping purposes.
# The lookup is a dict applied with Series.map, so every row receives a value: a
# locality_district that is missing from the table becomes NaN instead of being skipped
# (skipping would make the result shorter than the data frame and fail on assignment).
localities_by_district = {
    "Charlottenburg-Wilmersdorf": ["Charlottenburg_Charlottenburg", "Wilmersdorf_Wilmersdorf", "Schmargendorf_Wilmersdorf", "Grunewald_Wilmersdorf"],
    "Friedrichshain-Kreuzberg": ["Friedrichshain_Friedrichshain", "Kreuzberg_Kreuzberg"],
    "Lichtenberg": ["Friedrichsfelde_Lichtenberg", "Karlshorst_Lichtenberg", "Lichtenberg_Lichtenberg", "Falkenberg_Hohenschönhausen", "Malchow_Hohenschönhausen", "Neu_Hohenschönhausen_Hohenschönhausen", "Alt_Hohenschönhausen_Hohenschönhausen", "Rummelsburg_Lichtenberg"],
    "Marzahn-Hellersdorf": ["Marzahn_Marzahn", "Biesdorf_Marzahn", "Kaulsdorf_Hellersdorf", "Mahlsdorf_Hellersdorf", "Hellersdorf_Hellersdorf"],
    "Mitte": ["Mitte_Mitte", "Tiergarten_Tiergarten", "Wedding_Wedding"],
    "Neukölln": ["Neukölln_Neukölln", "Britz_Neukölln", "Buckow_Neukölln", "Rudow_Neukölln"],
    "Pankow": ["Prenzlauer_Berg_Prenzlauer_Berg", "Weißensee_Weißensee", "Blankenburg_Weißensee", "Heinersdorf_Weißensee", "Karow_Weißensee", "Pankow_Pankow", "Buch_Pankow", "Französisch_Buchholz_Pankow", "Niederschönhausen_Pankow", "Rosenthal_Pankow"],
    "Reinickendorf": ["Reinickendorf_Reinickendorf", "Tegel_Reinickendorf", "Konradshöhe_Reinickendorf", "Heiligensee_Reinickendorf", "Frohnau_Reinickendorf", "Hermsdorf_Reinickendorf", "Waidmannslust_Reinickendorf", "Lübars_Reinickendorf", "Wittenau_Reinickendorf"],
    "Spandau": ["Spandau_Spandau", "Haselhorst_Spandau", "Siemensstadt_Spandau", "Staaken_Spandau", "Gatow_Spandau", "Kladow_Spandau"],
    "Steglitz-Zehlendorf": ["Steglitz_Steglitz", "Lichterfelde_Steglitz", "Lankwitz_Steglitz", "Zehlendorf_Zehlendorf", "Dahlem_Zehlendorf", "Nikolassee_Zehlendorf", "Wannsee_Zehlendorf"],
    "Tempelhof-Schöneberg": ["Schöneberg_Schöneberg", "Friedenau_Schöneberg", "Tempelhof_Tempelhof", "Mariendorf_Tempelhof", "Marienfelde_Tempelhof", "Lichtenrade_Tempelhof"],
    "Treptow-Köpenick": ["Treptow_Treptow", "Plänterwald_Treptow", "Baumschulenweg_Treptow", "Johannisthal_Treptow", "Niederschöneweide_Treptow", "Altglienicke_Treptow", "Adlershof_Treptow", "Bohnsdorf_Treptow", "Oberschöneweide_Köpenick", "Köpenick_Köpenick", "Friedrichshagen_Köpenick", "Rahnsdorf_Köpenick", "Grünau_Köpenick", "Müggelheim_Köpenick", "Schmöckwitz_Köpenick"],
}

# Invert the table into locality_district -> district, the shape Series.map expects
district_by_locality = {
    locality: district_name
    for district_name, localities in localities_by_district.items()
    for locality in localities
}

# One district per row, aligned on the frame's own index (NaN where no mapping exists)
district = immo_data_berlin["locality_district"].map(district_by_locality)

# Make gaps in the lookup table visible instead of letting rows silently drop out of later group-bys
unmapped_localities = immo_data_berlin.loc[district.isna(), "locality_district"].unique()
if len(unmapped_localities) > 0:
    print("locality_district values without a district:", unmapped_localities)

immo_data_berlin["district"] = district
immo_data_berlin[["city", "district", "totalRent", "livingSpace","typeOfFlat", "noRooms", "floor"]]

In [None]:
# Determining the rent per square meter for each flat.
# A livingSpace of 0 is treated as missing: dividing by it would give inf (or NaN for 0/0),
# and a single inf makes any mean computed from this column infinite.
immo_data_berlin["rentPerMet2"] = immo_data_berlin.totalRent / immo_data_berlin.livingSpace.replace(0, np.nan)

immo_data_berlin[["city", "district", "totalRent", "livingSpace", "rentPerMet2"]]

In [None]:
# Bar chart (seaborn) of rent per square meter for each Berlin district;
# seaborn collapses the flats of a district into a single bar
plt.figure(figsize = (14, 10))

sns.barplot(data = immo_data_berlin, x = "district", y = "rentPerMet2")

# Tilt the district names so that neighbouring tick labels do not overlap
plt.xticks(rotation = -45)

In [None]:
# Graphing district against mean price per squared meter with plotly
# Just for comparison purposes between seaborn and plotly
#
# go.Bar draws one bar per (x, y) pair and does not aggregate, so the per-flat values
# are averaged per district first; passing the raw rows would pile one bar per flat
# onto each district instead of showing the district mean.
# Non-finite values (e.g. from a livingSpace of 0) are excluded from the mean.
district_mean_rent = (
    immo_data_berlin
    .assign(rentPerMet2 = immo_data_berlin.rentPerMet2.replace([np.inf, -np.inf], np.nan))
    .groupby("district")
    .rentPerMet2
    .mean()
)

data = go.Bar(x = district_mean_rent.index, y = district_mean_rent.values,
              marker_color = "red")


layout = go.Layout(bargap = 0,
                  xaxis = dict(title = 'District', gridcolor = 'rgb(183, 183, 183)', showline = True),
                  yaxis = dict(title = 'Rent per m^2', gridcolor = 'rgb(183, 183, 183)', showline = True),
                  font = dict(family = 'Courier New, monospace', size = 12, color = 'rgb(0, 0, 0)'),
                  legend = dict(x = 0, y = 1.0, bgcolor='rgba(255, 255, 255, 0)', bordercolor = 'rgba(255, 255, 255, 0)'))

fig = go.Figure(data = data, layout = layout)
# fig.show() is what the other plotly cells in this notebook use and needs no init_notebook_mode() call
fig.show()

In [None]:
# One-column data frame with the mean rentPerMet2 of every district (used for the
# geojson map), ordered from the lowest to the highest mean
immo_data_berlin_disMean = (
    immo_data_berlin
    .groupby("district")["rentPerMet2"]
    .mean()
    .to_frame()
    .sort_values(by = "rentPerMet2")
)

immo_data_berlin_disMean

In [None]:
# Graphing district against avg price per squared meter in Berlin map with plotly using geojson data
# GeoJSON origin (raw file): https://raw.githubusercontent.com/funkeinteraktiv/Berlin-Geodaten/master/berlin_bezirke.geojson

# Read explicitly as UTF-8: the district names used as match keys (e.g. Neukölln,
# Treptow-Köpenick) contain umlauts, and a platform-default encoding could garble them
with open("../input/berlin-bezirke/berlin_bezirke.geojson", encoding = "utf-8") as geojson_file:
    ger_regions_geo = json.load(geojson_file)

fig = px.choropleth(data_frame = immo_data_berlin_disMean, 
                    geojson = ger_regions_geo, 
                    locations = immo_data_berlin_disMean.index, #column from the data frame with the Berlin districts
                    featureidkey = 'properties.name',  #location of the name of the districts in the geojson file
                    color = 'rentPerMet2', #column from data frame whose value will set the hue for each district
                    color_continuous_scale = "Greens",
                    scope = "europe",
                    range_color = (10, 25)) #observed range for approx. min and max values

# Zoom the map to the districts that were actually drawn
fig.update_geos(showsubunits = True, showcoastlines = True, showland = True, fitbounds = "locations")

fig.update_layout(title_text = 'Berlin Geo-data')

fig.show()

In [None]:
# Creating data frame with number of flats per district in Berlin.
# The count column is named explicitly rather than by renaming an existing label:
# pandas >= 2.0 names the value_counts() result "count" (older versions reuse the
# column name "district"), so a rename keyed on "district" silently does nothing there
# and the num_of_flats column would not exist.
pie_data = (
    immo_data_berlin["district"]
    .value_counts()
    .rename("num_of_flats")      # name of the resulting count column
    .rename_axis("district")     # index holds the district names
    .to_frame()
)

pie_data

In [None]:
# Pie chart (plotly express) of the share of flats that falls into each Berlin district;
# every slice is labelled inside with its percentage and district name
fig = (
    px.pie(pie_data, names = pie_data.index, values = "num_of_flats")
    .update_traces(textinfo = 'percent+label', textposition = 'inside')
    .update_layout(template = 'plotly_white')
)

fig.show()

GeoJSON sources

Berlin districts: https://raw.githubusercontent.com/funkeinteraktiv/Berlin-Geodaten/master/berlin_bezirke.geojson

Berlin localities (neighbourhoods): https://raw.githubusercontent.com/ljwolf/geopython/master/data/berlin-neighbourhoods.geojson