# HK-2022 cross section dataset

## Data import and drop unnecessary rows

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

hk_2022 = pd.read_csv('./Data/HK_2022.csv', dtype={'plz': object, 'kid2019': object})

df = pd.DataFrame(hk_2022)

df['adat_year'] = df['adat'].astype(str).str[:4]

number_of_transactions_per_year = df.groupby('adat_year')['price_sqm'].count()
print(number_of_transactions_per_year)

object_features = [ 'ausstattung', 'badezimmer', 'denkmalobjekt', 'einbaukueche', 'ferienhaus', 'gaestewc', 'garten', 'haustier_erlaubt', 'kategorie_Haus', 'keller', 'parkplatz', 'zimmeranzahl']
general_object_information = ['bauphase', 'einliegerwohnung', 'foerderung', 'immobilientyp', 'kaufvermietet', 'mieteinnahmenpromonat', 'nebenraeume', 'rollstuhlgerecht', 'schlafzimmer']
area_information = ['grundstuecksflaeche', 'nutzflaeche', 'wohnflaeche']
energy_and_structure_information = ['baujahr', 'energieausweistyp', 'energieeffizienzklasse', 'ev_kennwert', 'ev_wwenthalten', 'heizkosten', 'heizungsart', 'letzte_modernisierung', 'objektzustand']
price_information = ['parkplatzpreis']
regional_information = ['gid2019', 'kid2019', 'ergg_1km', 'blid']
meta_information_advertisement = ['click_customer', 'click_schnellkontakte', 'click_url', 'click_weitersagen', 'hits', 'laufzeittage', 'hits_gen']
technical_variables = ['bef1', 'bef2', 'bef3', 'bef4', 'bef5', 'bef6', 'bef7', 'bef8', 'bef9', 'bef10', 'anbieter', 'duplicateid', 'click_schnellkontakte_gen', 'click_weitersagen_gen', 'click_url_gen']
other_variables = ['liste_show', 'liste_match', 'liste_show_gen', 'liste_match_gen']
columns_to_drop = object_features + general_object_information + area_information + energy_and_structure_information + price_information + regional_information + meta_information_advertisement + technical_variables + other_variables
df.drop(columns=columns_to_drop, axis=1, inplace=True)

df = df[df['adat_year'] == '2022']

df.head

adat_year
2020      2830
2021     22073
2022    234444
Name: price_sqm, dtype: int64


<bound method NDFrame.head of              obid            plz  kaufpreis     adat  price_sqm adat_year
1       132220614           7639   980000.0   2022m9  5444.4443      2022
2       131746727          67480   572000.0   2022m2  2325.2034      2022
3       133856230  Other missing   275000.0   2022m5  1470.5883      2022
4       131486960          57550   349000.0   2022m1  2115.1516      2022
5       133721583  Other missing   399000.0   2022m5  1338.9261      2022
...           ...            ...        ...      ...        ...       ...
259339  131106995           4626   160000.0   2022m1  1280.0000      2022
259340  132807761           4626   429900.0   2022m3  2866.0000      2022
259341  131097714           4626   160000.0   2022m1  1280.0000      2022
259342  131245864           4626   160000.0   2022m1  1280.0000      2022
259345  135754405           4617   375000.0  2022m10  3151.2605      2022

[234444 rows x 6 columns]>

## Map of Germany

In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn')
%matplotlib inline

plz_shape_df = gpd.read_file('./Data/PLZ-Gebiete/plz-gebiete.shp', dtype={'plz': str})


plz_region_df = pd.read_csv('./Data/zuordnung_plz_ort.csv', sep=',', dtype={'plz': str})
plz_region_df.drop('osm_id', axis=1, inplace=True)

df_locational_info = pd.merge(left=plz_shape_df, right=plz_region_df, on='plz',how='inner')
df_locational_info.drop(['note'], axis=1, inplace=True)

#Number of inhabitants per plz
#plz_einwohner_df = pd.read_csv('../Data/plz_einwohner.csv', sep=',', dtype={'plz': str, 'einwohner': int})
#germany_plz_einwohner_df = pd.merge(left=germany_df, right=plz_einwohner_df, on='plz', how='left')

plt.rcParams['figure.figsize'] = [16, 11]

top_cities = {
    'Hamburg': (9.993682, 53.551086),
    'Hannover': (9.73322, 52.37052),
    'Bremen': (8.7975, 53.1153),
    'Düsseldorf': (6.782048, 51.227144),
    'Essen': (7.0131, 51.4508),
    'Duisburg': (6.7611, 51.4322),
    'Cologne': (6.953101, 50.935173),
    'Dortmund': (7.468554, 51.513400),
    'Frankfurt am Main': (8.682127, 50.110924),
    'Stuttgart': (9.181332, 48.777128),
    'Munich': (11.576124, 48.137154),
    'Nürnberg': (11.077438, 49.449820),
    'Berlin': (13.404954, 52.520008),
    'Dresden': (13.7400, 51.0500),
    'Leipzig': (12.387772, 51.343479), 
}

df_transactions = df.groupby(['plz'])['plz'].count().reset_index(name='transactions')
germany_number_of_transactions_per_plz_df = pd.merge(left=df_locational_info, right=df_transactions, on='plz', how='left')
print(germany_number_of_transactions_per_plz_df.columns)
total = germany_number_of_transactions_per_plz_df['transactions'].sum()
print(total)

fig, ax = plt.subplots()

germany_number_of_transactions_per_plz_df.plot(ax=ax, column='transactions', categorical=False, legend=True, cmap='autumn_r', alpha=0.8)

for c in top_cities.keys():
    ax.text(
        x=top_cities[c][0],
        y=top_cities[c][1] + 0.08,
        s=c,
        fontsize=12,
        ha='center',
    )

    ax.plot(
        top_cities[c][0],
        top_cities[c][1],
        marker='o',
        c='black',
        alpha=0.5
    )
    
ax.set(
    title='Germany: Number of transactions per PLZ',
    aspect=1.3,
    facecolor='lightblue'
) 

ImportError: The 'read_file' function requires the 'pyogrio' or 'fiona' package, but neither is installed or imports correctly.
Importing fiona resulted in: dlopen(/opt/anaconda3/lib/python3.9/site-packages/fiona/ogrext.cpython-39-darwin.so, 0x0002): Library not loaded: @rpath/libpoppler.91.dylib
  Referenced from: <5436D78F-5C14-3A26-8F14-123928F6DCFB> /opt/anaconda3/lib/libgdal.30.dylib
  Reason: tried: '/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/python3.9/site-packages/fiona/../../../libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/python3.9/site-packages/fiona/../../../libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/opt/anaconda3/bin/../lib/libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/opt/anaconda3/bin/../lib/libpoppler.91.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS@rpath/libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/python3.9/site-packages/fiona/../../../libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/python3.9/site-packages/fiona/../../../libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/opt/anaconda3/bin/../lib/libpoppler.91.dylib' (no such file), '/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/anaconda3/lib/libpoppler.91.dylib' (no such file), '/opt/anaconda3/bin/../lib/libpoppler.91.dylib' (no such file), '/usr/local/lib/libpoppler.91.dylib' (no such file), '/usr/lib/libpoppler.91.dylib' (no such file, not in dyld cache)
Importing pyogrio resulted in: No module named 'pyogrio'

## Merge locational information

In [15]:
plz_city = pd.read_csv('./Data/zuordnung_plz_ort.csv', dtype={'plz': str})
df_plz_city = pd.DataFrame(plz_city)
plz_city.drop('osm_id', axis=1, inplace=True)

df = pd.merge(df, df_plz_city, how="left", on="plz")

df = df.groupby('bundesland')['kaufpreis'].mean().reset_index()
#count_by_ort = df_big_cities.groupby('ort')['obid'].count().sort_values(ascending=False).reset_index(name='transactions')

plz_einwohner_df = pd.read_csv('./Data/plz_einwohner.csv', sep=',', dtype={'plz': str, 'einwohner': int})
einwohner_bundesland_df = pd.merge(df_plz_city, plz_einwohner_df, how="left", on="plz")
einwohner_bundesland_df = einwohner_bundesland_df.groupby('bundesland')['einwohner'].sum()

df = pd.merge(df, einwohner_bundesland_df, how="inner", on="bundesland")
df 

Unnamed: 0,bundesland,kaufpreis,einwohner
0,Baden-Württemberg,613140.892695,11504792
1,Bayern,673879.378869,14181130
2,Berlin,846280.363123,3291932
3,Brandenburg,513224.773693,5859910
4,Bremen,459263.750446,666066
5,Hamburg,923291.380846,1726063
6,Hessen,601663.485861,6160456
7,Mecklenburg-Vorpommern,398069.191038,6520766
8,Niedersachsen,407151.678214,10664114
9,Nordrhein-Westfalen,519309.665267,17729560


## Merge unemployment and income

In [17]:
unemployment_rate = pd.read_csv('./Data/Macroeconomic Data States/Unemployment-rate.csv')
df_unemployment_rate = pd.DataFrame(unemployment_rate)
df_unemployment_rate.rename(columns={'Bundesland': 'bundesland'}, inplace=True)

df = pd.merge(df, df_unemployment_rate, how="inner", on="bundesland")

income = pd.read_csv('./Data/Macroeconomic Data States/Income.csv')
df_income = pd.DataFrame(income)
df_income.rename(columns={'Bundesland': 'bundesland'}, inplace=True)

df = pd.merge(df, df_income, how="inner", on="bundesland")

print(df.shape)
df

(15, 5)


Unnamed: 0,bundesland,kaufpreis,einwohner,Arbeitslosenquote,Einkommen
0,Baden-Württemberg,613140.892695,11504792,3.5,47962
1,Bayern,673879.378869,14181130,3.1,46757
2,Berlin,846280.363123,3291932,8.8,43179
3,Brandenburg,513224.773693,5859910,5.6,36607
4,Bremen,459263.750446,666066,10.2,43434
5,Hamburg,923291.380846,1726063,6.8,48132
6,Hessen,601663.485861,6160456,4.8,47762
7,Mecklenburg-Vorpommern,398069.191038,6520766,7.3,36191
8,Niedersachsen,407151.678214,10664114,5.3,41924
9,Nordrhein-Westfalen,519309.665267,17729560,6.8,44230


In [18]:
from IPython.display import HTML, display
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("darkgrid")

In [19]:
housing_model = ols("""kaufpreis ~  
                                            einwohner
                                            + Arbeitslosenquote
                                            + Einkommen""", data=df).fit()

housing_model_summary = housing_model.summary()
HTML(housing_model_summary.as_html())



0,1,2,3
Dep. Variable:,kaufpreis,R-squared:,0.492
Model:,OLS,Adj. R-squared:,0.353
Method:,Least Squares,F-statistic:,3.544
Date:,"Thu, 20 Apr 2023",Prob (F-statistic):,0.0516
Time:,15:04:44,Log-Likelihood:,-197.65
No. Observations:,15,AIC:,403.3
Df Residuals:,11,BIC:,406.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-7.744e+05,4.76e+05,-1.626,0.132,-1.82e+06,2.74e+05
einwohner,-0.0053,0.009,-0.582,0.573,-0.025,0.015
Arbeitslosenquote,9397.4429,2.71e+04,0.347,0.735,-5.02e+04,6.9e+04
Einkommen,30.3718,9.430,3.221,0.008,9.616,51.128

0,1,2,3
Omnibus:,0.071,Durbin-Watson:,1.991
Prob(Omnibus):,0.965,Jarque-Bera (JB):,0.247
Skew:,0.126,Prob(JB):,0.884
Kurtosis:,2.424,Cond. No.,117000000.0
