# HK-2022 cross section dataset

## Data import and drop unnecessary rows

In [92]:
import pandas as pd
import matplotlib.pyplot as plt

hk_2022 = pd.read_csv('./Data/HK_2022.csv', dtype={'plz': object, 'kid2019': object})

df = pd.DataFrame(hk_2022)

df['adat_year'] = df['adat'].astype(str).str[:4]

number_of_transactions_per_year = df.groupby('adat_year')['price_sqm'].count()
print(number_of_transactions_per_year)

object_features = [ 'ausstattung', 'badezimmer', 'denkmalobjekt', 'einbaukueche', 'ferienhaus', 'gaestewc', 'garten', 'haustier_erlaubt', 'kategorie_Haus', 'keller', 'parkplatz', 'zimmeranzahl']
general_object_information = ['bauphase', 'einliegerwohnung', 'foerderung', 'immobilientyp', 'kaufvermietet', 'mieteinnahmenpromonat', 'nebenraeume', 'rollstuhlgerecht', 'schlafzimmer']
area_information = ['grundstuecksflaeche', 'nutzflaeche', 'wohnflaeche']
energy_and_structure_information = ['baujahr', 'energieausweistyp', 'energieeffizienzklasse', 'ev_kennwert', 'ev_wwenthalten', 'heizkosten', 'heizungsart', 'letzte_modernisierung', 'objektzustand']
price_information = ['parkplatzpreis']
regional_information = ['gid2019', 'kid2019', 'ergg_1km', 'blid']
meta_information_advertisement = ['click_customer', 'click_schnellkontakte', 'click_url', 'click_weitersagen', 'hits', 'laufzeittage', 'hits_gen']
technical_variables = ['bef1', 'bef2', 'bef3', 'bef4', 'bef5', 'bef6', 'bef7', 'bef8', 'bef9', 'bef10', 'anbieter', 'duplicateid', 'click_schnellkontakte_gen', 'click_weitersagen_gen', 'click_url_gen']
other_variables = ['liste_show', 'liste_match', 'liste_show_gen', 'liste_match_gen']
columns_to_drop = object_features + general_object_information + area_information + energy_and_structure_information + price_information + regional_information + meta_information_advertisement + technical_variables + other_variables
df.drop(columns=columns_to_drop, axis=1, inplace=True)

df = df[df['adat_year'] == '2022']

df.head

<bound method NDFrame.head of              obid            plz  kaufpreis mieteinnahmenpromonat  \
0       130196929  Other missing   279000.0         Other missing   
1       132220614           7639   980000.0         Other missing   
2       131746727          67480   572000.0         Other missing   
3       133856230  Other missing   275000.0         Other missing   
4       131486960          57550   349000.0         Other missing   
...           ...            ...        ...                   ...   
259342  131245864           4626   160000.0         Other missing   
259343  129729448           4626   211449.0         Other missing   
259344  130328229           4626   256749.0         Other missing   
259345  135754405           4617   375000.0         Other missing   
259346  130677810           4617   312114.0         Other missing   

                           heizkosten  baujahr letzte_modernisierung  \
0       Variable for other types only   2001.0         Other missing 

## Merge locational information

In [91]:
plz_city = pd.read_csv('./Data/plz_einwohner.csv', dtype={'plz': str})
df_plz_city = pd.DataFrame(plz_city)
#plz_city.drop('osm_id', axis=1, inplace=True)
df_plz_city.head()
#plz_city = pd.read_csv('./Data/zuordnung_plz_ort.csv', dtype={'plz': str})
#plz_city.drop('osm_id', axis=1, inplace=True)

#df = pd.merge(df, df_plz_city, how="left", on="plz")
#df.head()

#df_new = df.groupby('bundesland')['kaufpreis'].mean().reset_index()
#df_new
#count_by_ort = df_big_cities.groupby('ort')['obid'].count().sort_values(ascending=False).reset_index(name='transactions')


""" plz_einwohner_df = pd.read_csv('../Data/plz_einwohner.csv', sep=',', dtype={'plz': str, 'einwohner': int})
einwohner_bundesland_df = pd.merge(df_plz_city, plz_einwohner_df, how="left", on="plz")
einwohner_bundesland_df = einwohner_bundesland_df.groupby('bundesland')['einwohner'].sum()

df = pd.merge(df, einwohner_bundesland_df, how="inner", on="bundesland")
df """

' plz_einwohner_df = pd.read_csv(\'../Data/plz_einwohner.csv\', sep=\',\', dtype={\'plz\': str, \'einwohner\': int})\neinwohner_bundesland_df = pd.merge(df_plz_city, plz_einwohner_df, how="left", on="plz")\neinwohner_bundesland_df = einwohner_bundesland_df.groupby(\'bundesland\')[\'einwohner\'].sum()\n\ndf = pd.merge(df, einwohner_bundesland_df, how="inner", on="bundesland")\ndf '

## Merge unemployment and income

In [61]:
unemployment_rate = pd.read_csv('../Data/Macroeconomic Data States/Unemployment-rate.csv')
df_unemployment_rate = pd.DataFrame(unemployment_rate)
df_unemployment_rate.rename(columns={'Bundesland': 'bundesland'}, inplace=True)

df = pd.merge(df, df_unemployment_rate, how="inner", on="bundesland")

income = pd.read_csv('../Data/Macroeconomic Data States/Income.csv')
df_income = pd.DataFrame(income)
df_income.rename(columns={'Bundesland': 'bundesland'}, inplace=True)

df = pd.merge(df, df_income, how="inner", on="bundesland")

print(df.shape)
df

(15, 5)


Unnamed: 0,bundesland,kaufpreis,einwohner,Arbeitslosenquote,Einkommen
0,Baden-Württemberg,613140.892695,11504792,3.5,47962
1,Bayern,673879.378869,14181130,3.1,46757
2,Berlin,846280.363123,3291932,8.8,43179
3,Brandenburg,513224.773693,5859910,5.6,36607
4,Bremen,459263.750446,666066,10.2,43434


In [55]:
from IPython.display import HTML, display
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("darkgrid")

In [58]:
housing_model = ols("""kaufpreis ~  
                                            einwohner
                                            + Arbeitslosenquote
                                            + Einkommen""", data=df).fit()

housing_model_summary = housing_model.summary()
HTML(housing_model_summary.as_html())



0,1,2,3
Dep. Variable:,kaufpreis,R-squared:,0.492
Model:,OLS,Adj. R-squared:,0.353
Method:,Least Squares,F-statistic:,3.544
Date:,"Thu, 20 Apr 2023",Prob (F-statistic):,0.0516
Time:,14:33:27,Log-Likelihood:,-197.65
No. Observations:,15,AIC:,403.3
Df Residuals:,11,BIC:,406.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-7.744e+05,4.76e+05,-1.626,0.132,-1.82e+06,2.74e+05
einwohner,-0.0053,0.009,-0.582,0.573,-0.025,0.015
Arbeitslosenquote,9397.4429,2.71e+04,0.347,0.735,-5.02e+04,6.9e+04
Einkommen,30.3718,9.430,3.221,0.008,9.616,51.128

0,1,2,3
Omnibus:,0.071,Durbin-Watson:,1.991
Prob(Omnibus):,0.965,Jarque-Bera (JB):,0.247
Skew:,0.126,Prob(JB):,0.884
Kurtosis:,2.424,Cond. No.,117000000.0
