In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [2]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load both cleaned extracts; the '2015' column is removed from the first one.
df1 = pd.read_csv('Clean_Data_08-06-KK.csv').drop(columns=['2015'])
df2 = pd.read_csv('Clean_Data_08-06.csv')

In [3]:
# Stack the two extracts row-wise (the default axis) and renumber the index 0..n-1.
df = pd.concat((df1, df2), ignore_index=True)

In [5]:
# Disabled ad-hoc inspection: rows where Kommune is "København" and Type is not "Ejerlejlighed".
# df[(df["Kommune"] == "København") & (df["Type"] != "Ejerlejlighed")]

In [6]:
# Disabled export of the combined frame `df` to 'Clean_08-06.csv' (index column omitted).
# df.to_csv('Clean_08-06.csv', index=False)

In [8]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Relates the 2019 -> 2023 percentage change in mean Price_per_kvdm to the log
# of the mean Distance_to_rådhus, per municipality (Kommune) and property Type.
# One scatter plot with a fitted regression line is saved per Type as
# '<Type>_plot.png' in the working directory.

# Keep only the two comparison years.
df_filtered = df[df['Year'].isin([2019, 2023])]

# Mean Price_per_kvdm for each municipality, year and type.
# groupby sorts by its keys, so within every (Kommune, Type) pair the 2019 row
# comes before the 2023 row; the pct_change below depends on that ordering.
mean_price_per_kvdm = (
    df_filtered
    .groupby(['Kommune', 'Year', 'Type'])['Price_per_kvdm']
    .mean()
    .reset_index()
)

# Percentage change relative to the previous row of the same (Kommune, Type).
# fill_method=None: a missing mean must stay missing instead of being
# forward-filled, which would otherwise be reported as a 0 % change.
mean_price_per_kvdm['Procent_change'] = (
    mean_price_per_kvdm
    .groupby(['Kommune', 'Type'])['Price_per_kvdm']
    .pct_change(fill_method=None) * 100
)

# Only the 2023 rows carry the 2019 -> 2023 change (2019 rows are always NaN).
# A pair without a 2019 row also ends up NaN here; it is filtered out before plotting.
mean_price_per_kvdm_2023 = mean_price_per_kvdm[mean_price_per_kvdm['Year'] == 2023]

# Mean Distance_to_rådhus per municipality and type, averaged over the
# filtered 2019/2023 rows so that each pair gets a single, year-independent value.
mean_distance = (
    df_filtered
    .groupby(['Kommune', 'Type'])['Distance_to_rådhus']
    .mean()
    .reset_index()
)

# Natural log of the mean distance. Non-positive distances yield -inf/NaN;
# the numpy warnings are silenced because such rows are excluded further down.
with np.errstate(divide='ignore', invalid='ignore'):
    mean_distance['log_Distance_to_rådhus'] = np.log(mean_distance['Distance_to_rådhus'])

# Combine price change and distance, keeping only the columns used below.
final_df = pd.merge(mean_price_per_kvdm_2023, mean_distance, on=['Kommune', 'Type'])
final_df = final_df[['Kommune', 'Type', 'Procent_change', 'log_Distance_to_rådhus']]


# One figure per property type: scatter, municipality labels, regression line.
types = final_df['Type'].unique()
for property_type in types:
    subset = final_df[final_df['Type'] == property_type]

    # LinearRegression rejects NaN/inf, so keep only rows where both
    # coordinates are finite.
    coords = subset[['log_Distance_to_rådhus', 'Procent_change']].to_numpy(dtype=float)
    subset = subset[np.isfinite(coords).all(axis=1)]
    if subset.empty:
        # Nothing to plot or fit for this type; skip it instead of crashing.
        print(f'Skipping {property_type}: no municipalities with finite values')
        continue

    # Scatter plot
    fig, ax = plt.subplots()
    ax.scatter(subset['log_Distance_to_rådhus'], subset['Procent_change'], label='Data points')

    # Label every point with its municipality name, slightly above the marker.
    for kommune, log_dist, change in zip(subset['Kommune'],
                                         subset['log_Distance_to_rådhus'],
                                         subset['Procent_change']):
        ax.annotate(kommune,
                    (log_dist, change),
                    textcoords="offset points",
                    xytext=(0, 5),
                    ha='center',
                    fontsize='xx-small')

    # Ordinary least squares: Procent_change ~ log_Distance_to_rådhus.
    # sklearn expects a 2-D feature matrix, hence the reshape to a single column.
    X = subset['log_Distance_to_rådhus'].values.reshape(-1, 1)
    y = subset['Procent_change'].values
    reg = LinearRegression().fit(X, y)
    y_pred = reg.predict(X)
    ax.plot(subset['log_Distance_to_rådhus'], y_pred, color='red', label='Linear regression')

    # Plot settings
    ax.set_title(f'Procent Change vs Log Distance to Rådhus for {property_type}')
    ax.set_xlabel('Log Distance to Rådhus')
    ax.set_ylabel('Procent Change in Price per kvm')
    ax.legend()
    ax.grid(True)

    # Save to disk and close the figure so it is not rendered inline and
    # does not accumulate in memory across iterations.
    fig.savefig(f'{property_type}_plot.png')
    plt.close(fig)
