## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso


## EDA

In [25]:
df = pd.read_csv('../data/tod-on-main.csv')

In [26]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [27]:
before_df = df.filter(regex=r'^(Before)', axis=1)
# before_df = before_df.drop(["Before 4 bedrooms", "Before 5 or more bedrooms", 'Before No bedrooms', 'Before 1 bedroom', "Before Apartment"], axis=1)

In [28]:
after_df = df.filter(regex=r'^(After)', axis=1)
# after_df = after_df.drop(['After No bedrooms', 'After 1 bedroom', 'After Other attached dwelling'], axis=1)

In [29]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [30]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [32]:
after_df[cols_to_drop].isnull().sum()

After No bedrooms           40
After 0 to 1 bedroom        28
After 1 bedroom             40
After 2 bedrooms            28
After 3 bedrooms            28
After 4 or more bedrooms    28
dtype: int64

In [34]:
col_bedroom_names = after_df.filter(regex='bedrooms?$', axis=1).columns

In [8]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
# cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
# after_df = after_df.drop(columns = cols_to_drop)

In [9]:
# Remove other dwelling and other single-attached (co-linearity) 
# & remove apartment (remove bedrooms, maybe add it back again later)

In [10]:
# Do we maybe even remove Dwellings since there is collinearity?

In [11]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house", "Before Dwellings"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house", "After Dwellings"], axis=1)

In [12]:
before_df.columns, before_df.shape

(Index(['Before Population Density per square kilometre',
        'Before Total Occupied Private Dwellings',
        'Before Single-detached house', 'Before Semi-detached house',
        'Before Row house', 'Before Apartment, duplex',
        'Before Apartment, building that has fewer than five storeys',
        'Before Apartment, building that has five or more storeys',
        'Before Movable dwelling',
        'Before Average number of bedrooms per dwelling', 'Before Owned',
        'Before Rented'],
       dtype='object'),
 (68, 12))

In [13]:
after_df.columns, after_df.shape

(Index(['After Population Density per square kilometre',
        'After Total Occupied Private Dwellings', 'After Single-detached house',
        'After Semi-detached house', 'After Row house',
        'After Apartment, duplex',
        'After Apartment, building that has fewer than five storeys',
        'After Apartment, building that has five or more storeys',
        'After Movable dwelling',
        'After Average number of bedrooms per dwelling', 'After Owned',
        'After Rented'],
       dtype='object'),
 (68, 12))

## Imputing with KNN

In [19]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [20]:
# This is just in case you want to check how reasonable it is
# df_unscaled = pd.DataFrame(scaler.inverse_transform(imputed_df), columns=imputed_df.columns)
# df_unscaled.head()

In [21]:
def kkn_imputation(df):
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(scaled_df), columns=scaled_df.columns)
    return imputed_df

In [22]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [23]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [24]:
difference_df = after_imputed - before_imputed

## Adding bedrooms

In [41]:
after_bedrooms = after_df[col_bedroom_names]

In [42]:
scaler = StandardScaler()
scaled_after_bedrooms = pd.DataFrame(scaler.fit_transform(after_bedrooms), columns=after_bedrooms.columns)

In [43]:
difference_df[col_bedroom_names] = scaled_after_bedrooms

In [45]:
difference_df_w_bedrooms = kkn_imputation(difference_df)

## Ridge & Lasso Model

In [47]:
y = df["n_movers_out_Lowincome"]
X = difference_df_w_bedrooms

In [48]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53)

In [49]:
model_r = Ridge()
model_r.fit(X_train, y_train)

In [50]:
y_pred = model_r.predict(X_test)

In [51]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 162782351.94515142
R² Score: 0.16900323563384712
Intercept: 21153.68791274329
Coefficients: [ 9625.57751312  -162.31625837  -626.65672367 -3469.33367508
   372.65625099  2151.47149485   555.39499173  5529.32063129
 -5059.61655644 -2304.60340963  2714.57680562 -8556.03711358
  2269.79786506  2579.08965449  7512.34636696 -3736.03018559
  4466.71327053 -2639.30255059]


In [52]:
model_l = Lasso()
model_l.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [53]:
y_pred = model_l.predict(X_test)

In [54]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 229963881.12928945
R² Score: -0.17395552316329677
Intercept: 23771.011385133934
Coefficients: [ 11804.63150742  33762.72983614    400.56842283  -3773.92131684
    381.5931289    2529.9997835    2177.13469103   7042.38723036
  -6640.85597658  -4878.16327812 -19270.71494779 -28722.49523526
   2070.84688216  -2354.91450094  23760.92619984 -10147.10460292
   6321.59029948  -4853.51923807]


## Decision Tree Regressor

In [55]:
from sklearn.tree import DecisionTreeRegressor


In [56]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [57]:
y_pred = model_l.predict(X_test)

In [58]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 229963881.12928945
R² Score: -0.17395552316329677


## Random Forest

In [59]:
from sklearn.ensemble import RandomForestRegressor


In [60]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [61]:
y_pred = model_rf.predict(X_test)

In [62]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 105516089.69500001
R² Score: 0.46134499177983235


## XGBoost


In [63]:
from xgboost import XGBRegressor

In [64]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [65]:
y_pred = model_xgb.predict(X_test)

In [66]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 190517293.39972517
R² Score: 0.02741757641938125


## Neural Network

In [67]:
from sklearn.neural_network import MLPRegressor

In [68]:
model_nn = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=10000)
model_nn.fit(X_train, y_train)



In [69]:
y_pred = model_nn.predict(X_test)

In [70]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 531598125.9262733
R² Score: -1.7137851082081124
