## Setup

In [260]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold


## EDA

In [261]:
df = pd.read_csv('../data/tod-on-main.csv')

In [262]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

## New Model EDA

In [263]:
y1 = df["n_movers_out"]
y2 = df["n_movers_out_Lowincome"]
y3 = df["n_movers_in"]
y4 = df["n_movers_in_Lowincome"]

In [264]:
before_df = df.filter(regex=r'^(Before)', axis=1)
after_df = df.filter(regex=r'^(After)', axis=1)

In [265]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [266]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [267]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [268]:
# Remove other dwelling and other single-attached (co-linearity) 
# & remove apartment (remove bedrooms, maybe add it back again later)

In [269]:
# Do we maybe even remove Dwellings since there is collinearity?

In [270]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house"], axis=1)

## Missing Values

In [271]:
def impute_dwellings(df, before=True):
    word = "Before" if before else "After"
    df_known = df[df[f'{word} Dwellings'].notna()]
    df_missing = df[df[f'{word} Dwellings'].isna()]
    X_train = df_known[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_train = df_known[f"{word} Dwellings"]
    model = LinearRegression()
    model.fit(X_train, y_train)
    X_missing = df_missing[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_pred = model.predict(X_missing)
    df.loc[df[f'{word} Dwellings'].isna(), f'{word} Dwellings'] = y_pred

In [272]:
impute_dwellings(before_df)
impute_dwellings(after_df, False)

## Imputing with KNN

In [273]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [274]:
def kkn_imputation(df):
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(scaled_df), columns=scaled_df.columns)
    return imputed_df

In [275]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [276]:
combined_df = pd.concat([before_imputed,after_imputed], axis = 1)

In [277]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [278]:
difference_df = after_imputed - before_imputed

## Better Correlation

In [279]:
difference_corr = difference_df.copy()
before_imputed_corr = before_imputed.copy()
after_imputed_corr = after_imputed.copy()

difference_corr["n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"]
before_imputed_corr["n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"]
after_imputed_corr["n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"]

In [280]:
corr = difference_corr.corr()['n_movers_out_Lowincome'].sort_values(ascending=False)
b_corr = before_imputed_corr.corr()['n_movers_out_Lowincome'].sort_values(ascending=False)
a_corr = after_imputed_corr.corr()['n_movers_out_Lowincome'].sort_values(ascending=False)

In [281]:
corr

n_movers_out_Lowincome                                         1.000000
Change Rented                                                  0.333942
Change Row house                                               0.291386
Change Total Occupied Private Dwellings                        0.290506
Change Owned                                                   0.226051
Change Dwellings                                               0.221485
Change Population Density per square kilometre                 0.103419
Change Single-detached house                                   0.090542
Change Apartment, duplex                                       0.089983
Change Average number of bedrooms per dwelling                 0.052837
Change Apartment, building that has five or more storeys       0.046196
Change Apartment, building that has fewer than five storeys    0.032038
Change Movable dwelling                                        0.013492
Change Semi-detached house                                    -0

In [282]:
b_corr

n_movers_out_Lowincome                                         1.000000
Change Apartment, building that has five or more storeys       0.336406
Change Population Density per square kilometre                 0.250163
Change Dwellings                                               0.191288
Change Total Occupied Private Dwellings                        0.148792
Change Rented                                                  0.132586
Change Owned                                                   0.123314
Change Row house                                              -0.002566
Change Semi-detached house                                    -0.038048
Change Average number of bedrooms per dwelling                -0.051089
Change Movable dwelling                                       -0.079914
Change Apartment, duplex                                      -0.098594
Change Apartment, building that has fewer than five storeys   -0.141175
Change Single-detached house                                  -0

In [283]:
a_corr

n_movers_out_Lowincome                                         1.000000
Change Apartment, building that has five or more storeys       0.347393
Change Population Density per square kilometre                 0.293594
Change Dwellings                                               0.249453
Change Total Occupied Private Dwellings                        0.243442
Change Rented                                                  0.240914
Change Owned                                                   0.221236
Change Row house                                               0.206629
Change Average number of bedrooms per dwelling                 0.038705
Change Apartment, duplex                                      -0.059306
Change Movable dwelling                                       -0.062478
Change Semi-detached house                                    -0.079494
Change Apartment, building that has fewer than five storeys   -0.126503
Change Single-detached house                                  -0

## Ridge & Lasso Model

In [284]:
combined_df.columns

Index(['Before Population Density per square kilometre', 'Before Dwellings',
       'Before Total Occupied Private Dwellings',
       'Before Single-detached house', 'Before Semi-detached house',
       'Before Row house', 'Before Apartment, duplex',
       'Before Apartment, building that has fewer than five storeys',
       'Before Apartment, building that has five or more storeys',
       'Before Movable dwelling',
       'Before Average number of bedrooms per dwelling', 'Before Owned',
       'Before Rented', 'After Population Density per square kilometre',
       'After Dwellings', 'After Total Occupied Private Dwellings',
       'After Single-detached house', 'After Semi-detached house',
       'After Row house', 'After Apartment, duplex',
       'After Apartment, building that has fewer than five storeys',
       'After Apartment, building that has five or more storeys',
       'After Movable dwelling',
       'After Average number of bedrooms per dwelling', 'After Owned',
     

In [285]:
X = combined_df[["Before Population Density per square kilometre", 
             "Before Dwellings", 
             "Before Average number of bedrooms per dwelling", 
             "Before Apartment, building that has five or more storeys",
             "Before Total Occupied Private Dwellings",
             "After Population Density per square kilometre", 
             "After Dwellings", 
             "After Average number of bedrooms per dwelling", 
             "After Apartment, building that has five or more storeys",
             "After Total Occupied Private Dwellings",
            ]]

In [286]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2, random_state=53)

In [287]:
kf = KFold(n_splits=5, shuffle=True, random_state=53)  # 5 folds
model = Ridge(alpha = 1000)

mse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y2.iloc[train_index], y2.iloc[val_index]

    # Train model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    mse_scores.append(mse)
    r2_scores.append(r2)

print("Mean MSE across folds:", sum(mse_scores) / len(mse_scores))
print("Mean R² across folds:", sum(r2_scores) / len(r2_scores))

Mean MSE across folds: 180984382.00882477
Mean R² across folds: -0.014259697564182394


In [288]:
model_r = Ridge(alpha = 100)
model_r.fit(X_train, y_train)

In [289]:
y_pred = model_r.predict(X_test)

In [290]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 4024791133.250008
R² Score: -0.7247597898203935
Intercept: 18094.531368972755
Coefficients: [-187.11127587 1176.98120506 -581.87509885   88.97623912 1154.30289348
  917.05369334 1404.10313883  950.94890149  551.37109443 1465.75822157]


In [291]:
model_l = Lasso(alpha=100, max_iter = 10000)
model_l.fit(X_train, y_train)

In [292]:
y_pred = model_l.predict(X_test)

In [293]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 3319853492.5267296
R² Score: -0.422670052291934
Intercept: 17919.320565734928
Coefficients: [ -9588.41599927      0.          -4572.16851848 -14993.85198809
   6285.58158186  13199.63840681   1989.77655621  -2448.70675306
   4789.19833591   6781.75502543]


## Decision Tree Regressor

In [294]:
from sklearn.tree import DecisionTreeRegressor


In [295]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [296]:
y_pred = model_l.predict(X_test)

In [297]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 3319853492.5267296
R² Score: -0.422670052291934


## Random Forest

In [298]:
from sklearn.ensemble import RandomForestRegressor


In [299]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [300]:
y_pred = model_rf.predict(X_test)

In [301]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 3350593157.8358927
R² Score: -0.435843043615578


## XGBoost


In [302]:
from xgboost import XGBRegressor

In [303]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [258]:
y_pred = model_xgb.predict(X_test)

In [259]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 3214609359.969157
R² Score: -0.377569364594073
