## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold


## EDA

In [2]:
df = pd.read_csv('../data/tod-on-main.csv')

In [3]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [4]:
df = df.dropna(subset=['n_stayers'])

## New Model EDA

In [5]:
original_population = df["n_movers_out"] + df["n_stayers"]
original_low_income_population = df["n_movers_out_Lowincome"] + df["n_stayers_Lowincome"]

In [6]:
y1 = df["n_movers_out"] / original_population
y2 = df["n_movers_out_Lowincome"] / original_low_income_population

In [7]:
before_df = df.filter(regex=r'^(Before)', axis=1)
after_df = df.filter(regex=r'^(After)', axis=1)

In [8]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [9]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [10]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [11]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house"], axis=1)

## Missing Values

In [12]:
def impute_dwellings(df, before=True):
    word = "Before" if before else "After"
    df_known = df[df[f'{word} Dwellings'].notna()]
    df_missing = df[df[f'{word} Dwellings'].isna()]
    X_train = df_known[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_train = df_known[f"{word} Dwellings"]
    model = LinearRegression()
    model.fit(X_train, y_train)
    X_missing = df_missing[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_pred = model.predict(X_missing)
    df.loc[df[f'{word} Dwellings'].isna(), f'{word} Dwellings'] = y_pred

In [13]:
impute_dwellings(before_df)
impute_dwellings(after_df, False)

## Imputing with KNN

In [14]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [15]:
# These aren't great, but if I get rid of it it's only 3 rows that are removed

def kkn_imputation(df):
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    return imputed_df

In [16]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [17]:
combined_df = pd.concat([before_imputed,after_imputed], axis = 1)

In [18]:
before_imputed_corr = before_imputed.copy()
after_imputed_corr = after_imputed.copy()

In [19]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [20]:
difference_df = after_imputed - before_imputed

## Looking at Correlation

In [21]:
difference_corr = difference_df.copy()

difference_corr["%n_movers_out"] = y1
before_imputed_corr["%n_movers_out"] = y1
after_imputed_corr["%n_movers_out"] = y1

difference_corr["%n_movers_out_Lowincome"] = y2
before_imputed_corr["%n_movers_out_Lowincome"] = y2
after_imputed_corr["%n_movers_out_Lowincome"] = y2

In [22]:
corr = difference_corr.corr()['%n_movers_out_Lowincome'].sort_values(ascending=False)
b_corr = before_imputed_corr.corr()['%n_movers_out_Lowincome'].sort_values(ascending=False)
a_corr = after_imputed_corr.corr()['%n_movers_out_Lowincome'].sort_values(ascending=False)

In [23]:
# Correlation of change columns
corr

%n_movers_out_Lowincome                                        1.000000
%n_movers_out                                                  0.956534
Change Semi-detached house                                     0.183692
Change Average number of bedrooms per dwelling                 0.111873
Change Single-detached house                                   0.024313
Change Movable dwelling                                        0.020216
Change Apartment, duplex                                      -0.041589
Change Apartment, building that has five or more storeys      -0.098551
Change Owned                                                  -0.127169
Change Population Density per square kilometre                -0.133894
Change Row house                                              -0.151952
Change Total Occupied Private Dwellings                       -0.167065
Change Apartment, building that has fewer than five storeys   -0.167980
Change Dwellings                                              -0

In [24]:
# Correlation of "Before" columns
b_corr

%n_movers_out_Lowincome                                        1.000000
%n_movers_out                                                  0.956534
Before Single-detached house                                  -0.034484
Before Row house                                              -0.039173
Before Average number of bedrooms per dwelling                -0.047745
Before Movable dwelling                                       -0.073202
Before Semi-detached house                                    -0.077029
Before Apartment, building that has fewer than five storeys   -0.078833
Before Apartment, duplex                                      -0.124120
Before Apartment, building that has five or more storeys      -0.148384
Before Rented                                                 -0.159894
Before Dwellings                                              -0.178927
Before Population Density per square kilometre                -0.182199
Before Total Occupied Private Dwellings                       -0

In [25]:
# Correlation of "After" columns
a_corr

%n_movers_out_Lowincome                                       1.000000
%n_movers_out                                                 0.956534
After Semi-detached house                                     0.033958
After Average number of bedrooms per dwelling                -0.000006
After Single-detached house                                  -0.022760
After Movable dwelling                                       -0.084982
After Apartment, duplex                                      -0.115514
After Row house                                              -0.129231
After Apartment, building that has fewer than five storeys   -0.130364
After Apartment, building that has five or more storeys      -0.130727
After Rented                                                 -0.190470
After Dwellings                                              -0.200417
After Total Occupied Private Dwellings                       -0.207220
After Owned                                                  -0.212590
After 

## Preparing Training Data

In [26]:
combined_df["original_population"] = original_population.reset_index()[0]
combined_df["original_low_income_population"] = original_low_income_population.reset_index()[0]

In [27]:
def log_and_standardize(df):
    log_df = np.log(df + 1)
    standardized_df = (log_df - log_df.mean()) / log_df.std()
    return standardized_df

In [28]:
training_data = log_and_standardize(combined_df)

In [29]:
difference_df["original_population"] = original_population.reset_index()[0]
difference_df["original_low_income_population"] = original_low_income_population.reset_index()[0]

In [30]:
difference_df = (difference_df - difference_df.mean()) / difference_df.std()

## Ridge & Lasso Model

In [31]:
# X = training_data
X = difference_df

In [32]:
# X = training_data[["Before Population Density per square kilometre", 
#              "Before Dwellings", 
#              "Before Average number of bedrooms per dwelling", 
#              "Before Apartment, building that has five or more storeys",
#              "Before Total Occupied Private Dwellings",
#              "After Population Density per square kilometre", 
#              "After Dwellings", 
#              "After Average number of bedrooms per dwelling", 
#              "After Apartment, building that has five or more storeys",
#              "After Total Occupied Private Dwellings",
#             ]]

In [33]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.2, random_state=53)

In [34]:
kf = KFold(n_splits=5, shuffle=True, random_state=53)  # 5 folds
model = Ridge(alpha = 1000)

mse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y2.iloc[train_index], y2.iloc[val_index]

    # Train model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    mse_scores.append(mse)
    r2_scores.append(r2)

print("Mean MSE across folds:", sum(mse_scores) / len(mse_scores))
print("Mean R² across folds:", sum(r2_scores) / len(r2_scores))

Mean MSE across folds: 0.02119046916993448
Mean R² across folds: -0.11077529195804343


In [35]:
model_r = Ridge()
model_r.fit(X_train, y_train)

In [36]:
y_pred = model_r.predict(X_test)

In [37]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 0.011034728911380446
R² Score: 0.32729807112836085
Intercept: 0.6816791563513616
Coefficients: [ 0.00622988  0.01084849  0.02003445 -0.07522071  0.00660802  0.07344509
 -0.01163758  0.04440567  0.00373481  0.01027632  0.00024799 -0.03195719
 -0.05669115  0.03157689 -0.02144245]


In [38]:
model_l = Lasso()
model_l.fit(X_train, y_train)

In [39]:
y_pred = model_l.predict(X_test)

In [40]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 0.018815233692681517
R² Score: -0.14701902501509778
Intercept: 0.6874835120843216
Coefficients: [ 0. -0. -0. -0. -0.  0. -0.  0. -0.  0. -0.  0. -0.  0. -0.]


## Decision Tree Regressor

In [41]:
from sklearn.tree import DecisionTreeRegressor


In [42]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [43]:
y_pred = model_l.predict(X_test)

In [44]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 0.018815233692681517
R² Score: -0.14701902501509778


## Random Forest

In [45]:
from sklearn.ensemble import RandomForestRegressor


In [46]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [47]:
y_pred = model_rf.predict(X_test)

In [48]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 0.0016363035765690726
R² Score: 0.9002472483902757


## XGBoost


In [49]:
from xgboost import XGBRegressor

In [50]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [51]:
y_pred = model_xgb.predict(X_test)

In [52]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 2.8527100338606883e-07
R² Score: 0.999982609237094


In [53]:
model_xgb.feature_importances_

array([0.01044624, 0.11264598, 0.03513302, 0.0599172 , 0.04639675,
       0.11331137, 0.00075454, 0.14199468, 0.04127545, 0.02826564,
       0.04949925, 0.01350849, 0.25210485, 0.0027319 , 0.09201462],
      dtype=float32)

## Statisical Models Test

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

In [58]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

In [59]:
model = LinearRegression()
scores = cross_val_score(model, X, y1, cv=10, scoring='neg_mean_squared_error')
print("neg MSE:", np.mean(scores))

neg MSE: -0.13587288534609132


In [60]:
import statsmodels.api as sm

In [61]:
y2 = y2 * 100

In [62]:
X = sm.add_constant(X)
model = sm.OLS(y2.reset_index()[0], X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.572
Model:                            OLS   Adj. R-squared:                  0.403
Method:                 Least Squares   F-statistic:                     3.384
Date:                Wed, 30 Jul 2025   Prob (F-statistic):            0.00119
Time:                        15:43:32   Log-Likelihood:                -197.54
No. Observations:                  54   AIC:                             427.1
Df Residuals:                      38   BIC:                             458.9
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------