## Setup

In [216]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold


## EDA

In [217]:
df = pd.read_csv('../data/tod-on-main.csv')

In [218]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [219]:
df = df.dropna(subset=['n_stayers'])

## New Model EDA

In [220]:
original_population = df["n_movers_out"] + df["n_stayers"]
original_low_income_population = df["n_movers_out_Lowincome"] + df["n_stayers_Lowincome"]

In [221]:
y1 = df["n_movers_out"] / original_population
y2 = df["n_movers_out_Lowincome"] / original_low_income_population

In [222]:
before_df = df.filter(regex=r'^(Before)', axis=1)
after_df = df.filter(regex=r'^(After)', axis=1)

In [223]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [224]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [225]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [226]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house"], axis=1)

## Missing Values

In [227]:
def impute_dwellings(df, before=True):
    word = "Before" if before else "After"
    df_known = df[df[f'{word} Dwellings'].notna()]
    df_missing = df[df[f'{word} Dwellings'].isna()]
    X_train = df_known[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_train = df_known[f"{word} Dwellings"]
    model = LinearRegression()
    model.fit(X_train, y_train)
    X_missing = df_missing[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_pred = model.predict(X_missing)
    df.loc[df[f'{word} Dwellings'].isna(), f'{word} Dwellings'] = y_pred

In [228]:
impute_dwellings(before_df)
impute_dwellings(after_df, False)

## Imputing with KNN

In [229]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [230]:
# These aren't great, but if I get rid of it it's only 3 rows that are removed

def kkn_imputation(df):
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    return imputed_df

In [231]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [232]:
combined_df = pd.concat([before_imputed,after_imputed], axis = 1)

In [233]:
before_imputed_corr = before_imputed.copy()
after_imputed_corr = after_imputed.copy()

In [234]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [235]:
difference_df = after_imputed - before_imputed

## Looking at Correlation

In [236]:
difference_corr = difference_df.copy()

difference_corr["%n_movers_out"] = y1
before_imputed_corr["%n_movers_out"] = y1
after_imputed_corr["%n_movers_out"] = y1

difference_corr["%n_movers_out_Lowincome"] = y2
before_imputed_corr["%n_movers_out_Lowincome"] = y2
after_imputed_corr["%n_movers_out_Lowincome"] = y2

In [237]:
corr = difference_corr.corr()['%n_movers_out_Lowincome'].sort_values(ascending=False)
b_corr = before_imputed_corr.corr()['%n_movers_out_Lowincome'].sort_values(ascending=False)
a_corr = after_imputed_corr.corr()['%n_movers_out_Lowincome'].sort_values(ascending=False)

In [238]:
# Correlation of change columns
corr

%n_movers_out_Lowincome                                        1.000000
%n_movers_out                                                  0.956534
Change Semi-detached house                                     0.182056
Change Single-detached house                                   0.024313
Change Movable dwelling                                        0.020216
Change Average number of bedrooms per dwelling                 0.003933
Change Apartment, duplex                                      -0.039452
Change Apartment, building that has five or more storeys      -0.098551
Change Owned                                                  -0.127169
Change Row house                                              -0.152369
Change Total Occupied Private Dwellings                       -0.167065
Change Apartment, building that has fewer than five storeys   -0.168416
Change Dwellings                                              -0.178001
Change Rented                                                 -0

In [239]:
# Correlation of "Before" columns
b_corr

%n_movers_out_Lowincome                                        1.000000
%n_movers_out                                                  0.956534
Before Single-detached house                                  -0.034484
Before Row house                                              -0.036480
Before Movable dwelling                                       -0.073202
Before Semi-detached house                                    -0.076497
Before Apartment, building that has fewer than five storeys   -0.078602
Before Population Density per square kilometre                -0.101816
Before Apartment, duplex                                      -0.125275
Before Average number of bedrooms per dwelling                -0.139678
Before Apartment, building that has five or more storeys      -0.148384
Before Rented                                                 -0.159894
Before Dwellings                                              -0.178927
Before Total Occupied Private Dwellings                       -0

In [240]:
# Correlation of "After" columns
a_corr

%n_movers_out_Lowincome                                       1.000000
%n_movers_out                                                 0.956534
After Semi-detached house                                     0.033958
After Single-detached house                                  -0.022760
After Movable dwelling                                       -0.084982
After Apartment, duplex                                      -0.115514
After Row house                                              -0.129231
After Apartment, building that has fewer than five storeys   -0.130364
After Apartment, building that has five or more storeys      -0.130727
After Average number of bedrooms per dwelling                -0.146042
After Rented                                                 -0.190470
After Dwellings                                              -0.200417
After Total Occupied Private Dwellings                       -0.207220
After Owned                                                  -0.212590
After 

## Preparing Training Data

In [241]:
combined_df

Unnamed: 0,Before Population Density per square kilometre,Before Dwellings,Before Total Occupied Private Dwellings,Before Single-detached house,Before Semi-detached house,Before Row house,"Before Apartment, duplex","Before Apartment, building that has fewer than five storeys","Before Apartment, building that has five or more storeys",Before Movable dwelling,...,After Single-detached house,After Semi-detached house,After Row house,"After Apartment, duplex","After Apartment, building that has fewer than five storeys","After Apartment, building that has five or more storeys",After Movable dwelling,After Average number of bedrooms per dwelling,After Owned,After Rented
0,29557.199802,2165.500341,2098.182095,1036.355861,171.559124,232.989487,157.291638,408.144983,89.45694,0.081414,...,993.815921,161.958617,234.228216,196.057126,465.721264,88.992001,0.075891,2.958369,1250.364815,926.236364
1,10145.742414,1327.753873,1249.277296,1006.500775,107.212997,90.911065,0.0,42.502658,0.0,0.0,...,1007.011309,95.386329,133.722028,2.103387,102.108253,0.0,0.0,3.57035,1203.072199,109.974286
2,2126.62802,1654.715034,1542.772777,820.764067,78.968492,394.414058,17.42814,218.439627,0.0,9.587018,...,491.713881,219.862074,356.921142,33.438833,414.451887,0.192841,0.0,19.964445,886.394312,643.709179
3,3248.000589,1823.03749,1701.261038,950.684979,70.079055,169.462046,34.626913,374.362826,244.245214,0.0,...,955.654363,30.856989,86.5765,40.805173,401.268184,258.171781,0.0,3.806435,1070.279938,705.160154
4,1828.403352,1434.431075,1362.387422,960.920963,44.427922,148.984013,0.008759,117.349941,0.0,91.828328,...,883.629555,7.977719,198.702442,10.010645,226.559643,0.0,0.0,23.760836,1168.193247,149.384926
5,11504.057103,1307.697165,1284.715015,931.000872,76.948034,123.533997,13.474576,54.046761,89.419224,0.0,...,971.430056,104.344513,118.237337,12.615324,42.315994,87.236451,0.0,3.24886,1115.969126,218.492103
6,2156.400839,1662.613343,1573.678353,853.668615,97.09163,180.141741,0.0,316.990974,120.739483,0.0,...,781.89081,132.501053,410.733627,10.513666,382.797021,264.1735,0.0,20.469165,1222.883111,750.977519
7,5409.352545,805.7643,731.596286,422.626714,56.363281,49.549116,19.326577,190.037651,0.0,0.0,...,413.84627,56.336132,63.85912,22.881912,412.285193,3.442397,0.0,2.593038,772.264976,219.858205
8,12558.554503,367.922642,365.653846,285.614668,60.807883,2.949494,8.989262,4.979689,0.0,0.461726,...,312.535191,75.89938,0.797481,25.839836,1.53936,0.0,0.0,3.428903,317.387317,62.271107
9,47124.943595,1755.247936,1637.752466,1433.22128,0.0,156.731976,47.489713,0.0,0.0,0.0,...,1806.899375,19.088233,356.928558,335.981803,64.527636,0.0,0.0,3.320223,2059.244309,543.229468


In [242]:
combined_df["original_population"] = original_population.reset_index()[0]
combined_df["original_low_income_population"] = original_low_income_population.reset_index()[0]

In [243]:
def log_and_standardize(df):
    log_df = np.log(df + 1)
    standardized_df = (log_df - log_df.mean()) / log_df.std()
    return standardized_df

In [244]:
training_data = log_and_standardize(combined_df)

## Ridge & Lasso Model

In [245]:
X = training_data

In [246]:
# X = training_data[["Before Population Density per square kilometre", 
#              "Before Dwellings", 
#              "Before Average number of bedrooms per dwelling", 
#              "Before Apartment, building that has five or more storeys",
#              "Before Total Occupied Private Dwellings",
#              "After Population Density per square kilometre", 
#              "After Dwellings", 
#              "After Average number of bedrooms per dwelling", 
#              "After Apartment, building that has five or more storeys",
#              "After Total Occupied Private Dwellings",
#             ]]

In [247]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2, random_state=53)

In [250]:
kf = KFold(n_splits=5, shuffle=True, random_state=53)  # 5 folds
model = Ridge(alpha = 1000)

mse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y2.iloc[train_index], y2.iloc[val_index]

    # Train model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    mse_scores.append(mse)
    r2_scores.append(r2)

print("Mean MSE across folds:", sum(mse_scores) / len(mse_scores))
print("Mean R² across folds:", sum(r2_scores) / len(r2_scores))

Mean MSE across folds: 0.020069983599491994
Mean R² across folds: -0.05227676695254544


In [252]:
model_r = Ridge(alpha = 100)
model_r.fit(X_train, y_train)

In [253]:
y_pred = model_r.predict(X_test)

In [254]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 0.025095939278363844
R² Score: 0.1103653302541241
Intercept: 0.6830528032712408
Coefficients: [-1.30447718e-02 -4.67355794e-03 -3.94435839e-03  6.89796154e-03
 -1.09971833e-03 -3.58097414e-03  5.61586331e-04 -8.87476504e-04
 -1.77221363e-03 -3.72952707e-03 -1.15034664e-02 -2.78374548e-03
 -2.55418964e-03 -1.60570982e-03 -1.57526642e-03 -5.96069212e-04
  4.92329672e-03 -1.33533769e-03  5.08327112e-05  7.45964363e-03
 -2.38128937e-03  1.21714251e-03  1.15215038e-02  1.92634882e-02
  7.52292031e-04 -2.35782723e-03  9.72986965e-03  6.39513824e-03]


In [255]:
model_l = Lasso(alpha=100, max_iter = 10000)
model_l.fit(X_train, y_train)

In [256]:
y_pred = model_l.predict(X_test)

In [257]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 0.029335217286080902
R² Score: -0.03991430855611933
Intercept: 0.6874835120843216
Coefficients: [-0. -0. -0.  0.  0. -0.  0. -0. -0. -0. -0. -0. -0. -0. -0. -0.  0. -0.
 -0.  0. -0. -0.  0.  0. -0. -0.  0.  0.]


## Decision Tree Regressor

In [109]:
from sklearn.tree import DecisionTreeRegressor


In [110]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [111]:
y_pred = model_l.predict(X_test)

In [112]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 3319853492.5267296
R² Score: -0.422670052291934


## Random Forest

In [113]:
from sklearn.ensemble import RandomForestRegressor


In [114]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [115]:
y_pred = model_rf.predict(X_test)

In [116]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 3350593157.8358927
R² Score: -0.435843043615578


## XGBoost


In [117]:
from xgboost import XGBRegressor

In [118]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [119]:
y_pred = model_xgb.predict(X_test)

In [120]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 3214609359.969157
R² Score: -0.377569364594073
