## TOD-ON-MAIN models

This notebook is a summary of all the models used for predicting the number of low_income_movers out given the changes in the area over a 10 year period

## 0. Setup

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [41]:
RANDOM_STATE = 53

In [42]:
df = pd.read_csv('../data/tod-on-main.csv')

In [43]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [44]:
before_df = df.filter(regex=r'^(Before)', axis=1)
before_df = before_df.drop(["Before 4 bedrooms", "Before 5 or more bedrooms", 'Before No bedrooms', 'Before 1 bedroom', "Before Apartment"], axis=1)

after_df = df.filter(regex=r'^(After)', axis=1)
after_df = after_df.drop(['After No bedrooms', 'After 1 bedroom', 'After Other attached dwelling'], axis=1)

before_df_changed = before_df.copy()
after_df_changed = after_df.copy()
    
before_df_changed.columns = before_df_changed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_df_changed.columns = after_df_changed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [45]:
difference_df = after_df_changed - before_df_changed
difference_df["n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"]

## 1. Baseline Model

This is a super simple model that has removed columns with over 20 missing rows and imputed NaN values with mean. It also standarizes the training data before training

In [7]:
nan_counts = difference_df.isnull().sum()
columns_to_drop = nan_counts[nan_counts > 20].index
baseline_model_df = difference_df.drop(columns=columns_to_drop)
baseline_model_df = baseline_model_df.fillna(baseline_model_df.mean())

In [8]:
scaler = StandardScaler()
scaled_baseline_model_df = pd.DataFrame(scaler.fit_transform(baseline_model_df), columns=baseline_model_df.columns)

In [9]:
y = baseline_model_df["n_movers_out_Lowincome"]
X = scaled_baseline_model_df.drop(columns=["n_movers_out_Lowincome"])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [11]:
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)

In [12]:
y_pred = baseline_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [13]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", baseline_model.intercept_)
print("Coefficients:", baseline_model.coef_)

Mean Squared Error (MSE): 395839691.40998524
R² Score: -1.0207442566023937
Intercept: 18373.394287709325
Coefficients: [  9714.94291922  38135.64045911   -562.49519293  -1261.81669812
  -1260.83179658   2082.43087854   2487.50033352  11075.21062012
  -8195.7629623  -17791.92387652 -31351.36631616]


This does not perform particularly well

## 2. EDA

The EDA done here is to remove all columns with almost no values and columns that will cause collinearity (e.g. "Other dwelling"). It adds a column called average number of bedrooms so that all rows have this value, then it removes all other bedroom information to remove collinearity.

In [46]:
before_df = df.filter(regex=r'^(Before)', axis=1)
after_df = df.filter(regex=r'^(After)', axis=1)

In [47]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

calculating_average_bedrooms(before_df, True)
calculating_average_bedrooms(after_df, False)
warnings.filterwarnings('ignore')

In [48]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)

cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [49]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house"], axis=1)

# Remove those with very few values and those that will cause collinearity (e.g. adding all 'others' will sum to total dwellings)

### 2A. Simple KKN Imputer

This is the same as the baseline model except it does a K-Nearest-Neighbor to impute the missing values in each row. It takes the 3 most similar rows to it and calculates average value between those 3. (Suggestion for improvement: use a separate KNN model for 'before' values and 'after' values)

In [29]:
# Standardizes the data and then uses a KKN imputer to fill misisng values

def knn_imputation(df):
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(scaled_df), columns=scaled_df.columns)
    return imputed_df

before_imputed = knn_imputation(before_df)
after_imputed = knn_imputation(after_df)

before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

simple_KNN_imputer_df = after_imputed - before_imputed

In [30]:
y = df["n_movers_out_Lowincome"]
X = simple_KNN_imputer_df

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [32]:
simple_KNN_imputer_model = LinearRegression()
simple_KNN_imputer_model.fit(X_train, y_train)

In [33]:
y_pred = simple_KNN_imputer_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [34]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", simple_KNN_imputer_model.intercept_)
print("Coefficients:", simple_KNN_imputer_model.coef_)

Mean Squared Error (MSE): 210528752.9317998
R² Score: -0.0747400464597956
Intercept: 18645.149282621947
Coefficients: [  20902.59705808   21229.3611383  -264609.4487143    -2021.28265436
   -6035.13371278     504.15086149    5884.75413319    3337.73602541
   27644.99720648   -4302.84205848    -835.43117687  107329.94196776
  119121.00302275]


### 2B. Other Imputers

This model also imputes the total number of dewllings by doing a linear regression between the number of total occupied dwellings, the number of tenants that own a dwelling, and the number of tenants that rent a dwelling

In [50]:
# Adding total number of dwellings using linear regression

def impute_dwellings(df, before=True):
    word = "Before" if before else "After"
    df_known = df[df[f'{word} Dwellings'].notna()]
    df_missing = df[df[f'{word} Dwellings'].isna()]
    X_train = df_known[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_train = df_known[f"{word} Dwellings"]
    model = LinearRegression()
    model.fit(X_train, y_train)
    X_missing = df_missing[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_pred = model.predict(X_missing)
    df.loc[df[f'{word} Dwellings'].isna(), f'{word} Dwellings'] = y_pred

impute_dwellings(before_df)
impute_dwellings(after_df, False)

In [53]:
before_imputed = knn_imputation(before_df)
after_imputed = knn_imputation(after_df)

before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

other_KNN_imputer_df = after_imputed - before_imputed

In [54]:
y = df["n_movers_out_Lowincome"]
X = other_KNN_imputer_df

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [56]:
other_imputer_model = LinearRegression()
other_imputer_model.fit(X_train, y_train)

In [57]:
y_pred = other_imputer_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [58]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", other_imputer_model.intercept_)
print("Coefficients:", other_imputer_model.coef_)

Mean Squared Error (MSE): 210528752.9317998
R² Score: -0.0747400464597956
Intercept: 18645.149282621947
Coefficients: [  20902.59705808   21229.3611383  -264609.4487143    -2021.28265436
   -6035.13371278     504.15086149    5884.75413319    3337.73602541
   27644.99720648   -4302.84205848    -835.43117687  107329.94196776
  119121.00302275]


## 3. Adding Synthetic Data

The following models both add synthetic data so that there is more data to train on

In [31]:
synthetic_model_data = after_imputed - before_imputed
synthetic_model_data["n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"].reset_index(drop = True)

### 3A. Using Bootstrapping

In [32]:
synthetic_model_data_1 = synthetic_model_data.sample(frac=5.0, replace=True, random_state=RANDOM_STATE)
synthetic_model_data_1.head()

Unnamed: 0,Change Population Density per square kilometre,Change Dwellings,Change Total Occupied Private Dwellings,Change Single-detached house,Change Semi-detached house,Change Row house,"Change Apartment, duplex","Change Apartment, building that has fewer than five storeys","Change Apartment, building that has five or more storeys",Change Movable dwelling,Change Average number of bedrooms per dwelling,Change Owned,Change Rented,n_movers_out_Lowincome
25,-1.216285,-0.267625,-0.42236,-1.08898,-1.814527,-1.954981,-1.133684,1.859494,-0.259102,0.067816,-5.359267,-0.246316,-0.486,22470.0
29,-0.474351,-1.285569,-1.473055,-3.308053,-0.672111,-3.813507,0.027304,-0.081414,-0.520533,0.067816,-0.510741,-2.518538,-0.722204,15970.0
37,-0.072562,0.162419,0.130951,-0.168682,-0.148207,-0.345647,-0.125643,-0.059243,0.133236,-0.916657,-0.081357,0.159224,0.083073,21165.0
61,-0.155135,0.373563,0.369433,-0.121095,-0.136343,-0.476389,-0.016107,-0.200647,0.424652,0.067816,-1.316161,0.531509,0.250655,30985.0
22,0.19053,0.518249,0.532229,-0.673047,-0.026666,0.75485,0.125492,-0.040554,0.739349,1.24233,2.23643,0.517987,0.49681,21540.0


In [33]:
y = synthetic_model_data_1["n_movers_out_Lowincome"]
X = synthetic_model_data_1.drop(columns=["n_movers_out_Lowincome"])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [35]:
bootstrapped_model = LinearRegression()
bootstrapped_model.fit(X_train, y_train)

In [36]:
y_pred = bootstrapped_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [37]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", bootstrapped_model.intercept_)
print("Coefficients:", bootstrapped_model.coef_)

Mean Squared Error (MSE): 128696525.87105072
R² Score: 0.4813658373158677
Intercept: 17137.825553869974
Coefficients: [ 1.10244945e+04  2.60122501e+04 -2.32611157e+05 -2.41770319e+03
 -7.15097756e+03  1.07481414e+03  5.05590103e+03 -2.49699782e+03
  1.93328007e+04  1.91736854e+02 -5.23845104e+02  8.99099124e+04
  1.10144391e+05]


### 3B. Using Gaussian Noise

In [38]:
X_original = synthetic_model_data.drop('n_movers_out_Lowincome', axis=1)
y_original = synthetic_model_data['n_movers_out_Lowincome']

# Add noise
X_noise = X_original + np.random.normal(0, 0.01, X_original.shape)
y_noise = y_original + np.random.normal(0, 0.01, y_original.shape)

In [39]:
X_synthetic_model_data_2 = pd.concat([X_original, X_noise], ignore_index = True)
y_synthetic_model_data_2 = pd.concat([y_original, y_noise], ignore_index = True)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_synthetic_model_data_2, y_synthetic_model_data_2, test_size=0.25, random_state=RANDOM_STATE)

In [41]:
gaussian_noise_model = LinearRegression()
gaussian_noise_model.fit(X_train, y_train)

In [42]:
y_pred = gaussian_noise_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [43]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", gaussian_noise_model.intercept_)
print("Coefficients:", gaussian_noise_model.coef_)

Mean Squared Error (MSE): 69852165.19938698
R² Score: 0.5449962995295057
Intercept: 17794.495849100145
Coefficients: [ 1.95069169e+04 -3.22250184e+01 -7.59802747e+04 -4.38942645e+03
 -6.28917184e+03  1.84217042e+03  1.86277133e+03 -4.06991890e+03
  1.34588791e+04 -2.83772214e+02 -2.41619180e+03  3.70416226e+04
  3.22608117e+04]


## 4. Testing Many Models

After testing many models in other notebooks, these are the ones that performed the best. (Suggestion: right now, each model builds on previous models, perhaps look into building them separately, e.g. add synthetic data without doing the KNN imputation).

### 4A. Random Forest Regressor with other imputer dataset

In [44]:
# Using the other imputer model and a random_forest_regressor

y = df["n_movers_out_Lowincome"]
X = other_KNN_imputer_df

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [46]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=53)
model_rf.fit(X_train, y_train)

In [47]:
y_pred = model_rf.predict(X_test)

In [48]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 89129416.19517858
R² Score: 0.5449982410071469


### 4B. Gradient Boosting model with other imputer dataset

In [49]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [50]:
y_pred = model_xgb.predict(X_test)

In [51]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 164869932.33453777
R² Score: 0.15834622934167353


### 4C. (Best Model) Random Forest Regressor with synthesized data (Bootstrapping)

In [52]:
y = synthetic_model_data_1["n_movers_out_Lowincome"]
X = synthetic_model_data_1.drop(columns=["n_movers_out_Lowincome"])

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [54]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
model_rf.fit(X_train, y_train)

In [55]:
y_pred = model_rf.predict(X_test)

In [56]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 8759505.00033088
R² Score: 0.96470006854788


### 4D. Random Forest Regressor with synthesized data (Gaussian Noise)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_synthetic_model_data_2, y_synthetic_model_data_2, test_size=0.25, random_state=RANDOM_STATE)

In [58]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
model_rf.fit(X_train, y_train)

In [59]:
y_pred = model_rf.predict(X_test)

In [60]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 50615083.25202682
R² Score: 0.6703029875515376


## 5. Evaluation

Some things we can do in the future
- Look into the evaluation metrics of each model
- Try out some more models
- Do some hyperparameter tuning (autoML)
- Try to build better neural networks
- Think about the explainability
- Other data we should use?
- Other target values we should us
- Try to do bootstrapping, but separate test/train before