## Imports & Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## EDA

In [33]:
df = pd.read_csv('../data/tod-on-main.csv')

In [34]:
before_df = df.filter(regex=r'^(Before)', axis=1)
before_df = before_df.drop(["Before 4 bedrooms", "Before 5 or more bedrooms", 'Before No bedrooms', 'Before 1 bedroom', "Before Apartment"], axis=1)

In [35]:
after_df = df.filter(regex=r'^(After)', axis=1)
after_df = after_df.drop(['After No bedrooms', 'After 1 bedroom', 'After Other attached dwelling'], axis=1)

In [36]:
before_df.columns = before_df.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_df.columns = after_df.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [37]:
difference_df = after_df - before_df

In [38]:
difference_df["n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"]

In [39]:
df["n_movers_out_Lowincome"] / (df["n_movers_out_Lowincome"] + df["n_stayers_Lowincome"])

0     0.500874
1     0.466426
2     0.611448
3     0.713208
4     0.598786
        ...   
65    0.737852
66    0.519781
67    0.860560
68    0.458611
69    0.519352
Length: 70, dtype: float64

In [40]:
difference_df["%n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"] / (df["n_movers_out_Lowincome"] + df["n_stayers_Lowincome"])

In [41]:
difference_df.isnull().sum()


Change Population Density per square kilometre                  0
Change Dwellings                                               28
Change Total Occupied Private Dwellings                         0
Change Single-detached house                                    0
Change Semi-detached house                                      3
Change Row house                                                3
Change Apartment, duplex                                        3
Change Apartment, building that has fewer than five storeys     3
Change Apartment, building that has five or more storeys        1
Change Other dwelling                                          68
Change Other single-attached house                             24
Change Movable dwelling                                         0
Change 0 to 1 bedroom                                          64
Change 2 bedrooms                                              64
Change 3 bedrooms                                              64
Change 4 o

In [42]:
nan_counts = difference_df.isnull().sum()


In [43]:
columns_to_drop = nan_counts[nan_counts > 20].index

In [44]:
cleaned_df = difference_df.drop(columns=columns_to_drop)

In [45]:
cleaned_df

Unnamed: 0,Change Population Density per square kilometre,Change Total Occupied Private Dwellings,Change Single-detached house,Change Semi-detached house,Change Row house,"Change Apartment, duplex","Change Apartment, building that has fewer than five storeys","Change Apartment, building that has five or more storeys",Change Movable dwelling,Change Owned,Change Rented,n_movers_out_Lowincome,%n_movers_out_Lowincome
0,1095.597195,61.677499,-42.539940,-9.600507,1.238730,38.765488,57.576281,-0.464939,-0.005523,3.789884,96.124023,4300.0,0.500874
1,-779.491684,89.694156,0.510533,-11.826668,42.810963,2.103387,59.605595,0.000000,0.000000,9.125176,54.644013,3230.0,0.466426
2,25284.884324,-22.168435,-329.050186,140.893582,-37.492917,16.010693,196.012260,0.192841,-9.587018,-138.753774,127.887968,3525.0,0.611448
3,-54.058051,72.202927,4.969385,,,,,13.926568,0.000000,76.710062,-2.803474,7560.0,0.713208
4,18877.740252,-40.566288,-77.291407,-36.450202,49.718430,10.001887,109.209702,0.000000,-91.828328,39.130931,-84.510315,6410.0,0.598786
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,11680.829979,-188.049194,-67.607051,-17.698188,111.544572,62.970222,-166.473499,-93.101270,-4.177069,35.066295,-240.312060,6150.0,0.737852
66,136645.143559,6448.940354,-7.523282,-0.024781,122.743024,-4.739731,-214.872238,6542.339135,-5.000000,2555.753834,4257.012461,40400.0,0.519781
67,228103.384518,1145.759719,-262.354332,42.037694,93.541688,-106.739942,1020.346489,359.946458,0.000000,298.131957,819.841384,59895.0,0.860560
68,34677.052585,3002.637908,-5.000000,0.000000,49.852178,-29.009326,-297.934437,3269.791124,0.000000,1629.854442,1602.970045,31690.0,0.458611


In [46]:
cleaned_df = cleaned_df.fillna(cleaned_df.mean())

## Baseline Model
This is a super simple model that has removed columns with over 20 missing rows and imputed NaN values with mean. 

In [50]:
y = cleaned_df["%n_movers_out_Lowincome"]
X = cleaned_df.drop(columns=["n_movers_out_Lowincome", "%n_movers_out_Lowincome"])

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53)

In [52]:
model = LinearRegression()
model.fit(X_train, y_train)

In [53]:
y_pred = model.predict(X_test)

In [54]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [55]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 0.016803526624756152
R² Score: -0.3765127883307351
Intercept: 0.6188715666617479
Coefficients: [ 7.36481919e-07  5.47193826e-04 -1.04147067e-04 -2.65887813e-06
  4.85982539e-04  3.59767073e-04  8.27042879e-05  5.27928674e-05
 -9.77734283e-04 -5.65183602e-04 -6.31263241e-04]


## Baseline with Standarization
(No change)

In [56]:
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(cleaned_df), columns=cleaned_df.columns)

In [63]:
y = cleaned_df["%n_movers_out_Lowincome"]
X = cleaned_df.drop(columns=["n_movers_out_Lowincome", "%n_movers_out_Lowincome"])

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53)

In [65]:
model = LinearRegression()
model.fit(X_train, y_train)

In [66]:
y_pred = model.predict(X_test)

In [67]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [68]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 0.016803526624756152
R² Score: -0.3765127883307351
Intercept: 0.6188715666617479
Coefficients: [ 7.36481919e-07  5.47193826e-04 -1.04147067e-04 -2.65887813e-06
  4.85982539e-04  3.59767073e-04  8.27042879e-05  5.27928674e-05
 -9.77734283e-04 -5.65183602e-04 -6.31263241e-04]


In [69]:
X_train.columns

Index(['Change Population Density per square kilometre',
       'Change Total Occupied Private Dwellings',
       'Change Single-detached house', 'Change Semi-detached house',
       'Change Row house', 'Change Apartment, duplex',
       'Change Apartment, building that has fewer than five storeys',
       'Change Apartment, building that has five or more storeys',
       'Change Movable dwelling', 'Change Owned', 'Change Rented'],
      dtype='object')