## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso


## EDA

In [2]:
df = pd.read_csv('../data/tod-on-main.csv')

In [3]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [4]:
before_df = df.filter(regex=r'^(Before)', axis=1)
# before_df = before_df.drop(["Before 4 bedrooms", "Before 5 or more bedrooms", 'Before No bedrooms', 'Before 1 bedroom', "Before Apartment"], axis=1)

In [5]:
after_df = df.filter(regex=r'^(After)', axis=1)
# after_df = after_df.drop(['After No bedrooms', 'After 1 bedroom', 'After Other attached dwelling'], axis=1)

In [6]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [7]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [8]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [9]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling"], axis=1)

## Missing Values

In [10]:
after_df.isnull().sum().sort_values(ascending=False)

After Dwellings                                               3
After Other single-attached house                             3
After Semi-detached house                                     1
After Row house                                               1
After Apartment, duplex                                       1
After Apartment, building that has fewer than five storeys    1
After Average number of bedrooms per dwelling                 1
After Population Density per square kilometre                 0
After Total Occupied Private Dwellings                        0
After Single-detached house                                   0
After Apartment, building that has five or more storeys       0
After Movable dwelling                                        0
After Owned                                                   0
After Rented                                                  0
dtype: int64

In [11]:
before_df.isnull().sum().sort_values(ascending=False)

Before Dwellings                                               28
Before Other single-attached house                             24
Before Apartment, building that has fewer than five storeys     3
Before Average number of bedrooms per dwelling                  3
Before Semi-detached house                                      2
Before Row house                                                2
Before Apartment, duplex                                        2
Before Apartment, building that has five or more storeys        1
Before Population Density per square kilometre                  0
Before Total Occupied Private Dwellings                         0
Before Single-detached house                                    0
Before Movable dwelling                                         0
Before Owned                                                    0
Before Rented                                                   0
dtype: int64

## Imputing with KNN

In [12]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [13]:
# This is just in case you want to check how reasonable it is
# df_unscaled = pd.DataFrame(scaler.inverse_transform(imputed_df), columns=imputed_df.columns)
# df_unscaled.head()

In [14]:
def kkn_imputation(df):
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(scaled_df), columns=scaled_df.columns)
    return imputed_df

In [15]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [16]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [26]:
difference_df = after_imputed - before_imputed

## Generating Synthetic Data

In [51]:
training_df = difference_df.copy()

In [52]:
training_df["n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"].reset_index(drop = True)


### Method 1: Using Bootstrapping

In [57]:
df_synthetic = training_df.sample(frac=5.0, replace=True, random_state=53)
df_synthetic.head()

Unnamed: 0,Change Population Density per square kilometre,Change Dwellings,Change Total Occupied Private Dwellings,Change Single-detached house,Change Semi-detached house,Change Row house,"Change Apartment, duplex","Change Apartment, building that has fewer than five storeys","Change Apartment, building that has five or more storeys",Change Other single-attached house,Change Movable dwelling,Change Average number of bedrooms per dwelling,Change Owned,Change Rented,n_movers_out_Lowincome
25,-1.216285,0.119919,-0.42236,-1.08898,-1.814527,-1.954981,-1.133684,1.859494,-0.259102,-5.705208,0.067816,-5.359267,-0.246316,-0.486,22470.0
29,-0.474351,-0.2014,-1.473055,-3.308053,-0.672111,-3.813507,0.027304,-0.081414,-0.520533,0.085122,0.067816,-0.510741,-2.518538,-0.722204,15970.0
37,-0.072562,0.20966,0.130951,-0.168682,-0.148207,-0.345647,-0.125643,-0.059243,0.133236,0.018652,-0.916657,-0.081357,0.159224,0.083073,21165.0
61,-0.155135,0.597365,0.369433,-0.121095,-0.136343,-0.476389,-0.016107,-0.200647,0.424652,0.018652,0.067816,-1.316161,0.531509,0.250655,30985.0
22,0.19053,0.739089,0.532229,-0.673047,-0.026666,0.75485,0.125492,-0.040554,0.739349,0.018652,1.24233,2.23643,0.517987,0.49681,21540.0


## Simple Model

In [64]:
y_test = df["n_movers_out_Lowincome"]
X_test = difference_df

In [65]:
y_train = df_synthetic["n_movers_out_Lowincome"]
X_train = df_synthetic.drop(columns=["n_movers_out_Lowincome"], axis=1)

In [67]:
model = LinearRegression()
model.fit(X_train, y_train)

In [68]:
y_pred = model.predict(X_test)

In [69]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [70]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 93037669.02289641
R² Score: 0.5322777885558856
Intercept: 12906.495385024064
Coefficients: [   9812.1692379    19837.62201541 -136105.31086428    -752.43831557
   -8820.38345368    1397.21472455    5575.88273298     783.47949936
   19873.30421215    1823.33163455   -1116.60209243   -1387.50323978
   54084.22236631   58806.72934345]


## Ridge & Lasso Model

In [71]:
model_r = Ridge()
model_r.fit(X_train, y_train)

In [72]:
y_pred = model_r.predict(X_test)

In [73]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 97098636.80431286
R² Score: 0.5118623498279445
Intercept: 12876.187798019419
Coefficients: [ 12485.60616137  20354.57812584 -16000.28814408  -3469.64968813
  -8178.50504949   1219.31917588   3918.70001107  -2607.86572777
   7611.42216964   1882.07481419   -734.95027736  -2566.67286426
   8318.61584482  -5545.18912119]


In [75]:
model_l = Lasso()
model_l.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [76]:
y_pred = model_l.predict(X_test)

In [77]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 92811520.03132193
R² Score: 0.5334146926460885
Intercept: 12864.210834839118
Coefficients: [   9994.02374782   20061.84136125 -127082.77302122    -900.92394269
   -8784.84380271    1386.78215079    5464.92719363     552.95535534
   19119.73804201    1827.86071445   -1098.25435895   -1471.17971374
   50543.60577387   53813.88686878]


## Decision Tree Regressor

In [78]:
from sklearn.tree import DecisionTreeRegressor


In [79]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [80]:
y_pred = model_l.predict(X_test)

In [81]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 92811520.03132193
R² Score: 0.5334146926460885


## Random Forest

In [82]:
from sklearn.ensemble import RandomForestRegressor


In [83]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [84]:
y_pred = model_rf.predict(X_test)

In [85]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 514600.39312499994
R² Score: 0.997412982973346


## XGBoost


In [86]:
from xgboost import XGBRegressor

In [87]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [88]:
y_pred = model_xgb.predict(X_test)

In [89]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 5.6037131477804745e-06
R² Score: 0.9999999999999718


## Neural Network

In [90]:
from sklearn.neural_network import MLPRegressor

In [91]:
model_nn = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=10000)
model_nn.fit(X_train, y_train)



In [92]:
y_pred = model_nn.predict(X_test)

In [93]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 22854.09279887315
R² Score: 0.9998851071083713


### Method 2: Guassian Noise

In [110]:
training_df = difference_df.copy()
training_df["n_movers_out_Lowincome"] = df["n_movers_out_Lowincome"].reset_index(drop = True)

In [111]:
# Assuming df has features X and target y
X_original = training_df.drop('n_movers_out_Lowincome', axis=1)
y_original = training_df['n_movers_out_Lowincome']

# Add noise
X_noise = X_original + np.random.normal(0, 0.01, X_original.shape)
y_noise = y_original + np.random.normal(0, 0.01, y_original.shape)

In [112]:
X = pd.concat([X_original, X_noise], ignore_index = True)
y = pd.concat([y_original, y_noise], ignore_index = True)

In [113]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=53)

## Simple Model

In [115]:
model = LinearRegression()
model.fit(X_train, y_train)

In [117]:
y_pred = model.predict(X_test)

In [118]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [119]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 92025651.02343367
R² Score: 0.537365440704753
Intercept: 11547.956547532189
Coefficients: [   8900.72529172   29960.90243341 -105524.8684135      622.98091699
   -8257.03891594    2586.70539004    3390.48253        896.97121804
   19231.866335      1748.11319062    -982.48102285   -1461.03733741
   36421.30798873   42501.8530614 ]


## Ridge & Lasso Model

In [120]:
model_r = Ridge()
model_r.fit(X_train, y_train)

In [121]:
y_pred = model_r.predict(X_test)

In [122]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 96965129.9218996
R² Score: 0.5125335202790224
Intercept: 12956.257322314596
Coefficients: [12034.06660034 22164.57416361 -6980.87007178 -2986.65078432
 -7132.11834602  2761.03094779  1905.7438665  -2534.25219796
  5631.51673158  1768.9942373   -616.10459069 -2295.96965706
  1595.72119853 -6828.70445499]


In [123]:
model_l = Lasso()
model_l.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [124]:
y_pred = model_l.predict(X_test)

In [125]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 91963711.56217071
R² Score: 0.5376768249225847
Intercept: 11502.164436202027
Coefficients: [  9021.02696458  30061.3522665  -98210.07268937    483.50275893
  -8200.35060239   2610.08654193   3356.16427056    740.04212702
  18566.30249114   1752.28088237   -975.19365234  -1517.90283676
  33542.41752743  38410.59106956]


## Decision Tree Regressor

In [128]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [129]:
y_pred = model_l.predict(X_test)

In [130]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 91963711.56217071
R² Score: 0.5376768249225847


## Random Forest

In [131]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [132]:
y_pred = model_rf.predict(X_test)

In [133]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 16704364.442614341
R² Score: 0.9160232370402009


## XGBoost


In [134]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [135]:
y_pred = model_xgb.predict(X_test)

In [140]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 151584426.55579636
R² Score: 0.23794948912878733
Intercept: [17486.227]


AttributeError: Coefficients are not defined for Booster type None

## Neural Network

In [137]:
model_nn = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=10000)
model_nn.fit(X_train, y_train)



In [138]:

y_pred = model_nn.predict(X_test)

In [139]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 151584426.55579636
R² Score: 0.23794948912878733
