## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso


## EDA

In [2]:
df = pd.read_csv('../data/tod-on-main.csv')

In [3]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [4]:
before_df = df.filter(regex=r'^(Before)', axis=1)
# before_df = before_df.drop(["Before 4 bedrooms", "Before 5 or more bedrooms", 'Before No bedrooms', 'Before 1 bedroom', "Before Apartment"], axis=1)

In [5]:
after_df = df.filter(regex=r'^(After)', axis=1)
# after_df = after_df.drop(['After No bedrooms', 'After 1 bedroom', 'After Other attached dwelling'], axis=1)

In [6]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [7]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [8]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [11]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling"], axis=1)

## Missing Values

In [12]:
after_df.isnull().sum().sort_values(ascending=False)

After Dwellings                                               3
After Other single-attached house                             3
After Semi-detached house                                     1
After Row house                                               1
After Apartment, duplex                                       1
After Apartment, building that has fewer than five storeys    1
After Average number of bedrooms per dwelling                 1
After Population Density per square kilometre                 0
After Total Occupied Private Dwellings                        0
After Single-detached house                                   0
After Apartment, building that has five or more storeys       0
After Movable dwelling                                        0
After Owned                                                   0
After Rented                                                  0
dtype: int64

In [13]:
before_df.isnull().sum().sort_values(ascending=False)

Before Dwellings                                               28
Before Other single-attached house                             24
Before Apartment, building that has fewer than five storeys     3
Before Average number of bedrooms per dwelling                  3
Before Semi-detached house                                      2
Before Row house                                                2
Before Apartment, duplex                                        2
Before Apartment, building that has five or more storeys        1
Before Population Density per square kilometre                  0
Before Total Occupied Private Dwellings                         0
Before Single-detached house                                    0
Before Movable dwelling                                         0
Before Owned                                                    0
Before Rented                                                   0
dtype: int64

## Imputing with KNN

In [14]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [15]:
# This is just in case you want to check how reasonable it is
# df_unscaled = pd.DataFrame(scaler.inverse_transform(imputed_df), columns=imputed_df.columns)
# df_unscaled.head()

In [16]:
def kkn_imputation(df):
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(scaled_df), columns=scaled_df.columns)
    return imputed_df

In [17]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [18]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [19]:
difference_df = after_imputed - before_imputed

## Simple Model

In [21]:
y = df["n_movers_out_Lowincome"]
X = difference_df

In [22]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53)

In [23]:
model = LinearRegression()
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)

In [25]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [26]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 140534946.83296606
R² Score: 0.2825752625941058
Intercept: 12760.917881451107
Coefficients: [  14466.22583806   23853.21007969 -168920.53149597     833.36682455
  -10764.76981968    2096.19706216    4641.78460242    5168.31002196
   25697.013035      2817.68577827   -3813.78681725   -1794.61565274
   66580.26828108   69003.68039518]


## Ridge & Lasso Model

In [27]:
model_r = Ridge()
model_r.fit(X_train, y_train)

In [28]:
y_pred = model_r.predict(X_test)

In [29]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 138124621.99055585
R² Score: 0.29487986515808107
Intercept: 13705.361267597444
Coefficients: [12777.95988997 16543.15723453 -3503.96374469 -4610.75232274
 -6607.39278377  1787.67093359   316.090002    -164.9615842
  2383.50528873  2587.22003213 -3396.31608001 -3116.98959186
  4205.32301619 -4852.16749313]


In [30]:
model_l = Lasso()
model_l.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [31]:
y_pred = model_l.predict(X_test)

In [32]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 134186056.56825872
R² Score: 0.3149860688285878
Intercept: 12621.384309719811
Coefficients: [  14432.67774331   24178.61510022 -151453.90197205     434.21254651
  -10582.85174901    2145.608823      4353.41299832    4757.30161246
   23818.86447554    2873.53208104   -3794.7772053    -1993.44975853
   59601.11795375   60224.34561863]


## Decision Tree Regressor

In [33]:
from sklearn.tree import DecisionTreeRegressor


In [34]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [35]:
y_pred = model_l.predict(X_test)

In [36]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 134186056.56825872
R² Score: 0.3149860688285878


## Random Forest

In [37]:
from sklearn.ensemble import RandomForestRegressor


In [38]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [39]:
y_pred = model_rf.predict(X_test)

In [40]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 100861403.80696428
R² Score: 0.48510695900710143


## XGBoost


In [41]:
from xgboost import XGBRegressor

In [42]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [43]:
y_pred = model_xgb.predict(X_test)

In [44]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 109370672.16748682
R² Score: 0.4416675173832324


## Neural Network

In [45]:
from sklearn.neural_network import MLPRegressor

In [46]:
model_nn = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=10000)
model_nn.fit(X_train, y_train)



In [47]:
y_pred = model_nn.predict(X_test)

In [48]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 829713333.6675493
R² Score: -3.2356501634864383
