## Setup

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold
from statsmodels.stats.outliers_influence import variance_inflation_factor

## EDA

In [2]:
df = pd.read_csv('../data/tod-on-main.csv')

In [3]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [4]:
df = df.dropna(subset=['n_stayers'])

## New Model EDA

In [5]:
original_population = df["n_movers_out"] + df["n_stayers"]
original_low_income_population = df["n_movers_out_Lowincome"] + df["n_stayers_Lowincome"]

In [6]:
y1 = df["n_movers_out"] / original_population
y2 = df["n_movers_out_Lowincome"] / original_low_income_population

In [7]:
before_df = df.filter(regex=r'^(Before)', axis=1)
after_df = df.filter(regex=r'^(After)', axis=1)

In [8]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [9]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [10]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [11]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house"], axis=1)

## Missing Values

In [12]:
def impute_dwellings(df, before=True):
    word = "Before" if before else "After"
    df_known = df[df[f'{word} Dwellings'].notna()]
    df_missing = df[df[f'{word} Dwellings'].isna()]
    X_train = df_known[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_train = df_known[f"{word} Dwellings"]
    model = LinearRegression()
    model.fit(X_train, y_train)
    X_missing = df_missing[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_pred = model.predict(X_missing)
    df.loc[df[f'{word} Dwellings'].isna(), f'{word} Dwellings'] = y_pred

In [13]:
impute_dwellings(before_df)
impute_dwellings(after_df, False)

## Imputing with KNN

In [14]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [15]:
# These aren't great, but if I get rid of it it's only 3 rows that are removed

def kkn_imputation(df):
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    return imputed_df

In [16]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [17]:
combined_df = pd.concat([before_imputed,after_imputed], axis = 1)

In [18]:
before_imputed_corr = before_imputed.copy()
after_imputed_corr = after_imputed.copy()

In [19]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [20]:
difference_df = after_imputed - before_imputed

## Preparing Training Data

In [31]:
combined_df["original_population"] = original_population.reset_index()[0]
combined_df["original_low_income_population"] = original_low_income_population.reset_index()[0]

In [32]:
def log_and_standardize(df):
    log_df = np.log(df + 1)
    standardized_df = (log_df - log_df.mean()) / log_df.std()
    return standardized_df

In [33]:
training_data = log_and_standardize(combined_df)

In [30]:
difference_df["original_population"] = original_population.reset_index()[0]
difference_df["original_low_income_population"] = original_low_income_population.reset_index()[0]

In [34]:
difference_df = (difference_df - difference_df.mean()) / difference_df.std()

## Removing Collinearity

In [96]:
difference_df_test = difference_df.copy()

In [97]:
# difference_df_test = difference_df_test.drop(columns = ["Change Total Occupied Private Dwellings", "Change Dwellings"], axis = 1)

In [50]:
vif_df = pd.DataFrame()
vif_df["feature"] = difference_df_test.columns
vif_df["VIF"] = [variance_inflation_factor(difference_df_test.values, i) for i in range(difference_df_test.shape[1])]

In [83]:
difference_df.columns

Index(['Change Population Density per square kilometre', 'Change Dwellings',
       'Change Total Occupied Private Dwellings',
       'Change Single-detached house', 'Change Semi-detached house',
       'Change Row house', 'Change Apartment, duplex',
       'Change Apartment, building that has fewer than five storeys',
       'Change Apartment, building that has five or more storeys',
       'Change Movable dwelling',
       'Change Average number of bedrooms per dwelling', 'Change Owned',
       'Change Rented', 'original_population',
       'original_low_income_population'],
      dtype='object')

In [113]:
tester = difference_df_test[['Change Single-detached house', 'Change Semi-detached house',
       'Change Row house', 'Change Apartment, duplex',
       'Change Apartment, building that has fewer than five storeys',
       'Change Apartment, building that has five or more storeys',
       'Change Movable dwelling',
       'Change Average number of bedrooms per dwelling']]

In [196]:
tester = difference_df_test[['Change Single-detached house', 'Change Semi-detached house',
       'Change Row house', 'Change Apartment, duplex',
       'Change Apartment, building that has fewer than five storeys',
       'Change Apartment, building that has five or more storeys',
       'Change Movable dwelling',
       'Change Average number of bedrooms per dwelling','Change Owned',
       'Change Rented','original_population']]

In [197]:
vif_df = pd.DataFrame()
vif_df["feature"] = tester.columns
vif_df["VIF"] = [variance_inflation_factor(tester.values, i) for i in range(tester.shape[1])]

In [198]:
vif_df

Unnamed: 0,feature,VIF
0,Change Single-detached house,32.684106
1,Change Semi-detached house,2.405558
2,Change Row house,6.26627
3,"Change Apartment, duplex",2.848648
4,"Change Apartment, building that has fewer than...",59.344119
5,"Change Apartment, building that has five or mo...",461.944173
6,Change Movable dwelling,1.418381
7,Change Average number of bedrooms per dwelling,1.886006
8,Change Owned,165.826384
9,Change Rented,169.455581


---

## Ridge & Lasso Model

In [199]:
X = tester

In [200]:
# X = training_data[["Before Population Density per square kilometre", 
#              "Before Dwellings", 
#              "Before Average number of bedrooms per dwelling", 
#              "Before Apartment, building that has five or more storeys",
#              "Before Total Occupied Private Dwellings",
#              "After Population Density per square kilometre", 
#              "After Dwellings", 
#              "After Average number of bedrooms per dwelling", 
#              "After Apartment, building that has five or more storeys",
#              "After Total Occupied Private Dwellings",
#             ]]

In [201]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.2, random_state=53)

In [202]:
kf = KFold(n_splits=5, shuffle=True, random_state=53)  # 5 folds
model = Ridge()

mse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y2.iloc[train_index], y2.iloc[val_index]

    # Train model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    mse_scores.append(mse)
    r2_scores.append(r2)

print("Mean MSE across folds:", sum(mse_scores) / len(mse_scores))
print("Mean R² across folds:", sum(r2_scores) / len(r2_scores))

Mean MSE across folds: 0.0163246498045656
Mean R² across folds: 0.13521796422281726


In [203]:
model_r = Ridge()
model_r.fit(X_train, y_train)

In [204]:
y_pred = model_r.predict(X_test)

In [205]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 0.008781745973464667
R² Score: 0.46464498560378287
Intercept: 0.6774598212212279
Coefficients: [-0.04856245 -0.00404289  0.0599072  -0.00708013  0.03497257  0.0151378
  0.01656978  0.04108378 -0.01837914 -0.04334729  0.01560868]


In [206]:
model_l = Lasso()
model_l.fit(X_train, y_train)

In [207]:
y_pred = model_l.predict(X_test)

In [208]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 0.018815233692681517
R² Score: -0.14701902501509778
Intercept: 0.6874835120843216
Coefficients: [-0. -0.  0. -0.  0. -0.  0.  0.  0. -0.  0.]


## Decision Tree Regressor

In [209]:
from sklearn.tree import DecisionTreeRegressor


In [210]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [211]:
y_pred = model_l.predict(X_test)

In [212]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 0.018815233692681517
R² Score: -0.14701902501509778


## Random Forest

In [213]:
from sklearn.ensemble import RandomForestRegressor


In [214]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [215]:
y_pred = model_rf.predict(X_test)

In [216]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 0.001554724400099714
R² Score: 0.9052204987353838


## XGBoost


In [217]:
from xgboost import XGBRegressor

In [218]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [219]:
y_pred = model_xgb.predict(X_test)

In [220]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 3.3577433381545915e-07
R² Score: 0.9999795304403183


In [222]:
model_xgb.feature_importances_

array([0.00581239, 0.03052722, 0.00544253, 0.01386585, 0.0929352 ,
       0.02080117, 0.07286022, 0.3752305 , 0.19855437, 0.18069768,
       0.00327294], dtype=float32)