## Setup

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold

from xgboost import XGBRegressor

## EDA

In [2]:
df = pd.read_csv('../data/tod-on-main.csv')

In [3]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [4]:
df = df.dropna(subset=['n_stayers'])

In [5]:
df = df.dropna(subset=['n_stayers_Lowincome'])

## New Model EDA

In [6]:
y1 = df["n_movers_out"]
y2 = df["n_movers_out_Lowincome"]
y3 = df["n_movers_in"]
y4 = df["n_movers_in_Lowincome"]

In [7]:
before_df = df.filter(regex=r'^(Before)', axis=1)
after_df = df.filter(regex=r'^(After)', axis=1)

In [8]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [9]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [10]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [11]:
# Remove other dwelling and other single-attached (co-linearity) 
# & remove apartment (remove bedrooms, maybe add it back again later)

In [12]:
# Do we maybe even remove Dwellings since there is collinearity?

In [13]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house"], axis=1)

## Adding original Population

In [14]:
# Can't even include this because there are missing values
original_population = df["n_movers_out"] + df["n_stayers"]
original_low_income_population = df["n_movers_out_Lowincome"] + df["n_stayers_Lowincome"]

## Missing Values

In [15]:
def impute_dwellings(df, before=True):
    word = "Before" if before else "After"
    df_known = df[df[f'{word} Dwellings'].notna()]
    df_missing = df[df[f'{word} Dwellings'].isna()]
    X_train = df_known[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_train = df_known[f"{word} Dwellings"]
    model = LinearRegression()
    model.fit(X_train, y_train)
    X_missing = df_missing[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_pred = model.predict(X_missing)
    df.loc[df[f'{word} Dwellings'].isna(), f'{word} Dwellings'] = y_pred

In [16]:
impute_dwellings(before_df)
impute_dwellings(after_df, False)

## Imputing with KNN

In [17]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [18]:
# These aren't great, but if I get rid of it it's only 3 rows that are removed

def kkn_imputation(df):
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(scaled_df), columns=scaled_df.columns)
    return imputed_df

In [19]:
def standardize(df):
    scaler = StandardScaler()
    standardized_array = scaler.fit_transform(df)
    scaled_df = pd.DataFrame(standardized_array, columns=df.columns)
    return scaled_df

In [20]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [21]:
combined_df = pd.concat([before_imputed,after_imputed], axis = 1)

In [22]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [23]:
difference_df = after_imputed - before_imputed

## Preparing Training Data

In [24]:
combined_df["original_population"] = original_population.reset_index()[0]
combined_df["original_low_income_population"] = original_low_income_population.reset_index()[0]

In [25]:
combined_df_scaled = standardize(combined_df)

In [26]:
combined_df_scaled.columns

Index(['Before Population Density per square kilometre', 'Before Dwellings',
       'Before Total Occupied Private Dwellings',
       'Before Single-detached house', 'Before Semi-detached house',
       'Before Row house', 'Before Apartment, duplex',
       'Before Apartment, building that has fewer than five storeys',
       'Before Apartment, building that has five or more storeys',
       'Before Movable dwelling',
       'Before Average number of bedrooms per dwelling', 'Before Owned',
       'Before Rented', 'After Population Density per square kilometre',
       'After Dwellings', 'After Total Occupied Private Dwellings',
       'After Single-detached house', 'After Semi-detached house',
       'After Row house', 'After Apartment, duplex',
       'After Apartment, building that has fewer than five storeys',
       'After Apartment, building that has five or more storeys',
       'After Movable dwelling',
       'After Average number of bedrooms per dwelling', 'After Owned',
     

In [101]:
X = combined_df_scaled

In [61]:
X = combined_df_scaled[["Before Population Density per square kilometre", 
             "Before Dwellings", 
             "Before Average number of bedrooms per dwelling", 
             "Before Apartment, building that has five or more storeys",
             "Before Total Occupied Private Dwellings",
             "After Population Density per square kilometre", 
             "After Dwellings", 
             "After Average number of bedrooms per dwelling", 
             "After Apartment, building that has five or more storeys",
             "After Total Occupied Private Dwellings", 'original_population',
       'original_low_income_population'
            ]]

In [131]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2)

## XGBoost

In [132]:
kf = KFold(n_splits=5, shuffle=True)  # 5 folds
model = XGBRegressor()

mse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y1.iloc[train_index], y1.iloc[val_index]

    # Train model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    mse_scores.append(mse)
    r2_scores.append(r2)

print("Mean MSE across folds:", sum(mse_scores) / len(mse_scores))
print("Mean R² across folds:", sum(r2_scores) / len(r2_scores))

Mean MSE across folds: 867606255.8078516
Mean R² across folds: 0.4738794331057875


In [133]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [134]:
y_pred = model_xgb.predict(X_test)

In [135]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 48781746.008068435
R² Score: 0.9619217486643684


## Secondary EDA

## Simpler Regression