## Setup

In [238]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## EDA

In [180]:
df = pd.read_csv('../data/tod-on-main.csv')

In [214]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [216]:
before_df = df.filter(regex=r'^(Before)', axis=1)
# before_df = before_df.drop(["Before 4 bedrooms", "Before 5 or more bedrooms", 'Before No bedrooms', 'Before 1 bedroom', "Before Apartment"], axis=1)

In [217]:
after_df = df.filter(regex=r'^(After)', axis=1)
# after_df = after_df.drop(['After No bedrooms', 'After 1 bedroom', 'After Other attached dwelling'], axis=1)

In [218]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [219]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [220]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [221]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling"], axis=1)

In [222]:
before_df.columns

Index(['Before Population Density per square kilometre', 'Before Dwellings',
       'Before Total Occupied Private Dwellings',
       'Before Single-detached house', 'Before Semi-detached house',
       'Before Row house', 'Before Apartment, duplex',
       'Before Apartment, building that has fewer than five storeys',
       'Before Apartment, building that has five or more storeys',
       'Before Other single-attached house', 'Before Movable dwelling',
       'Before Average number of bedrooms per dwelling', 'Before Owned',
       'Before Rented'],
      dtype='object')

In [223]:
after_df.columns

Index(['After Population Density per square kilometre', 'After Dwellings',
       'After Total Occupied Private Dwellings', 'After Single-detached house',
       'After Semi-detached house', 'After Row house',
       'After Apartment, duplex',
       'After Apartment, building that has fewer than five storeys',
       'After Apartment, building that has five or more storeys',
       'After Other single-attached house', 'After Movable dwelling',
       'After Average number of bedrooms per dwelling', 'After Owned',
       'After Rented'],
      dtype='object')

## Imputing with KNN

In [224]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [225]:
# This is just in case you want to check how reasonable it is
# df_unscaled = pd.DataFrame(scaler.inverse_transform(imputed_df), columns=imputed_df.columns)
# df_unscaled.head()

In [226]:
def kkn_imputation(df):
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(scaled_df), columns=scaled_df.columns)
    return imputed_df

In [227]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [228]:
after_imputed

Unnamed: 0,After Population Density per square kilometre,After Dwellings,After Total Occupied Private Dwellings,After Single-detached house,After Semi-detached house,After Row house,"After Apartment, duplex","After Apartment, building that has fewer than five storeys","After Apartment, building that has five or more storeys",After Other single-attached house,After Movable dwelling,After Average number of bedrooms per dwelling,After Owned,After Rented
0,-0.382479,-0.355497,-0.306823,1.161212,0.372251,0.436658,0.328344,-0.350722,-0.415318,-0.323217,-0.265124,-0.581972,-0.266324,-0.306345
1,-0.531293,-0.599815,-0.565661,1.195441,0.000959,-0.247834,-0.754908,-0.636342,-0.446403,-0.323217,-0.275400,-0.517583,-0.303762,-0.707811
2,-0.405139,-0.538866,-0.508389,-0.141244,0.695194,1.272251,-0.579896,-0.390995,-0.446336,-0.323217,-0.275400,1.207301,-0.554449,-0.445302
3,-0.574444,-0.585538,-0.428659,1.062221,-0.358938,-0.568916,-0.538754,-0.401351,-0.356224,-0.256747,-0.275400,-0.492743,-0.408883,-0.415078
4,-0.452016,-0.598267,-0.571069,0.875388,-0.486542,0.194712,-0.710745,-0.538585,-0.446403,-0.323217,-0.275400,1.606733,-0.331373,-0.688427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,-0.506525,-0.745191,-0.735124,-0.352883,-0.499751,0.439430,-0.081435,-0.686941,-0.446403,-0.323217,-0.275400,0.503617,-0.741007,-0.695974
64,2.837904,3.661334,3.589249,-1.403800,-0.531035,0.200738,-0.751925,-0.269782,4.347049,-0.158673,-0.275400,-0.777779,3.070253,3.669497
65,1.086049,0.847262,0.962905,-1.010788,0.101702,-0.280222,1.705993,3.011817,-0.241633,0.370969,-0.275400,2.102177,0.148001,1.396613
66,0.019046,0.813431,0.684586,-1.416754,-0.531036,-0.727950,-0.766656,-0.452276,1.262026,-0.323217,-0.275400,-0.772719,0.340982,0.798845


In [229]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [230]:
difference_df = after_imputed - before_imputed

## Baseline Model

In [232]:
y = df["n_movers_out_Lowincome"]
X = difference_df

In [234]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53)

In [235]:
model = LinearRegression()
model.fit(X_train, y_train)

In [236]:
y_pred = model.predict(X_test)

In [239]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [240]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 140534946.83296606
R² Score: 0.2825752625941058
Intercept: 12760.917881451107
Coefficients: [  14466.22583806   23853.21007969 -168920.53149597     833.36682455
  -10764.76981968    2096.19706216    4641.78460242    5168.31002196
   25697.013035      2817.68577827   -3813.78681725   -1794.61565274
   66580.26828108   69003.68039518]


In [242]:
140534946.83296606 < 812037140.855561

True