In [464]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import warnings
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [465]:
def introduce_missing(data, missing_fraction=0.2, random_state=53):
    np.random.seed(random_state)
    data_missing = data.copy()
    mask = np.random.rand(*data.shape) < missing_fraction
    data_missing[mask] = np.nan
    return data_missing, mask

# data_missing, mask = introduce_missing(data, missing_fraction=0.2)

In [466]:
def knn_imputation(df):
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(scaled_df), columns=scaled_df.columns)
    return imputed_df

In [467]:
def impute_dwellings(df, before=True):
    word = "Before" if before else "After"
    df_known = df[df[f'{word} Dwellings'].notna()]
    df_missing = df[df[f'{word} Dwellings'].isna()]
    X_train = df_known[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_train = df_known[f"{word} Dwellings"]
    model = LinearRegression()
    model.fit(X_train, y_train)
    X_missing = df_missing[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_pred = model.predict(X_missing)
    df.loc[df[f'{word} Dwellings'].isna(), f'{word} Dwellings'] = y_pred

In [468]:
# Need to test the accuracy of the knn_imputation and the impute_dwellings linear regression imputation

## Getting Data

In [287]:
df = pd.read_csv('../data/tod-on-main.csv')
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [288]:
before_df = df.filter(regex=r'^(Before)', axis=1)
after_df = df.filter(regex=r'^(After)', axis=1)

In [289]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

calculating_average_bedrooms(before_df, True)
calculating_average_bedrooms(after_df, False)
warnings.filterwarnings('ignore')

In [290]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)

cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [291]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house"], axis=1)

# Remove those with very few values and those that will cause collinearity (e.g. adding all 'others' will sum to total dwellings)

## Testing Dwellings imputer

In [23]:
before_df

Unnamed: 0,Before Population Density per square kilometre,Before Dwellings,Before Total Occupied Private Dwellings,Before Single-detached house,Before Semi-detached house,Before Row house,"Before Apartment, duplex","Before Apartment, building that has fewer than five storeys","Before Apartment, building that has five or more storeys",Before Movable dwelling,Before Average number of bedrooms per dwelling,Before Owned,Before Rented
0,29557.199802,2165.500341,2098.182095,1036.355861,171.559124,232.989487,157.291638,408.144983,89.456940,0.081414,27.854093,1246.574931,830.112341
1,10145.742414,1327.753873,1249.277296,1006.500775,107.212997,90.911065,0.000000,42.502658,0.000000,0.000000,14.014679,1193.947023,55.330273
2,2126.628020,,1542.772777,820.764067,78.968492,394.414058,17.428140,218.439627,0.000000,9.587018,2.778799,1025.148086,515.821211
3,3248.000589,,1701.261038,950.684979,,,,,244.245214,0.000000,,993.569877,707.963628
4,1828.403352,,1362.387422,960.920963,44.427922,148.984013,0.008759,117.349941,0.000000,91.828328,2.427691,1129.062316,233.895241
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,1228.237325,,989.580227,477.733309,23.307584,123.090774,59.716868,204.166021,93.101270,4.177069,1.753427,615.661707,374.353186
66,354654.913559,10424.569320,8067.041821,12.516975,0.025007,76.844294,7.377234,783.636264,7180.672595,5.000000,17.185891,2909.503632,4752.920869
67,12609.345353,,5040.953408,418.855670,71.411712,35.424626,549.461631,3726.111891,226.284939,0.000000,2.019597,1475.624331,3568.855256
68,53410.248135,3911.465391,2301.405997,5.000000,0.000000,13.372768,29.009326,634.370624,1621.213439,0.000000,3.515521,387.682813,1570.343760


In [65]:
test_before_dwellings_df = before_df[before_df["Before Dwellings"].notna()]
test_after_dwellings_df = after_df[after_df["After Dwellings"].notna()]

In [66]:
X_before = test_before_dwellings_df[["Before Total Occupied Private Dwellings", "Before Owned", "Before Rented"]]
y_before = test_before_dwellings_df["Before Dwellings"]

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X_before, y_before, test_size=0.2, random_state=53)

In [85]:
model = LinearRegression()
model.fit(X_train, y_train)

In [86]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [87]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 16352.307228648424
R² Score: 0.9910236703399335
Intercept: 123.3901699067037
Coefficients: [ 5.09337518 -4.19276064 -3.99311293]


In [111]:
X_after = test_after_dwellings_df[["After Total Occupied Private Dwellings", "After Owned", "After Rented"]]
y_after = test_after_dwellings_df["After Dwellings"]

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X_after, y_after, test_size=0.2, random_state=53)

In [113]:
model = LinearRegression()
model.fit(X_train, y_train)

In [114]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [115]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 42759.62828218502
R² Score: 0.9973566298163038
Intercept: -15.259777307582226
Coefficients: [ 2.42860222 -1.41410124 -1.24637436]


In [125]:
together_before_df.columns = together_before_df.columns.str.replace(r'^\w+\s+', '', regex=True)
together_after_df.columns = together_after_df.columns.str.replace(r'^\w+\s+', '', regex=True)

In [126]:
together_df = pd.concat([together_before_df, together_after_df], ignore_index=True)

In [127]:
X_together = together_df[["Total Occupied Private Dwellings", "Owned", "Rented"]]
y_together = together_df["Dwellings"]

In [164]:
X_train, X_test, y_train, y_test = train_test_split(X_together, y_together, test_size=0.25, random_state=53)

In [165]:
model = LinearRegression()
model.fit(X_train, y_train)

In [166]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [167]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Mean Squared Error (MSE): 67894.09053589568
R² Score: 0.9974066382878733
Intercept: -0.9989322741575961
Coefficients: [ 4.08989095 -3.09971367 -2.90525437]


In [268]:
model_r = Ridge(alpha=100000)
model_r.fit(X_train, y_train)

In [269]:
y_pred = model_r.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [270]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 59793.30458311996
R² Score: 0.9977160653375952
Intercept: 19.652240219144005
Coefficients: [ 2.45254268 -1.48258833 -1.2463702 ]


In [244]:
model_l = Lasso(alpha=10000)
model_l.fit(X_train, y_train)

In [245]:
y_pred = model_l.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [246]:
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 91185.0683498459
R² Score: 0.9965169889881489
Intercept: 15.60548870273351
Coefficients: [ 1.20600819 -0.22183012  0.        ]


In [275]:
from sklearn.linear_model import RidgeCV

# RidgeCV with predefined alpha values and 5-fold CV
ridge_cv = RidgeCV(alphas=[10000000, 100000000], cv=5, scoring='neg_mean_squared_error')

# Fit
ridge_cv.fit(X_together, y_together)

# Best alpha and model performance
print("Best alpha:", ridge_cv.alpha_)
print("Best score (neg MSE):", ridge_cv.best_score_)

Best alpha: 10000000
Best score (neg MSE): -114919.16096975395


### Testing the imputer

In [31]:
before_missing, before_mask = introduce_missing(before_df, missing_fraction=0.2)
after_missing, after_mask = introduce_missing(after_df, missing_fraction=0.2)

In [21]:
imputer = KNNImputer(n_neighbors=3)
before_imputed = pd.DataFrame(imputer.fit_transform(before_missing), columns=before_missing.columns)
before_imputed = pd.DataFrame(imputer.fit_transform(after_missing), columns=after_missing.columns)

In [24]:
original_before = before_missing.values[before_mask]
imputed_before = before_imputed.values[before_mask]

rmse = np.sqrt(mean_squared_error(original_before, imputed_before))
print(f'RMSE: {rmse:.4f}')

ValueError: Input contains NaN.

## Before and After df

In [292]:
impute_dwellings(before_df, before=True)
impute_dwellings(after_df, before=False)

In [294]:
before_df_clean = before_df.dropna()
after_df_clean = after_df.dropna()

In [297]:
before_df_missing = before_df_clean.copy()
after_df_missing = after_df_clean.copy()

In [343]:
before_mask = np.random.rand(*before_df_missing.shape) < 0.25  # 20% missing
after_mask = np.random.rand(*after_df_missing.shape) < 0.25  # 20% missing

In [302]:
before_df_missing = before_df_missing.mask(before_mask)
after_df_missing = after_df_missing.mask(after_mask)

In [303]:
imputer = KNNImputer(n_neighbors=3)
before_df_imputed = pd.DataFrame(imputer.fit_transform(before_df_missing), columns=before_df_missing.columns)

In [304]:
imputer = KNNImputer(n_neighbors=3)
after_df_imputed = pd.DataFrame(imputer.fit_transform(after_df_missing), columns=after_df_missing.columns)

In [315]:
mask_missing = before_mask  # Boolean array where True means "was missing"
mse = mean_squared_error(
    before_df_clean[mask_missing],  # true values
    before_df_imputed[mask_missing]  # imputed values
)
r2 = r2_score(before_df_clean[mask_missing], before_df_imputed[mask_missing])


print("MSE on imputed values:", mse)
print("R^2 on imputed values:", r2)

MSE on imputed values: 144513086.3250866
R^2 on imputed values: 0.8292024028638418


In [316]:
after_df_missing

Unnamed: 0,After Population Density per square kilometre,After Dwellings,After Total Occupied Private Dwellings,After Single-detached house,After Semi-detached house,After Row house,"After Apartment, duplex","After Apartment, building that has fewer than five storeys","After Apartment, building that has five or more storeys",After Movable dwelling,After Average number of bedrooms per dwelling,After Owned,After Rented
0,30652.796998,2243.279704,2159.859594,993.815921,161.958617,234.228216,196.057126,465.721264,88.992001,0.075891,2.958369,1250.364815,926.236364
1,9366.250730,1358.802156,,1007.011309,95.386329,133.722028,,102.108253,0.000000,,,1203.072199,109.974286
2,27411.512344,,,491.713881,219.862074,356.921142,33.438833,414.451887,0.192841,0.000000,19.964445,,643.709179
3,3193.942538,1903.703523,1773.463965,955.654363,30.856989,86.576500,40.805173,401.268184,258.171781,0.000000,3.806435,,705.160154
4,20706.143603,1364.403976,,,,,,226.559643,0.000000,0.000000,23.760836,1168.193247,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,12909.067304,832.508999,801.531033,410.126257,5.609396,234.635346,122.687089,37.692522,0.000000,0.000000,13.276306,650.728001,134.041126
66,491300.057118,16785.007937,,4.993694,0.000225,199.587318,2.637503,568.764027,,0.000000,1.097320,5465.257466,9009.933329
67,240712.729871,6597.507166,6186.713127,156.501338,,,442.721688,,586.231397,0.000000,28.469771,1773.756287,4388.696640
68,88087.300720,6475.031002,5304.043905,,0.000000,63.224946,,336.436187,4891.004562,,1.145421,,3173.313805


In [317]:
mask_missing = after_mask  # Boolean array where True means "was missing"
mse = mean_squared_error(
    after_df_clean[mask_missing],  # true values
    after_df_imputed[mask_missing]  # imputed values
)

r2 = r2_score(after_df_clean[mask_missing], after_df_imputed[mask_missing])


print("MSE on imputed values:", mse)
print("R^2 on imputed values:", r2)

MSE on imputed values: 45844659.052922286
R^2 on imputed values: 0.6545545319897432


In [439]:
k_values = range(3,11)
total_scores = {}
for k in k_values:
    mse_scores = []
    r2_scores = []
    for i in range(5):
        before_df_missing = before_df_clean.copy()
        before_mask = np.random.rand(*before_df_missing.shape) < 0.2
        before_df_missing = before_df_missing.mask(before_mask)
        imputer = KNNImputer(n_neighbors=k)
        before_df_imputed = pd.DataFrame(imputer.fit_transform(before_df_missing), columns=before_df_missing.columns)
        mask_missing = before_mask  # Boolean array where True means "was missing"
        mse = mean_squared_error(
            before_df_clean[mask_missing],  # true values
            before_df_imputed[mask_missing]  # imputed values
        )
        r2 = r2_score(before_df_clean[mask_missing], before_df_imputed[mask_missing])
        mse_scores.append(mse)
        r2_scores.append(r2)
    total_scores[k] = np.mean(mse_scores)

In [440]:
min(total_scores, key = total_scores.get)

7

In [457]:
k_values = range(3,11)
total_scores = {}
for k in k_values:
    mse_scores = []
    r2_scores = []
    for i in range(5):
        after_df_missing = after_df_clean.copy()
        after_mask = np.random.rand(*after_df_missing.shape) < 0.2
        after_df_missing = after_df_missing.mask(after_mask)
        imputer = KNNImputer(n_neighbors=k)
        after_df_imputed = pd.DataFrame(imputer.fit_transform(after_df_missing), columns=after_df_missing.columns)
        mask_missing = after_mask  # Boolean array where True means "was missing"
        mse = mean_squared_error(
            after_df_clean[mask_missing],  # true values
            after_df_imputed[mask_missing]  # imputed values
        )
        r2 = r2_score(after_df_clean[mask_missing], after_df_imputed[mask_missing])
        mse_scores.append(mse)
        r2_scores.append(r2)
    total_scores[k] = np.mean(mse_scores)

In [458]:
min(total_scores, key = total_scores.get)

4

In [462]:
df[df.isna().any(axis=1)]

In [463]:
after_df

Unnamed: 0,After Population Density per square kilometre,After Dwellings,After Total Occupied Private Dwellings,After Single-detached house,After Semi-detached house,After Row house,"After Apartment, duplex","After Apartment, building that has fewer than five storeys","After Apartment, building that has five or more storeys",After Movable dwelling,After Average number of bedrooms per dwelling,After Owned,After Rented
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,,,,,,,,,,,,,
66,,,,,,,,,,,,,
67,,,,,,,,,,,,,
68,,,,,,,,,,,,,
