## Setup

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold
from statsmodels.stats.outliers_influence import variance_inflation_factor

## EDA

In [2]:
df = pd.read_csv('../data/tod-on-main.csv')

In [3]:
df = df.dropna(subset=['n_movers_out_Lowincome'])

In [4]:
df = df.dropna(subset=['n_stayers'])

## New Model EDA

In [5]:
original_population = df["n_movers_out"] + df["n_stayers"]
original_low_income_population = df["n_movers_out_Lowincome"] + df["n_stayers_Lowincome"]

In [6]:
y1 = df["n_movers_out"] / original_population
y2 = df["n_movers_out_Lowincome"] / original_low_income_population

In [7]:
before_df = df.filter(regex=r'^(Before)', axis=1)
after_df = df.filter(regex=r'^(After)', axis=1)

In [8]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [9]:
calculating_average_bedrooms(before_df)
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bed

In [10]:
cols_to_drop = before_df.filter(regex='bedrooms?$', axis=1).columns
before_df = before_df.drop(columns = cols_to_drop)
cols_to_drop = after_df.filter(regex='bedrooms?$', axis=1).columns
after_df = after_df.drop(columns = cols_to_drop)

In [11]:
before_df = before_df.drop(["Before Apartment", "Before Other dwelling", "Before Other single-attached house"], axis=1)
after_df = after_df.drop(["After Other dwelling", "After Other attached dwelling", "After Other single-attached house"], axis=1)

## Missing Values

In [12]:
def impute_dwellings(df, before=True):
    word = "Before" if before else "After"
    df_known = df[df[f'{word} Dwellings'].notna()]
    df_missing = df[df[f'{word} Dwellings'].isna()]
    X_train = df_known[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_train = df_known[f"{word} Dwellings"]
    model = LinearRegression()
    model.fit(X_train, y_train)
    X_missing = df_missing[[f"{word} Total Occupied Private Dwellings", f"{word} Owned", f"{word} Rented"]]
    y_pred = model.predict(X_missing)
    df.loc[df[f'{word} Dwellings'].isna(), f'{word} Dwellings'] = y_pred

In [13]:
impute_dwellings(before_df)
impute_dwellings(after_df, False)

## Imputing with KNN

In [14]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [15]:
# These aren't great, but if I get rid of it it's only 3 rows that are removed

def kkn_imputation(df):
    imputer = KNNImputer(n_neighbors=3) 
    imputed_df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    return imputed_df

In [16]:
before_imputed = kkn_imputation(before_df)
after_imputed = kkn_imputation(after_df)

In [17]:
combined_df = pd.concat([before_imputed,after_imputed], axis = 1)

In [18]:
before_imputed_corr = before_imputed.copy()
after_imputed_corr = after_imputed.copy()

In [19]:
before_imputed.columns = before_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)
after_imputed.columns = after_imputed.columns.str.replace(r'^\w+\s+', 'Change ', regex=True)

In [20]:
difference_df = after_imputed - before_imputed

## Removing Collinearity

In [25]:
vif_df = pd.DataFrame()
vif_df["feature"] = combined_df.columns
vif_df["VIF"] = [variance_inflation_factor(combined_df.values, i) for i in range(combined_df.shape[1])]

In [26]:
vif_df

Unnamed: 0,feature,VIF
0,Before Population Density per square kilometre,126.00302
1,Before Dwellings,5274.018197
2,Before Total Occupied Private Dwellings,89777.101297
3,Before Single-detached house,1717.364252
4,Before Semi-detached house,70.05247
5,Before Row house,75.559125
6,"Before Apartment, duplex",102.47619
7,"Before Apartment, building that has fewer than...",4099.429366
8,"Before Apartment, building that has five or mo...",10055.419688
9,Before Movable dwelling,2.575322


## Preparing Training Data

In [83]:
combined_df["original_population"] = original_population.reset_index()[0]
combined_df["original_low_income_population"] = original_low_income_population.reset_index()[0]

In [84]:
def log_and_standardize(df):
    log_df = np.log(df + 1)
    standardized_df = (log_df - log_df.mean()) / log_df.std()
    return standardized_df

In [85]:
training_data = log_and_standardize(combined_df)

## Ridge & Lasso Model

In [87]:
X = training_data

In [88]:
# X = training_data[["Before Population Density per square kilometre", 
#              "Before Dwellings", 
#              "Before Average number of bedrooms per dwelling", 
#              "Before Apartment, building that has five or more storeys",
#              "Before Total Occupied Private Dwellings",
#              "After Population Density per square kilometre", 
#              "After Dwellings", 
#              "After Average number of bedrooms per dwelling", 
#              "After Apartment, building that has five or more storeys",
#              "After Total Occupied Private Dwellings",
#             ]]

In [110]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.2, random_state=53)

In [111]:
kf = KFold(n_splits=5, shuffle=True, random_state=53)  # 5 folds
model = Ridge(alpha = 1000)

mse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y2.iloc[train_index], y2.iloc[val_index]

    # Train model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    mse_scores.append(mse)
    r2_scores.append(r2)

print("Mean MSE across folds:", sum(mse_scores) / len(mse_scores))
print("Mean R² across folds:", sum(r2_scores) / len(r2_scores))

Mean MSE across folds: 0.020069983599491994
Mean R² across folds: -0.05227676695254544


In [151]:
model_r = Ridge()
model_r.fit(X_train, y_train)

In [152]:
y_pred = model_r.predict(X_test)

In [153]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_r.intercept_)
print("Coefficients:", model_r.coef_)

Mean Squared Error (MSE): 0.00454017657448788
R² Score: 0.7232206098034764
Intercept: 0.6820053031455012
Coefficients: [-0.06295931 -0.01209883 -0.0103568  -0.0272717  -0.03006349 -0.02800367
 -0.0420822   0.02480095 -0.04438871 -0.04380733 -0.00785384  0.00503576
  0.06486049 -0.04790609 -0.00800583  0.02037325  0.00651234  0.02384886
  0.00646866  0.08553239 -0.0656482   0.00161619  0.03942709  0.04316411
  0.03250869 -0.02495744  0.07815383 -0.01683431]


In [154]:
model_l = Lasso()
model_l.fit(X_train, y_train)

In [155]:
y_pred = model_l.predict(X_test)

In [156]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Intercept:", model_l.intercept_)
print("Coefficients:", model_l.coef_)

Mean Squared Error (MSE): 0.018815233692681517
R² Score: -0.14701902501509778
Intercept: 0.6874835120843216
Coefficients: [-0. -0. -0.  0.  0. -0.  0. -0. -0. -0. -0. -0. -0. -0. -0. -0.  0. -0.
 -0.  0. -0. -0.  0.  0. -0. -0.  0.  0.]


## Decision Tree Regressor

In [118]:
from sklearn.tree import DecisionTreeRegressor


In [119]:
model_dt = DecisionTreeRegressor(max_depth=5)
model_dt.fit(X_train, y_train)

In [120]:
y_pred = model_l.predict(X_test)

In [121]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 0.018815233692681517
R² Score: -0.14701902501509778


## Random Forest

In [122]:
from sklearn.ensemble import RandomForestRegressor


In [123]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [124]:
y_pred = model_rf.predict(X_test)

In [125]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 0.0015536986212870444
R² Score: 0.9052830325222507


## XGBoost


In [158]:
from xgboost import XGBRegressor

In [159]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [160]:
y_pred = model_xgb.predict(X_test)

In [161]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 4.3891009107648926e-07
R² Score: 0.9999732430522544


In [134]:
pd.concat([y_pred, y_test],axis=1)

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [109]:
model_xgb.feature_importances_

array([1.9423955e-03, 1.2452594e-02, 2.7092163e-05, 2.1726615e-03,
       6.9147539e-03, 1.1225638e-02, 1.4560776e-02, 3.6891140e-02,
       7.3888032e-03, 5.3502113e-04, 1.2594432e-03, 4.9536984e-04,
       3.1648409e-03, 4.5546144e-01, 3.8399871e-02, 0.0000000e+00,
       9.1189034e-03, 4.5567332e-03, 2.3527397e-02, 1.3662074e-03,
       2.6367591e-03, 1.9832593e-04, 2.4302411e-03, 2.1109271e-01,
       1.2698900e-03, 6.4687099e-04, 5.3002941e-03, 1.4496391e-01],
      dtype=float32)

In [53]:
X_doing_something = X.drop(["const"],axis=1)

KeyError: "['const'] not found in axis"

In [54]:
feature_coefs = {}

In [145]:
keys = X_doing_something.columns
values = model_xgb.feature_importances_

result = dict(zip(keys, values))

In [146]:
result

{'Before Population Density per square kilometre': np.float32(0.0019423955),
 'Before Dwellings': np.float32(0.012452594),
 'Before Total Occupied Private Dwellings': np.float32(2.7092163e-05),
 'Before Single-detached house': np.float32(0.0021726615),
 'Before Semi-detached house': np.float32(0.006914754),
 'Before Row house': np.float32(0.011225638),
 'Before Apartment, duplex': np.float32(0.014560776),
 'Before Apartment, building that has fewer than five storeys': np.float32(0.03689114),
 'Before Apartment, building that has five or more storeys': np.float32(0.007388803),
 'Before Movable dwelling': np.float32(0.0005350211),
 'Before Average number of bedrooms per dwelling': np.float32(0.0012594432),
 'Before Owned': np.float32(0.00049536984),
 'Before Rented': np.float32(0.0031648409),
 'After Population Density per square kilometre': np.float32(0.45546144),
 'After Dwellings': np.float32(0.03839987),
 'After Total Occupied Private Dwellings': np.float32(0.0),
 'After Single-detac

In [134]:
result[9].item

AttributeError: 'list' object has no attribute 'item'

In [147]:
sorted_dict = dict(sorted(result.items(), key=lambda item: item[1], reverse=True))

In [148]:
sorted_dict

{'After Population Density per square kilometre': np.float32(0.45546144),
 'After Average number of bedrooms per dwelling': np.float32(0.21109271),
 'original_low_income_population': np.float32(0.1449639),
 'After Dwellings': np.float32(0.03839987),
 'Before Apartment, building that has fewer than five storeys': np.float32(0.03689114),
 'After Row house': np.float32(0.023527397),
 'Before Apartment, duplex': np.float32(0.014560776),
 'Before Dwellings': np.float32(0.012452594),
 'Before Row house': np.float32(0.011225638),
 'After Single-detached house': np.float32(0.009118903),
 'Before Apartment, building that has five or more storeys': np.float32(0.007388803),
 'Before Semi-detached house': np.float32(0.006914754),
 'original_population': np.float32(0.005300294),
 'After Semi-detached house': np.float32(0.004556733),
 'Before Rented': np.float32(0.0031648409),
 'After Apartment, building that has fewer than five storeys': np.float32(0.002636759),
 'After Movable dwelling': np.float3

In [21]:
X_2 = X.copy()

NameError: name 'X' is not defined

In [169]:
X_2.drop(["const", "Before Dwellings", "After Dwellings"], axis=1, inplace=True)

In [170]:
vif_df = pd.DataFrame()
vif_df["feature"] = X_2.columns
vif_df["VIF"] = [variance_inflation_factor(X_2.values, i) for i in range(X_2.shape[1])]