In [None]:
def calculating_average_bedrooms(df, before = True):
    """
    Fill missing average-bedroom values in place from the bedroom-count columns.

    Only rows whose '<Before|After> Average number of bedrooms per dwelling'
    is null are touched. For those rows the average is approximated as a
    weighted mean of the four bedroom-count columns, using one representative
    bedroom count per bucket (0.5, 2, 3 and 4.5).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the '<word> Average number of bedrooms per dwelling',
        '<word> 0 to 1 bedroom', '<word> 2 bedrooms', '<word> 3 bedrooms' and
        '<word> 4 or more bedrooms' columns. Modified in place.
    before : bool, default True
        Use the 'Before ...' columns when True, the 'After ...' columns otherwise.

    Returns
    -------
    None

    Notes
    -----
    A row whose four counts sum to zero yields 0 / 0 and therefore stays NaN.
    """
    word = "Before" if before else "After"
    target = f"{word} Average number of bedrooms per dwelling"
    bucket_columns = [
        f"{word} 0 to 1 bedroom",
        f"{word} 2 bedrooms",
        f"{word} 3 bedrooms",
        f"{word} 4 or more bedrooms",
    ]

    # Positional boolean mask: unlike label-based write-back, this stays
    # correct even when the index contains duplicate labels.
    missing = df[target].isnull().to_numpy()
    if not missing.any():
        return

    counts = df.loc[missing, bucket_columns]
    weighted_sum = (
        0.5 * counts[f"{word} 0 to 1 bedroom"]
        + 2 * counts[f"{word} 2 bedrooms"]
        + 3 * counts[f"{word} 3 bedrooms"]
        + 4.5 * counts[f"{word} 4 or more bedrooms"]
    )
    total = counts.sum(axis=1)

    # Write straight into df instead of via an intermediate filtered copy,
    # which avoids pandas' SettingWithCopyWarning.
    df.loc[missing, target] = (weighted_sum / total).to_numpy()

In [None]:
def impute_dwellings(df, before=True):
    """
    Fill missing '<Before|After> Dwellings' values in place by linear regression.

    A LinearRegression model is fitted on the rows where 'Dwellings' is known,
    using 'Total Occupied Private Dwellings', 'Owned' and 'Rented' as
    predictors, and its predictions are written into the rows where
    'Dwellings' is missing. (The original author notes that this imputation
    is highly accurate on their data.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the '<word> Dwellings' column and the three predictor
        columns. Modified in place.
    before : bool, default True
        Use the 'Before ...' columns when True, the 'After ...' columns otherwise.

    Returns
    -------
    None

    Notes
    -----
    If no 'Dwellings' value is missing, the function returns without fitting
    a model.
    NOTE(review): the predictor columns are presumably free of NaN; this
    function does not impute or drop missing predictor values -- confirm
    against the caller.
    """
    word = "Before" if before else "After"
    target = f"{word} Dwellings"
    predictors = [
        f"{word} Total Occupied Private Dwellings",
        f"{word} Owned",
        f"{word} Rented",
    ]

    missing = df[target].isna()
    if not missing.any():
        # Nothing to impute; predicting on an empty feature matrix would
        # raise inside scikit-learn's input validation.
        return

    known = ~missing
    model = LinearRegression()
    model.fit(df.loc[known, predictors], df.loc[known, target])
    df.loc[missing, target] = model.predict(df.loc[missing, predictors])

In [None]:
def kkn_imputation(df):
    """
    Return a copy of ``df`` with missing values filled by KNN imputation.

    Each missing value is imputed with scikit-learn's ``KNNImputer`` using
    the 3 nearest neighbours. The input dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute. NOTE(review): presumably all columns are numeric,
        since the whole frame is handed to ``KNNImputer`` -- confirm against
        the caller.

    Returns
    -------
    pandas.DataFrame
        Imputed data with the same column labels and the same index as ``df``.
    """
    imputer = KNNImputer(n_neighbors=3)
    imputed_values = imputer.fit_transform(df)
    # Reattach the caller's index as well as the columns; otherwise the
    # result gets a fresh RangeIndex and no longer aligns with ``df``.
    imputed_df = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)
    return imputed_df

In [None]:
def log_and_standardize(df):
    """
    Log-transform every column, then z-score it.

    Step 1 applies ``log(x + 1)`` element-wise, which compresses the long
    right tail of skewed columns.
    Step 2 subtracts each column's mean and divides by its standard
    deviation (both taken from the logged data), putting all features on a
    comparable scale.

    Input the dataframe; a new dataframe is returned and the input is left
    unchanged.
    """
    shifted = df + 1
    logged = np.log(shifted)

    column_means = logged.mean()
    column_stds = logged.std()

    centered = logged - column_means
    return centered / column_stds