## Setup

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## EDA

In [2]:
df = pd.read_csv('../data/tod-on-main.csv')

In [3]:
after_df = df.filter(regex=r'^(After)', axis=1)

In [8]:
after_df.isnull().sum().sort_values(ascending=False)

After Other dwelling                                          54
After Other attached dwelling                                 47
After No bedrooms                                             41
After 1 bedroom                                               41
After 2 bedrooms                                              28
After 4 or more bedrooms                                      28
After 3 bedrooms                                              28
After 0 to 1 bedroom                                          28
After Other single-attached house                              3
After Dwellings                                                3
After Average number of bedrooms per dwelling                  1
After Semi-detached house                                      1
After Row house                                                1
After Apartment, duplex                                        1
After Apartment, building that has fewer than five storeys     1
After Owned              

In [6]:
def calculating_average_bedrooms(df, before = True):
    word = "Before" if before else "After"
    null_rows = df[df[f"{word} Average number of bedrooms per dwelling"].isnull()]
    weighted_sum = (0.5 * null_rows[f"{word} 0 to 1 bedroom"] + 2 * null_rows[f"{word} 2 bedrooms"] + 3 * null_rows[f"{word} 3 bedrooms"] + 4.5 * null_rows[f"{word} 4 or more bedrooms"])
    total = (null_rows[[f"{word} 0 to 1 bedroom", f"{word} 2 bedrooms", f"{word} 3 bedrooms", f"{word} 4 or more bedrooms"]].sum(axis=1))
    null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
    df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]

In [7]:
calculating_average_bedrooms(after_df, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows[f"{word} Average number of bedrooms per dwelling"] = weighted_sum / total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[null_rows.index, f"{word} Average number of bedrooms per dwelling"] = null_rows[f"{word} Average number of bedrooms per dwelling"]


In [9]:
after_df.drop(["After Other dwelling", "After Other attached dwelling", "After No bedrooms", "After 1 bedroom"], axis=1)

Unnamed: 0,After Population Density per square kilometre,After Dwellings,After Total Occupied Private Dwellings,After Single-detached house,After Semi-detached house,After Row house,"After Apartment, duplex","After Apartment, building that has fewer than five storeys","After Apartment, building that has five or more storeys",After Other single-attached house,After Movable dwelling,After 0 to 1 bedroom,After 2 bedrooms,After 3 bedrooms,After 4 or more bedrooms,After Average number of bedrooms per dwelling,After Owned,After Rented
0,30652.796998,2243.279704,2159.859594,993.815921,161.958617,234.228216,196.057126,465.721264,88.992001,0.000000,0.075891,199.194060,608.987549,692.896699,677.517747,2.958369,1250.364815,926.236364
1,9366.250730,1358.802156,1338.971452,1007.011309,95.386329,133.722028,2.103387,102.108253,0.000000,0.000000,0.000000,39.070054,180.014756,404.464073,681.257482,3.570350,1203.072199,109.974286
2,27411.512344,1579.446613,1520.604342,491.713881,219.862074,356.921142,33.438833,414.451887,0.192841,0.000000,0.000000,,,,,19.964445,886.394312,643.709179
3,3193.942538,,1773.463965,955.654363,30.856989,86.576500,40.805173,401.268184,258.171781,,0.000000,,,,,3.806435,1070.279938,705.160154
4,20706.143603,1364.403976,1321.821135,883.629555,7.977719,198.702442,10.010645,226.559643,0.000000,0.000000,0.000000,,,,,23.760836,1168.193247,149.384926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,12909.067304,832.508999,801.531033,410.126257,5.609396,234.635346,122.687089,37.692522,0.000000,0.000000,0.000000,,,,,13.276306,650.728001,134.041126
66,491300.057118,16785.007937,14515.982175,4.993694,0.000225,199.587318,2.637503,568.764027,13723.011730,1.308826,0.000000,9252.559717,4592.270217,578.636521,82.415850,1.097320,5465.257466,9009.933329
67,240712.729871,6597.507166,6186.713127,156.501338,113.449406,128.966314,442.721688,4746.458381,586.231397,5.521749,0.000000,,,,,28.469771,1773.756287,4388.696640
68,88087.300720,6475.031002,5304.043905,0.000000,0.000000,63.224946,0.000000,336.436187,4891.004562,0.000000,0.000000,3201.160637,1684.058295,264.072614,40.897192,1.145421,2017.537254,3173.313805


In [12]:
df.iloc[:,0:20].columns

Index(['station_fid', 'Y', 'X', 'stop_label_x', 'line_label_x', 'technology_x',
       'PRUID', 'opening_date', 'construction_date', 'announcement_date',
       'City', 'n_movers_in', 'n_movers_in_with_child',
       'n_movers_in_Lowincome', 'movers_in_med_income', 'movers_in_med_age',
       'n_movers_out', 'n_movers_out_with_child', 'n_movers_out_Lowincome',
       'movers_out_med_income'],
      dtype='object')