In [51]:
import pandas as pd
import numpy as np

In [52]:
# Read the housing CSV from its GitHub-hosted URL (needs network access) and
# show how many rows were loaded.
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')
len(df)

20640

In [53]:
# Preview the first rows to see the available columns and typical values.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [54]:
# Enumerate the distinct ocean_proximity categories; the next cell filters on them.
df.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [55]:
# Restrict the data to the '<1H OCEAN' and 'INLAND' categories. `df` is rebound,
# so every later cell works on this subset rather than on the full file.
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

In [56]:
# Row count remaining after the ocean_proximity filter.
len(df)

15687

In [57]:
# Show the column names as a plain Python list.
list(df.columns.values)

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [83]:
# Columns kept for modelling: everything listed above except 'ocean_proximity'.
# Note that the list still contains the target 'median_house_value'; it is
# separated from the predictors only after the train/val/test split.
features = ['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [59]:
# Keep only the selected columns (drops 'ocean_proximity') and confirm what is left.
df = df[features]
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [60]:
# NOTE(review): isnull().count() counts the entries of the boolean mask, so it
# reports the number of rows per column, not the number of missing values.
# Use isnull().sum() (as done two cells below) for the missing-value counts.
df.isnull().count()

longitude             15687
latitude              15687
housing_median_age    15687
total_rooms           15687
total_bedrooms        15687
population            15687
households            15687
median_income         15687
median_house_value    15687
dtype: int64

In [61]:
# Flag, per column, whether it contains at least one missing value.
df.isnull().any(axis=0)

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
dtype: bool

In [64]:
# Number of missing values per column.
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [62]:
# Summary statistics (count, mean, quartiles, extremes) of the population column.
df.population.describe()

count    15687.000000
mean      1466.317205
std       1180.389908
min          3.000000
25%        802.000000
50%       1195.000000
75%       1777.000000
max      35682.000000
Name: population, dtype: float64

In [63]:
# Split sizes: 20% validation, 20% test, and whatever remains for training, so
# the three parts always add up to the full row count.
n = len(df)

n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

n_val, n_test, n_train

(3137, 3137, 9413)

In [66]:
# Positional row indices 0..n-1; they are shuffled in the next cell to randomise the split.
idx = np.arange(n)
idx

array([    0,     1,     2, ..., 15684, 15685, 15686])

In [67]:
# Seed NumPy's global RNG so the in-place shuffle of idx gives the same order on every run.
np.random.seed(2)
np.random.shuffle(idx)
idx

array([ 7820,  6855, 12244, ...,  6637,  2575,  7336])

In [68]:
# Carve the shuffled positions into consecutive slices: the first n_val rows go
# to validation, the next n_test to test and the remainder to training.
df_val = df.iloc[idx[:n_val]]
df_test = df.iloc[idx[n_val:n_val+n_test]]
df_train = df.iloc[idx[n_val+n_test:]]

In [69]:
# Give each subset a fresh 0..len-1 index; the shuffled original labels are discarded.
df_train, df_val, df_test = (
    subset.reset_index(drop=True) for subset in (df_train, df_val, df_test)
)

In [70]:
# Move the target out of each feature frame and log-transform it with log1p.
# DataFrame.pop returns the column and removes it from the frame in one step,
# so the predictors can no longer leak the target.
y_train = np.log1p(df_train.pop('median_house_value').values)
y_test = np.log1p(df_test.pop('median_house_value').values)
y_val = np.log1p(df_val.pop('median_house_value').values)

In [73]:
# Inspect total_bedrooms in the training part; it is the only column with missing values.
df_train.total_bedrooms.describe()

count    9321.000000
mean      542.698745
std       432.749544
min         2.000000
25%       295.000000
50%       432.000000
75%       649.000000
max      6445.000000
Name: total_bedrooms, dtype: float64

In [96]:
def train_linear_regression(X, y):
    """Fit an unregularised linear model with the normal equation.

    A column of ones is prepended to ``X`` so that the first solved weight
    acts as the intercept.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: 1-D target vector with one entry per row of ``X``.

    Returns:
        Tuple ``(w0, w)``: the intercept and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix cannot be inverted.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    gram = design.T.dot(design)
    # Same evaluation order as (XTX^-1 . X^T) . y to keep results bit-identical.
    coef = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = coef[0], coef[1:]
    return bias, weights

In [92]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y: array of true values.
        y_pred: array of predicted values, element-wise compatible with ``y``.

    Returns:
        Square root of the mean of the squared differences.
    """
    return np.sqrt(((y - y_pred) ** 2).mean())


In [94]:
def prepare_X(df, bedroom_fill_value):
    """Turn a feature frame into a NumPy matrix with total_bedrooms imputed.

    Args:
        df: DataFrame holding a 'total_bedrooms' column; it is not modified.
        bedroom_fill_value: value substituted for missing 'total_bedrooms'.

    Returns:
        ``ndarray`` of the frame's values, columns in their original order.
    """
    # assign() works on a new frame, so the caller's DataFrame stays untouched.
    filled = df.assign(
        total_bedrooms=df['total_bedrooms'].fillna(bedroom_fill_value)
    )
    return filled.values

In [105]:
# Baseline 1: impute missing total_bedrooms with the constant 2, fit on the
# training part and report RMSE on the validation part.
# NOTE(review): 2 matches the training-set minimum printed by describe() above;
# if mean imputation was intended, pass df_train.total_bedrooms.mean() — confirm.
X_train = prepare_X(df_train, 2)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val, 2)
y_pred = w0 + X_val.dot(w)
print(rmse(y_val, y_pred))

0.3531123835313934


In [104]:
# Baseline 2: same pipeline, but missing total_bedrooms are filled with 0.
X_train = prepare_X(df_train, 0)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val, 0)
y_pred = w0 + X_val.dot(w)
print(rmse(y_val, y_pred))

0.35311378984276603


In [103]:
# Compare the two validation RMSEs (fill with 2 vs fill with 0) as floats.
# Dropping the "0." and comparing the digit strings as integers is unreliable:
# the printed values have 16 and 17 digits, so the integers differ in magnitude
# and the comparison no longer reflects which RMSE is larger.
0.3531123835313934 > 0.35311378984276603

False

In [106]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear model with the normal equation plus a diagonal penalty.

    A column of ones is prepended to ``X`` for the intercept, then ``r`` is
    added to every diagonal entry of the Gram matrix before inversion. The
    intercept's diagonal entry is included, so the bias is regularised too.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: 1-D target vector with one entry per row of ``X``.
        r: amount added to the Gram matrix diagonal (default 0.001).

    Returns:
        Tuple ``(w0, w)``: the intercept and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: if the regularised Gram matrix is singular.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    # Same evaluation order as (XTX^-1 . X^T) . y to keep results bit-identical.
    coef = np.linalg.inv(gram).dot(design.T).dot(y)

    return coef[0], coef[1:]

In [113]:
# Grid-search the regularisation strength r on the validation set.
# maxm / maxr hold the LOWEST RMSE seen so far and the r that produced it
# (the "max" prefix is historical; the names are kept so other cells that may
# read them keep working).
maxm = np.inf  # any finite score beats this, unlike an arbitrary 1e6 sentinel
maxr = -1

# The feature matrices do not depend on r, so build them once outside the loop.
X_train = prepare_X(df_train, 0)
X_val = prepare_X(df_val, 0)

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    # Strict '<' keeps the earliest (smallest) r when several values tie.
    if score < maxm:
        maxm = score
        maxr = r

    print(r, w0, score)

print(maxr, maxm)

0 -10.113169459379442 0.35311378984276603
1e-06 -10.113148354734687 0.3531137916421357
0.0001 -10.111059436482156 0.35311397009191664
0.001 -10.092108771953177 0.3531156181308498
0.01 -9.906436134979625 0.353134541014222
0.1 -8.366997789772444 0.3534853191825487
1 -3.274841002968598 0.3570918500960215
5 -0.8819617336156604 0.3600599189291187
10 -0.4597181546753508 0.3606660546937185
0 0.35311378984276603


In [118]:
def split_with_seed(df, seed, n_val, n_test):
    """Shuffle ``df`` with ``seed`` and split it into train/val/test parts.

    The first ``n_val`` shuffled rows form the validation part, the next
    ``n_test`` rows the test part and the remainder the training part. Each
    part gets a fresh index; its 'median_house_value' column is removed and
    returned separately as ``log1p`` of the values.

    Args:
        df: full DataFrame, including the 'median_house_value' column.
        seed: value passed to ``np.random.seed`` before shuffling.
        n_val: number of validation rows.
        n_test: number of test rows.

    Returns:
        ``(df_train, df_val, df_test, y_train, y_val, y_test)``
    """
    idx = np.arange(len(df))
    np.random.seed(seed)
    np.random.shuffle(idx)

    row_slices = {
        'val': slice(None, n_val),
        'test': slice(n_val, n_val + n_test),
        'train': slice(n_val + n_test, None),
    }
    frames, targets = {}, {}
    for name, rows in row_slices.items():
        part = df.iloc[idx[rows]].reset_index(drop=True)
        # pop() removes the target from the features and hands it back.
        targets[name] = np.log1p(part.pop('median_house_value').values)
        frames[name] = part

    return (frames['train'], frames['val'], frames['test'],
            targets['train'], targets['val'], targets['test'])


# Measure how much the validation RMSE depends on the random split: repeat the
# split/fit/score cycle for seeds 0..9 (fill value 0, no regularisation) and
# report the standard deviation of the scores.
rmses = []

for seed in range(10):
    # Rebinding the notebook-level names keeps them pointing at the last
    # (seed 9) split after the loop, as the inline version did.
    df_train, df_val, df_test, y_train, y_val, y_test = split_with_seed(
        df, seed, n_val, n_test
    )

    X_train = prepare_X(df_train, 0)
    X_val = prepare_X(df_val, 0)

    w0, w = train_linear_regression(X_train, y_train)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    print(seed, score)
    rmses.append(score)

round(np.std(rmses), 3)

0 0.32786459763562403
1 0.338914831534438
2 0.35311378984276603
3 0.3411878083623196
4 0.34522093757660727
5 0.337000938195753
6 0.33232659325863806
7 0.3309279267120406
8 0.3286897318689901
9 0.3462490350938704


0.008

In [121]:
# Final check with seed 9: rebuild the split, fit the regularised model
# (r=0.001, missing total_bedrooms filled with 0) and score it on the test part.
idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

df_val = df.iloc[idx[:n_val]]
df_test = df.iloc[idx[n_val:n_val+n_test]]
df_train = df.iloc[idx[n_val+n_test:]]

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Targets are taken in log1p space before the column is deleted from the features.
y_train = np.log1p(df_train.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)

del df_train['median_house_value']
del df_test['median_house_value']
del df_val['median_house_value']

X_train = prepare_X(df_train, 0)
# NOTE(review): X_val is built but never used below, and the model is fitted on
# the training part only. If the intent was to train on train+validation before
# the test evaluation, concatenate the two parts first — confirm.
X_val = prepare_X(df_val, 0)

w0, w = train_linear_regression_reg(X_train, y_train, r=0.001)



# The reported RMSE is in log1p(price) units, like y_test.
X_test = prepare_X(df_test, 0)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
score



0.3379333870692018