## Import Library

In [235]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

## Import data

In [236]:
# Read the housing dataset from a CSV file located in the working directory.
df = pd.read_csv('housing.csv')
# Only the last expression of a cell is rendered automatically, so the
# (rows, columns) tuple is shown through an explicit display() call.
display(df.shape)
# Preview the first five rows.
df.head()

(20640, 10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [237]:
# Remove the `ocean_proximity` column from the frame.
# `drop(..., errors='ignore')` is used instead of `del` so that the cell is
# idempotent: running it a second time (when the column is already gone)
# leaves `df` unchanged instead of raising KeyError.
df = df.drop(columns=['ocean_proximity'], errors='ignore')

# Question 1

In [238]:
# Number of missing values in each column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

# Question 2

In [239]:
# Median of the `population` column.
df['population'].median()

1166.0

## Split the data

In [240]:
# Column labels currently present in the frame.
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [241]:
# df1 = df[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
#        'total_bedrooms', 'population', 'households', 'median_income',
#        ]]

In [242]:
# Seed the global NumPy RNG so the shuffle below is reproducible.
np.random.seed(42)

n = len(df)

# 60%/20%/20% train/val/test distribution: validation and test each take
# 20% of the rows (rounded down) and the training set gets the remainder.
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

# Random permutation of the row positions (seed 42, set above).
idx = np.arange(n)
np.random.shuffle(idx)

# Reorder the rows of the dataset according to that permutation.
df_shuffled = df.iloc[idx]
df_shuffled

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0
...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,229200.0
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,97800.0
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,222100.0
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0


In [243]:
# Cut the shuffled frame into three consecutive partitions. Each one is
# copied so that removing a column below does not touch `df_shuffled`.
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

# `pop` hands back the target column and removes it from the frame in one
# step, so 'median_house_value' is guaranteed not to remain among the features.
y_train_orig = df_train.pop('median_house_value').values
y_val_orig = df_val.pop('median_house_value').values
y_test_orig = df_test.pop('median_house_value').values

# Log-transform the target with np.log1p(); the untransformed values stay
# available in the *_orig arrays.
y_train = np.log1p(y_train_orig)
y_val = np.log1p(y_val_orig)
y_test = np.log1p(y_test_orig)

# Question 3

In [244]:
# Number of missing values in each feature column of the training partition.
df_train.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
dtype: int64

## Training a linear regression model

In [245]:
def train_linear_regression(X, y):
    """Fit an unregularized linear regression through the normal equations.

    A column of ones is prepended to ``X`` so that the first solved
    coefficient plays the role of the bias term.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; ``X.shape[0]`` is taken as the number of samples.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the bias (first coefficient) and ``w``
        holds the remaining coefficients, one per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` (with the ones column included) is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # Solve (X^T X) w = X^T y directly instead of forming the explicit
    # inverse and multiplying: one factorization, less rounding error.
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]

## RMSE

In [246]:
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Both arguments are combined element-wise, so they must be arrays (or
    array-likes supporting arithmetic) of compatible shape.
    """
    residual = y - y_pred
    mean_squared = (residual * residual).mean()
    return np.sqrt(mean_squared)

In [247]:
# Column labels of the training partition (the candidate feature names).
df_train.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')

## Fill total_bedrooms with 0

In [248]:
# Feature columns fed to the model.
base = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
]

# Training feature matrix; missing entries are replaced with 0.
X_train = df_train[base].fillna(0).values
X_train

array([[-1.1901e+02,  3.6060e+01,  2.5000e+01, ...,  1.3920e+03,
         3.5900e+02,  1.6812e+00],
       [-1.1946e+02,  3.5140e+01,  3.0000e+01, ...,  1.5650e+03,
         5.8400e+02,  2.5313e+00],
       [-1.2244e+02,  3.7800e+01,  5.2000e+01, ...,  1.3100e+03,
         9.6300e+02,  3.4801e+00],
       ...,
       [-1.2091e+02,  3.8980e+01,  1.3000e+01, ...,  3.2640e+03,
         1.1980e+03,  3.6530e+00],
       [-1.1772e+02,  3.4090e+01,  3.6000e+01, ...,  7.8500e+02,
         2.9900e+02,  3.2566e+00],
       [-1.2247e+02,  3.7760e+01,  3.4000e+01, ...,  1.1520e+03,
         4.4500e+02,  5.1893e+00]])

In [249]:
# Zero-filled training feature matrix.
X_train = df_train[base].fillna(0).values

# Fit the unregularized model: w0 is the bias, w the per-feature weights.
w0, w = train_linear_regression(X_train, y_train)

# Predictions on the training data and their RMSE against the
# log1p-transformed target.
y_pred = X_train.dot(w) + w0
rmse(y_train, y_pred)

0.34131359101566766

## Validating results

In [250]:
# Validation features receive the same zero-fill treatment as the training ones.
X_val = df_val[base].fillna(0).values

# Score the already fitted coefficients (w0, w) on the validation partition.
y_pred = X_val.dot(w) + w0
rmse(y_val, y_pred)

0.3295330365227396

# Question 4

## Train a regularized linear regression

In [251]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a ridge-regularized linear regression through the normal equations.

    A column of ones is prepended to ``X`` so that the first solved
    coefficient plays the role of the bias term. ``r`` is added to every
    diagonal entry of ``X^T X`` - including the entry that belongs to the
    ones column, so the bias is penalized together with the weights.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; ``X.shape[0]`` is taken as the number of samples.
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.001
        Regularization strength added to the diagonal of ``X^T X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the bias (first coefficient) and ``w``
        holds the remaining coefficients, one per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized matrix is singular (possible when ``r`` is 0).
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve (X^T X + r I) w = X^T y directly instead of forming the explicit
    # inverse and multiplying: one factorization, less rounding error.
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]

## Tuning Model

In [252]:
# One [r, validation RMSE] pair is collected per regularization strength.
list_emp = []
for r in (0.0, 1e-5, 1e-4, 1e-3, 0.1, 1, 10):
    # Fit on the training partition with this amount of regularization ...
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    # ... and measure the error on the validation partition.
    y_pred = X_val.dot(w) + w0
    score = rmse(y_val, y_pred)

    list_emp.append([r, score])


In [253]:
# Tabulate the tuning results, lowest validation RMSE first.
pd.DataFrame(list_emp, columns=['r', 'score']).sort_values('score')

Unnamed: 0,r,score
3,0.001,0.329533
2,0.0001,0.329533
1,1e-05,0.329533
0,0.0,0.329533
4,0.1,0.329695
5,1.0,0.333789
6,10.0,0.340606


# Question 5

In [292]:
# Rebinds `base` to a five-column feature list. Compared with the earlier
# definition, longitude, latitude and population are no longer included, and
# every later cell that reads `base` picks up this reduced list.
# NOTE(review): confirm that dropping those three features is intended.
base = [ 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'households', 'median_income']

In [293]:
# The feature list must never contain the target, otherwise it would leak
# into the model inputs.
assert 'median_house_value' not in base

# The 60%/20%/20% split sizes depend only on the dataset length, not on the
# seed, so they are computed once instead of on every iteration.
n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Validation RMSE obtained for each shuffle seed, in seed order.
empt_rmse = []
for seed_no in range(10):
    # Reseed the global RNG so each iteration gets its own reproducible shuffle.
    np.random.seed(seed_no)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx]

    # Only the train and validation partitions are needed here; the test
    # partition is not evaluated in this experiment.
    df_train = df_shuffled.iloc[:n_train]
    df_val = df_shuffled.iloc[n_train:n_train + n_val]

    # Log-transform the target with np.log1p().
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)

    # Selecting `base` both restricts the columns to the features and
    # produces new frames, so the partitions themselves are never modified.
    X_train = df_train[base].fillna(0).values
    X_val = df_val[base].fillna(0).values

    # Fit without regularization and score on the validation partition.
    w0, w = train_linear_regression(X_train, y_train)
    y_pred = w0 + X_val.dot(w)
    empt_rmse.append(rmse(y_val, y_pred))

print(empt_rmse)

[0.3998015740426461, 0.4008031167905152, 0.3969809948940976, 0.40092994907985446, 0.4051278928791012, 0.41030167960714564, 0.4022614960870146, 0.3956451400414095, 0.41031437489276723, 0.39759808885752235]


In [294]:
# Standard deviation of the per-seed validation RMSE values, rounded to 3 decimals.
round(np.std(empt_rmse), 3)

0.005

# Question 6

In [296]:
# Seed the global NumPy RNG so this split is reproducible.
np.random.seed(9)

n = len(df)

# 60%/20%/20% train/val/test distribution: validation and test each take
# 20% of the rows (rounded down) and the training set gets the remainder.
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

# Shuffle the row positions using the seed fixed above (9).
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

# Cut the shuffled frame into three consecutive partitions. Each one is
# copied so that removing a column below does not touch `df_shuffled`.
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

# `pop` hands back the target column and removes it from the frame in one
# step, so 'median_house_value' is guaranteed not to remain among the features.
y_train_orig = df_train.pop('median_house_value').values
y_val_orig = df_val.pop('median_house_value').values
y_test_orig = df_test.pop('median_house_value').values

# Log-transform the target with np.log1p().
y_train = np.log1p(y_train_orig)
y_val = np.log1p(y_val_orig)
y_test = np.log1p(y_test_orig)

# Train the regularized model on the training partition.
# NOTE(review): the validation partition is created above but not used for
# fitting in this cell - confirm whether it should be merged into the
# training data before the final evaluation.
X_train = df_train[base].fillna(0).values
w0, w = train_linear_regression_reg(X_train, y_train, r=0.001)

# Evaluate on the held-out test partition.
X_test = df_test[base].fillna(0).values
y_pred = X_test.dot(w) + w0

print(rmse(y_test, y_pred))


0.4133724053375516
