In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [113]:
data = pd.read_csv("housing.csv")

In [114]:
data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY


In [115]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


### Getting the Data Ready

In [116]:
data.dropna(inplace=True)

In [117]:
from sklearn.model_selection import train_test_split 

X = data.drop(['median_house_value'], axis=1)
y = data['median_house_value']

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

train_data = X_train.join(y_train)

In [119]:
train_data['total_rooms'] = np.log(train_data['total_rooms'] + 1)
train_data['total_bedrooms'] = np.log(train_data['total_bedrooms'] + 1)
train_data['population'] = np.log(train_data['population'] + 1)
train_data['households'] = np.log(train_data['households'] + 1)

In [120]:
train_data = train_data.join(pd.get_dummies(train_data.ocean_proximity)).drop(['ocean_proximity'], axis = 1)

In [121]:
train_data = train_data.drop(['ISLAND'], axis=1)

In [122]:
train_data['pop_per_hhold'] = train_data['population']/train_data['households']
train_data['rooms_per_pop'] = train_data['total_rooms']/train_data['population']
train_data['bedrooms_per_pop'] = train_data['total_bedrooms']/train_data['population']

### Applying Linear Regression

In [123]:
X_train.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
7131,-118.12,34.02,25.0,2655.0,558.0,1466.0,525.0,3.0529,<1H OCEAN
3996,-118.59,34.18,7.0,11853.0,2691.0,4404.0,2447.0,4.2009,<1H OCEAN
13432,-117.43,34.1,34.0,1345.0,265.0,834.0,290.0,3.7011,INLAND
10807,-117.93,33.65,35.0,2133.0,413.0,1473.0,402.0,4.4211,<1H OCEAN
17184,-122.49,37.5,21.0,1209.0,309.0,801.0,259.0,4.5625,NEAR OCEAN


In [124]:
X_train = X_train.drop(['ocean_proximity'], axis=1)

In [125]:
X_matrix = np.array(X_train)

In [126]:
y_train.head(5)

7131     265800.0
3996     271300.0
13432     99500.0
10807    215200.0
17184    500000.0
Name: median_house_value, dtype: float64

In [127]:
y_vec = np.array(y_train)

In [128]:
X_matrix.shape[0]

16346

In [129]:
y_vec.shape[0]

16346

In [130]:
X_transpose = np.transpose(X_matrix)

In [131]:
X_T_X = np.matmul(X_transpose, X_matrix)

In [132]:
X_T_X_inv = np.linalg.inv(X_T_X)

In [133]:
X_T_y = np.matmul(X_transpose, y_vec)

In [134]:
w_vec = np.matmul(X_T_X_inv, X_T_y)

In [135]:
w_vec.shape

(8,)

In [136]:
y_hat_vec = np.matmul(X_matrix, w_vec)

In [137]:
y_hat_vec

array([177403.82117275, 362324.50716556, 211697.61029702, ...,
       188477.53931829, 256924.92095902, 133381.68484731])

In [138]:
y_hat_list = y_hat_vec.tolist()

In [140]:
y_hat_list

[177403.8211727546,
 362324.5071655617,
 211697.61029702492,
 239815.61449151195,
 211926.90120173106,
 264360.78540756577,
 467645.568618143,
 203152.30020716687,
 202049.64908943363,
 351392.0106890285,
 115866.37982877356,
 108478.70863800612,
 260633.29192324556,
 210955.01159845633,
 253400.95205289673,
 178995.74914569667,
 187801.22839657738,
 297199.9269281045,
 113861.54790020586,
 91201.36513939651,
 97402.8998013918,
 204741.56834515926,
 142756.8438968041,
 222394.7573219989,
 153371.20596312566,
 100199.50612685637,
 203413.66137230952,
 106863.54730263607,
 228000.07466849993,
 147153.9972663169,
 119092.96382584989,
 181527.79271144053,
 501192.40785861446,
 220946.37441762065,
 175872.04876252846,
 211619.8614816918,
 80462.31605063287,
 82956.61430816677,
 168269.00058926683,
 261429.94203747058,
 273444.23629641556,
 80581.1255672903,
 216042.09647975044,
 155243.3000280255,
 160531.0352146396,
 348039.1979857667,
 166120.57757272493,
 347764.4449965265,
 277507.02627