Source: https://github.com/mwaskom/seaborn-data (see https://en.wikipedia.org/wiki/Anscombe%27s_quartet)

1. longitude: A measure of how far west a house is; a higher value is farther west

2. latitude: A measure of how far north a house is; a higher value is farther north

3. housingMedianAge: Median age of a house within a block; a lower number is a newer building

4. totalRooms: Total number of rooms within a block

5. totalBedrooms: Total number of bedrooms within a block

6. population: Total number of people residing within a block

7. households: Total number of households, a group of people residing within a home unit, for a block

8. medianIncome: Median income for households within a block of houses (measured in tens of thousands of US Dollars)

9. medianHouseValue: Median house value for households within a block (measured in US Dollars)

10. oceanProximity: Location of the house w.r.t ocean/sea

In [None]:
!pip install --extra-index-url https://pypi-nightly.tensorflow.org/simple tensorflow-data-validation

Looking in indexes: https://pypi.org/simple, https://pypi-nightly.tensorflow.org/simple
Collecting tensorflow-data-validation
  Downloading tensorflow_data_validation-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
Collecting apache-beam[gcp]<3,>=2.47 (from tensorflow-data-validation)
  Downloading apache_beam-2.52.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.7/14.7 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow<11,>=10 (from tensorflow-data-validation)
  Downloading pyarrow-10.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.9/35.9 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyfarmhash<0.4,>=0.2.2 (from tensorflow-data-

In [None]:
# import tensorflow as tf
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split


from tensorflow_metadata.proto.v0 import schema_pb2

# print('TFDV Version: {}'.format(tfdv.__version__))
print('Tensorflow Version: {}'.format(tf.__version__))

Tensorflow Version: 2.15.0


In [None]:
import tensorflow_data_validation as tfdv

In [None]:
import pandas as pd

In [None]:
train_df = pd.read_csv('/content/sample_data/california_housing_train.csv')
val_df = pd.read_csv('/content/sample_data/california_housing_test.csv')

## Data Analysis

In [None]:
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [None]:
train_stats = tfdv.generate_statistics_from_dataframe(train_df)
tfdv.visualize_statistics(train_stats)

In [None]:
X_train = train_df.drop('median_house_value', axis = 1)
y_train = train_df['median_house_value']

Normalizing `longitude`, `latitude`, `total_rooms`, `total_bedrooms`, `population`, `households`

In [None]:
import pandas as pd
from sklearn import preprocessing

from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

column_names_to_normalize = ['longitude', 'latitude', 'total_rooms', 'total_bedrooms', 'population', 'households']
x = X_train[column_names_to_normalize].values
x_scaled = min_max_scaler.fit_transform(x)
normalized_features = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = X_train.index)
X_train[column_names_to_normalize] = normalized_features

Standardizing `housing_median_age`, `latitude`, `total_rooms`, `total_bedrooms`, `population`, `households`

In [None]:
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()

column_names_to_standardize = ['housing_median_age', 'median_income']
x = X_train[column_names_to_standardize].values
x_scaled = standard_scaler.fit_transform(x)

standardized_features = pd.DataFrame(x_scaled, columns=column_names_to_standardize, index = X_train.index)
X_train[column_names_to_standardize] = standardized_features

In [None]:
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,1.000000,0.175345,15.0,0.147885,0.198945,0.028364,0.077454,1.4936
1,0.984064,0.197662,19.0,0.201608,0.294848,0.031559,0.075974,1.8200
2,0.975100,0.122210,17.0,0.018927,0.026847,0.009249,0.019076,1.6509
3,0.974104,0.116897,14.0,0.039515,0.052142,0.014350,0.037000,3.1917
4,0.974104,0.109458,20.0,0.038276,0.050435,0.017405,0.042921,1.9250
...,...,...,...,...,...,...,...,...
16995,0.008964,0.854410,52.0,0.058389,0.060987,0.025337,0.060516,2.3571
16996,0.007968,0.866100,36.0,0.061869,0.081782,0.033381,0.076303,2.5179
16997,0.004980,0.988310,17.0,0.070515,0.082247,0.034782,0.074823,3.0313
16998,0.004980,0.984060,19.0,0.070384,0.085506,0.036296,0.078441,1.9797


In [None]:
X_train.shape

(17000, 9)

In [None]:
train_stats = tfdv.generate_statistics_from_dataframe(X_train)
# tfdv.visualize_statistics(train_stats)

In [None]:
import numpy as np
m, n = X_train.shape
ones = np.ones_like((m, 1))
ones.shape

(2,)

In [None]:
m

17000

In [None]:
import numpy as np
m, n = X_train.shape
ones = np.ones((m, 1))
X_train = np.concatenate((X_train, ones), axis=1)


In [None]:
# X: (17000, 9)
# W: (9, 1)

In [None]:
X_train.shape

(17000, 9)

In [None]:
theta = np.random.normal(size=X_train.shape[1]).reshape((X_train.shape[1], 1))

X_train.dot(theta).shape

y_train = y_train.values.reshape(-1, 1)

In [None]:
theta

array([[-0.49271283],
       [ 0.44551726],
       [-0.54568178],
       [ 2.01505048],
       [ 0.72854352],
       [ 0.30179488],
       [-0.8293782 ],
       [ 0.5635002 ],
       [-0.72414667]])

In [None]:
learning_rate = 0.5

def hypo(X, theta):
  return X.dot(theta)

def gradient(X, y, theta, m):
  gradient = 1/m * X.T.dot(hypo(X, theta) - y) # (2 x 1)
  return gradient

def loss(X, y, theta, m):
  return 1/(2 *m)* np.sum((hypo(X, theta) - y)**2)

epochs = 20000


for i in range(epochs):
  grad = gradient(X_train, y_train, theta, m)
  theta = theta - learning_rate * grad
  if (i+1) % 1000 == 0:
    print('Loss is: {}'.format(loss(X_train, y_train, theta, m)))


  theta = theta - learning_rate * grad


Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan
Loss is: nan


In [None]:
theta

array([[ -433283.21256995],
       [ -402839.40129082],
       [   14568.78393649],
       [ -299138.47876517],
       [  685196.15234926],
       [-1149864.22961332],
       [  231011.40093028],
       [   77073.03690252],
       [  536557.22731731]])

In [21]:
from sklearn.linear_model import LinearRegression
linearRegressor = LinearRegression().fit(X_train,y_train)
linearRegressor

In [22]:
linearRegressor.coef_

array([[-4.33121958e+05, -4.03930584e+05,  1.15069493e+03,
        -3.17828960e+05,  7.58127210e+05, -1.37324090e+06,
         2.76296332e+05,  4.05070684e+04,  0.00000000e+00]])