In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# License: MIT

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(0)

In [2]:
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
X, y = data.data, data.target
feature_names = data.feature_names

In [3]:
mat = np.column_stack((X, y))
df = pd.DataFrame(mat, columns=np.append(feature_names, 'MedianValue'))
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianValue
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression()

In [6]:
train_score = reg.score(X_train, y_train)
print('R2 score on the training set:', np.round(train_score, 5))

test_score = reg.score(X_test, y_test)
print('R2 score on the test set:', np.round(test_score, 5))

R2 score on the training set: 0.6089
R2 score on the test set: 0.59432


Discretization

In [7]:
from sklearn.preprocessing import KBinsDiscretizer

encoder = KBinsDiscretizer(n_bins=10, encode='onehot-dense')

In [8]:
longitude_bins = encoder.fit_transform(df[['Longitude']])
print(longitude_bins)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [9]:
longitude_labels = [f'Longitude{i}' for i in range(10)]
longitude_df = pd.DataFrame(longitude_bins, 
                            columns=longitude_labels)
df2 = pd.concat([df, longitude_df], axis=1)
df2.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianValue,Longitude0,Longitude1,Longitude2,Longitude3,Longitude4,Longitude5,Longitude6,Longitude7,Longitude8,Longitude9
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
latitude_bins = encoder.fit_transform(df[['Latitude']])
latitude_labels = [f'Latitude{i}' for i in range(10)]
latitude_df = pd.DataFrame(latitude_bins, 
                           columns=latitude_labels)
df3 = pd.concat([df2, latitude_df], axis=1)
df3.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianValue,Longitude0,...,Latitude0,Latitude1,Latitude2,Latitude3,Latitude4,Latitude5,Latitude6,Latitude7,Latitude8,Latitude9
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
df3 = df3.drop(['Longitude', 'Latitude'], axis=1)
df3.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,MedianValue,Longitude0,Longitude1,Longitude2,...,Latitude0,Latitude1,Latitude2,Latitude3,Latitude4,Latitude5,Latitude6,Latitude7,Latitude8,Latitude9
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,4.526,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,3.585,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,3.521,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,3.413,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,3.422,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
X = df3.drop('MedianValue', axis=1)
y = df3['MedianValue']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
reg.fit(X_train, y_train)

LinearRegression()

In [14]:
train_score = reg.score(X_train, y_train)
print('R2 score on the training set:', np.round(train_score, 5))

test_score = reg.score(X_test, y_test)
print('R2 score on the test set:', np.round(test_score, 5))

R2 score on the training set: 0.6405
R2 score on the test set: 0.60771
