In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = fetch_california_housing()

In [3]:
housing_data = pd.DataFrame(df.data, columns=df.feature_names)
y=df.target

In [4]:
housing_data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [5]:
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [6]:
print(df["DESCR"])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [7]:
housing_data.shape

(20640, 8)

In [8]:
housing_data.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
7,3.12,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25


In [9]:
housing_target=pd.DataFrame(y, columns=['MedHouseVal'])
housing_data_new=housing_data.copy()

In [10]:
housing_data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [11]:
housing_data_new["MedHouseVal"]=housing_target

In [12]:
housing_data_new

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [13]:
correlation_matrix = housing_data_new.corr()
corr=correlation_matrix['MedHouseVal'].sort_values(ascending=False)
corr

MedHouseVal    1.000000
MedInc         0.688075
AveRooms       0.151948
HouseAge       0.105623
AveOccup      -0.023737
Population    -0.024650
Longitude     -0.045967
AveBedrms     -0.046701
Latitude      -0.144160
Name: MedHouseVal, dtype: float64

In [14]:
housing_data.drop(columns=['Longitude', 'Latitude'], inplace=True)

In [15]:
(housing_data_new['MedHouseVal']==0).value_counts()

False    20640
Name: MedHouseVal, dtype: int64

# Linear Regression

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(housing_data, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
np.sum(y==0)

0

In [17]:
from sklearn.linear_model import LinearRegression, SGDRegressor
lin_reg=LinearRegression()
lin_reg.fit(X_train, y_train)

In [18]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred=lin_reg.predict(X_test)

In [19]:
np.any(y_pred < 0)

True

In [20]:
np.sum(y_pred<0)

2

In [21]:
np.sum(y_test==0)

0

In [22]:
zer=np.where(y_test==0)

In [23]:
neg=np.where(y_pred<0)

In [24]:
y_pred = np.delete(y_pred, neg)
y_test = np.delete(y_test, neg)
X_test = np.delete(X_test, neg, axis=0)

print("Number of negative values removed:", len(neg[0]))
print("Shape of filtered y_pred:", y_pred.shape)
print("Shape of filtered y_test:", y_test.shape)
print("Shape of filtered X_test:", X_test.shape)

Number of negative values removed: 2
Shape of filtered y_pred: (4126,)
Shape of filtered y_test: (4126,)
Shape of filtered X_test: (4126, 6)


In [25]:
y_pred_log=np.log(y_pred)
y_test_log=np.log(y_test)

In [26]:
lin_mse=mean_squared_error(y_test_log, y_pred_log)
lin_rmse=np.sqrt(lin_mse)
r2 = r2_score(y_test, y_pred)
print(lin_rmse, r2)

0.4054962136852995 0.5111486631856645


# SGD Regressor

In [None]:
sg_reg=SGDRegressor(max_iter=1000)

In [None]:
sg_reg.fit(X_train, y_train)

In [None]:
y_pred_sg=sg_reg.predict(X_test)
y_pred_sg=np.log(y_pred_sg)
sg_reg_mse=mean_squared_error(y_test, y_pred_sg)
sg_reg_rmse=np.sqrt(sg_reg_mse)
sg_reg_r2 = r2_score(y_test, y_pred_sg)
print(sg_reg_rmse, sg_reg_r2)

# Elastic Net

In [None]:
from sklearn.linear_model import ElasticNet

elastic_reg=ElasticNet(alpha=0.01, l1_ratio=0.3, random_state=42)
elastic_reg.fit(X_train, y_train)

In [None]:
y_pred_elastic=elastic_reg.predict(X_test)
y_pred_elastic=np.log(y_pred_elastic)
elastic_reg_mse=mean_squared_error(y_test, y_pred_elastic)
elastic_reg_rmse=np.sqrt(elastic_reg_mse)
elastic_reg_r2 = r2_score(y_test, y_pred_elastic)
print(elastic_reg_rmse, elastic_reg_r2)

# Random Forest Regressor

In [28]:
from sklearn.ensemble import RandomForestRegressor

rf_reg=RandomForestRegressor(n_estimators=150, max_depth=10, max_features=3)
rf_reg.fit(X_train, y_train)

In [29]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred_rf=rf_reg.predict(X_test)
y_predlog_rf=np.log(y_pred_rf)
y_test_log=np.log(y_test)
rf_reg_mse=mean_squared_error(y_test_log, y_predlog_rf)
rf_reg_rmse=np.sqrt(rf_reg_mse)
rf_reg_r2 = r2_score(y_test, y_pred_rf)
print(rf_reg_rmse, rf_reg_r2)

0.3426918198959394 0.6793713049828287


In [36]:
from sklearn.model_selection import cross_val_score

scores=cross_val_score(rf_reg, X_test, y_test,
                      scoring="neg_mean_squared_log_error", cv=10)
print(-scores)

[0.04455682 0.04156586 0.0438427  0.05514847 0.04421769 0.03790452
 0.04622447 0.05088704 0.04435689 0.04068664]


In [None]:
from sklearn.model_selection import GridSearchCV
param_grid=[{"n_estimators": [10, 30, 50, 70, 100], 
             'max_depth': [2, 4, 6, 8, 10], 
             'max_features':[2, 3, 4, 5, 6]}]

forest_reg=RandomForestRegressor()

grid_search=GridSearchCV(forest_reg, 
                         param_grid, 
                         cv=5, 
                         scoring="neg_mean_squared_log_error", 
                         return_train_score=True)

grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [41]:
from sklearn.model_selection import GridSearchCV
param_grid_2=[{"n_estimators": [100, 150, 200], 
             'max_depth': [10, 12, 15]}]

forest_reg=RandomForestRegressor()

grid_search=GridSearchCV(forest_reg, 
                         param_grid_2, 
                         cv=5, 
                         scoring="neg_mean_squared_log_error", 
                         return_train_score=True)

grid_search.fit(X_train, y_train)

In [42]:
grid_search.best_params_

{'max_depth': 12, 'n_estimators': 200}

In [None]:
import sklearn
sklearn.metrics.get_scorer_names()