# Load the Dataset

In [None]:
# Import the usual packages
import pandas as pd
import numpy as np

In [None]:
# Make Google Drive available inside the Colab runtime
from google.colab import drive

drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point)

Mounted at /content/drive/


In [None]:
# Folder on the mounted Drive that the data files are read from
# (nothing is loaded in this cell; it only records the location)
data_path = '/content/drive/MyDrive/SRH'

In [None]:
# Read the housing table from the data folder into a DataFrame
df = pd.read_csv(f'{data_path}/housing.csv')

In [None]:
# Preview the first rows to confirm the columns were parsed as expected
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Discard every row that contains at least one missing value
df = df.dropna()

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Split the table into the prediction target (y) and the predictor columns (X)
target_column = 'median_house_value'
feature_columns = [
    'households',
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'median_income',
    'ocean_proximity',
]

y = df[target_column]
X = df[feature_columns]

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and every score computed from it, reproducible across reruns.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Preview the training features; the row labels are the original df index values
X_train.head()

Unnamed: 0,households,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,median_income,ocean_proximity
20615,197.0,-121.54,39.08,23.0,1076.0,216.0,724.0,2.3598,INLAND
5543,441.0,-118.4,33.98,36.0,2526.0,452.0,996.0,5.611,<1H OCEAN
17,303.0,-122.27,37.85,52.0,1228.0,293.0,648.0,2.1202,NEAR BAY
14187,389.0,-117.06,32.71,21.0,1864.0,388.0,1498.0,3.8194,NEAR OCEAN
4328,775.0,-118.33,34.08,50.0,2989.0,832.0,1345.0,3.2426,<1H OCEAN


# Feature Engineering

Feature Engineering
Feature preprocessing means updating or transforming existing features (for example, scaling numeric columns or one-hot encoding a categorical column).

Feature generation means creating new features from the existing ones.

In [None]:
# Build the ColumnTransformer that prepares the model inputs:
# every numeric column gets its own StandardScaler step and
# ocean_proximity is one-hot encoded with its first category dropped.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# (step name, column) pairs; the order here fixes the output column order
scaled_columns = [
    ('household_scaler', 'households'),
    ('population_scaler', 'population'),
    ('long_scaler', 'longitude'),
    ('lat_scaler', 'latitude'),
    ('housing_median_age_scaler', 'housing_median_age'),
    ('total_rooms_scaler', 'total_rooms'),
    ('total_bedrooms_scaler', 'total_bedrooms'),
    ('median_income_scaler', 'median_income'),
]

# A fresh StandardScaler instance per column, exactly one column per step
scaling_steps = [
    (step_name, StandardScaler(), [column])
    for step_name, column in scaled_columns
]
encoding_step = ('ohe_ocean', OneHotEncoder(drop='first'), ['ocean_proximity'])

my_column_transformer = ColumnTransformer(
    transformers=scaling_steps + [encoding_step],
    remainder='passthrough',
)



In [None]:
# Fit the transformer on the training split only, then reuse the fitted
# transformer on the test split so test rows do not influence the scaling.
Xtrain_fe = my_column_transformer.fit_transform(X_train)
Xtest_fe = my_column_transformer.transform(X_test)

# Wrap BOTH splits in DataFrames that share the same column names. Leaving the
# test split as a bare array while the models are fitted on a DataFrame makes
# scikit-learn warn about missing feature names at predict time.
feature_names = my_column_transformer.get_feature_names_out()
Xtrain_fe = pd.DataFrame(Xtrain_fe, columns=feature_names)
Xtest_fe = pd.DataFrame(Xtest_fe, columns=feature_names)
Xtest_fe.shape

(4087, 12)

In [None]:
# Sanity check: within each split, features and target should have the same row count
Xtrain_fe.shape, y_train.shape, Xtest_fe.shape, y_test.shape


((16346, 12), (16346,), (4087, 12), (4087,))

In [None]:
# Inspect the engineered training features (scaled numerics plus one-hot columns)
Xtrain_fe.head()

Unnamed: 0,household_scaler__households,population_scaler__population,long_scaler__longitude,lat_scaler__latitude,housing_median_age_scaler__housing_median_age,total_rooms_scaler__total_rooms,total_bedrooms_scaler__total_bedrooms,median_income_scaler__median_income,ohe_ocean__ocean_proximity_INLAND,ohe_ocean__ocean_proximity_ISLAND,ohe_ocean__ocean_proximity_NEAR BAY,ohe_ocean__ocean_proximity_NEAR OCEAN
0,-0.802698,-0.643045,-0.988351,1.622621,-0.448861,-0.722659,-0.774001,-0.794986,1.0,0.0,0.0,0.0
1,-0.152759,-0.392085,0.58287,-0.772534,0.58424,-0.047372,-0.20409,0.917428,0.0,0.0,0.0,0.0
2,-0.520348,-0.713166,-1.353635,1.044966,1.855749,-0.651871,-0.588055,-0.921184,0.0,0.0,1.0,0.0
3,-0.291271,0.071084,1.253392,-1.368975,-0.6078,-0.355676,-0.358642,-0.026211,0.0,0.0,0.0,1.0
4,0.736911,-0.070081,0.617898,-0.725571,1.696811,0.168254,0.713564,-0.330013,0.0,0.0,0.0,0.0


# Build a Linear Regression Model

In [None]:
# Import the relevant sklearn packages
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
# Fit a linear regression on the engineered training features
lm = LinearRegression()
lm.fit(X=Xtrain_fe, y=y_train)

In [None]:
#Make Predictions from the Model
# Predict the target for the held-out test features with the linear model
ypred_lm = lm.predict(Xtest_fe)

In [None]:
# R^2 of the linear model on both splits. For a regressor, .score() is R^2,
# so the last printed value repeats the test score via r2_score.
train_r2 = lm.score(Xtrain_fe, y_train)
test_r2 = lm.score(Xtest_fe, y_test)
lm_r2_score = r2_score(y_test, ypred_lm)

print("train score :", train_r2)
print("test score  :", test_r2)
print(lm_r2_score)

train score : 0.6505601883434642
test score  : 0.6276871122376422
0.6276871122376422


In [None]:
# RMSE of the linear model on the test split. The square root is taken
# explicitly because mean_squared_error's squared=False shortcut is
# deprecated and no longer accepted by recent scikit-learn releases.
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, ypred_lm)))

Root Mean Squared Error: 70040.25190846891


In [None]:
# Fit a random forest on the same engineered features for comparison with the
# linear model. A fixed random_state makes the fitted forest, and therefore
# its scores, reproducible across reruns.
from sklearn.ensemble import RandomForestRegressor
rm = RandomForestRegressor(random_state=42)
rm.fit(Xtrain_fe,y_train)

In [None]:
#Make Predictions from the Model
# Predict the target for the held-out test features with the random forest
ypred_rm = rm.predict(Xtest_fe)

In [None]:
# R^2 of the random forest on the test split (compare with the linear model's test score)
rm_r2_score = r2_score(y_test, ypred_rm)
print("R2 Score:", rm_r2_score)

R2 Score: 0.8223188788913299


In [None]:
# RMSE of the random forest on the test split. The square root is taken
# explicitly because mean_squared_error's squared=False shortcut is
# deprecated and no longer accepted by recent scikit-learn releases.
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, ypred_rm)))

Root Mean Squared Error: 48385.39685100488
