## Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load the Dataset

In [2]:
df = pd.read_csv(r"C:\Users\Raja Alamsyah\1553768847-housing.csv")

df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [3]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [4]:
X

array([[-122.23, 37.88, 41, ..., 126, 8.3252, 'NEAR BAY'],
       [-122.22, 37.86, 21, ..., 1138, 8.3014, 'NEAR BAY'],
       [-122.24, 37.85, 52, ..., 177, 7.2574, 'NEAR BAY'],
       ...,
       [-121.22, 39.43, 17, ..., 433, 1.7, 'INLAND'],
       [-121.32, 39.43, 18, ..., 349, 1.8672, 'INLAND'],
       [-121.24, 39.37, 16, ..., 530, 2.3886, 'INLAND']], dtype=object)

In [5]:
y

array([452600, 358500, 352100, ...,  92300,  84700,  89400], dtype=int64)

## Missing Data

In [6]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 4:5])
X[:, 4:5] = imputer.transform(X[:, 4:5])

In [7]:
print(X)

[[-122.23 37.88 41 ... 126 8.3252 'NEAR BAY']
 [-122.22 37.86 21 ... 1138 8.3014 'NEAR BAY']
 [-122.24 37.85 52 ... 177 7.2574 'NEAR BAY']
 ...
 [-121.22 39.43 17 ... 433 1.7 'INLAND']
 [-121.32 39.43 18 ... 349 1.8672 'INLAND']
 [-121.24 39.37 16 ... 530 2.3886 'INLAND']]


## Categorical Data

**Encoding the Independent Variable**

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse=False), [-1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [9]:
X

array([[0.0, 0.0, 0.0, ..., 322, 126, 8.3252],
       [0.0, 0.0, 0.0, ..., 2401, 1138, 8.3014],
       [0.0, 0.0, 0.0, ..., 496, 177, 7.2574],
       ...,
       [0.0, 1.0, 0.0, ..., 1007, 433, 1.7],
       [0.0, 1.0, 0.0, ..., 741, 349, 1.8672],
       [0.0, 1.0, 0.0, ..., 1387, 530, 2.3886]], dtype=object)

## Split Train Set & Test Set

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2 , random_state=0)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(16512, 13)
(4128, 13)
(16512,)
(4128,)


## Scaling Data

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()

In [14]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
print(X_train.shape)
print(X_test.shape)

(16512, 13)
(4128, 13)


## Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
linreg_model = LinearRegression()

In [18]:
linreg_model.fit(X_train, y_train)

In [19]:
y_pred = linreg_model.predict(X_test)

In [20]:
y_pred_value = pd.DataFrame(y_pred).rename(columns={0:"Predicted House Value"})
y_pred_value

Unnamed: 0,Predicted House Value
0,217715.799943
1,289843.799943
2,179411.799943
3,86771.799943
4,288595.799943
...,...
4123,173779.799943
4124,247635.799943
4125,88947.799943
4126,255251.799943


In [21]:
y_test_value = pd.DataFrame(y_test).rename(columns={0:"Median House Values"})
y_test_value

Unnamed: 0,Median House Values
0,136900
1,241300
2,200700
3,72500
4,460000
...,...
4123,169500
4124,204600
4125,128600
4126,259500


In [22]:
y_pred_value = pd.concat([y_pred_value, y_test_value], axis=1)
y_pred_value

Unnamed: 0,Predicted House Value,Median House Values
0,217715.799943,136900
1,289843.799943,241300
2,179411.799943,200700
3,86771.799943,72500
4,288595.799943,460000
...,...,...
4123,173779.799943,169500
4124,247635.799943,204600
4125,88947.799943,128600
4126,255251.799943,259500


In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [24]:
MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)

In [25]:
MAE

49851.856480854396

In [26]:
RMSE

68724.39712712051

In [54]:
# Training set Performance

X_pred = linreg_model.predict(X_train)
MAE = mean_absolute_error(y_train, X_pred)
MAE

50071.07199836054

In [55]:
linreg_model.coef_

array([-8.60599114e+17, -8.05871731e+17, -2.33512375e+16, -5.43103546e+17,
       -5.82601609e+17, -5.38178791e+04, -5.45079776e+04,  1.34982296e+04,
       -9.35650956e+03,  2.83043742e+04, -4.42100994e+04,  2.98406162e+04,
        7.39391512e+04])

## Ridge Cross_Validation Regression

In [56]:
from sklearn.linear_model import RidgeCV

In [57]:
ridge_cv_model = RidgeCV(alphas=(0.1, 1.0, 10.0),scoring='neg_mean_absolute_error')

In [58]:
ridge_cv_model.fit(X_train, y_train)

In [59]:
ridge_cv_model.alpha_

10.0

In [60]:
y_pred = ridge_cv_model.predict(X_test)

In [61]:
y_pred_value = pd.DataFrame(y_pred).rename(columns={0:"Predicted House Value"})
y_pred_value

Unnamed: 0,Predicted House Value
0,216766.080379
1,287615.421225
2,179368.477084
3,86832.279188
4,291532.093962
...,...
4123,171637.746198
4124,245400.324634
4125,89599.255254
4126,254742.304134


In [62]:
y_test_value = pd.DataFrame(y_test).rename(columns={0:"Median House Values"})
y_test_value

Unnamed: 0,Median House Values
0,136900
1,241300
2,200700
3,72500
4,460000
...,...
4123,169500
4124,204600
4125,128600
4126,259500


In [63]:
y_pred_value = pd.concat([y_pred_value, y_test_value], axis=1)
y_pred_value

Unnamed: 0,Predicted House Value,Median House Values
0,216766.080379,136900
1,287615.421225,241300
2,179368.477084,200700
3,86832.279188,72500
4,291532.093962,460000
...,...,...
4123,171637.746198,169500
4124,245400.324634,204600
4125,89599.255254,128600
4126,254742.304134,259500


In [64]:
MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)

In [65]:
MAE

49690.48875659401

In [66]:
RMSE

68690.08542033791

In [67]:
# Training set Performance

X_pred = ridge_cv_model.predict(X_train)
MAE = mean_absolute_error(y_train, X_pred)
MAE

49902.39030551201

In [68]:
ridge_cv_model.coef_

array([  6063.21915521, -12601.90587264,   2348.06250082,   2518.17775902,
         6033.38057162, -52808.79019094, -53394.99162374,  13530.23387135,
        -9166.90266824,  28132.02636157, -43981.8776743 ,  29704.02328175,
        73834.34775568])

## Lasso Cross-Validation Regression

In [69]:
from sklearn.linear_model import LassoCV

In [70]:
lasso_cv_model = LassoCV(eps=0.1,n_alphas=100,cv=5)

In [71]:
lasso_cv_model.fit(X_train, y_train)

In [72]:
lasso_cv_model.alpha_

8014.383260633453

In [73]:
y_pred = lasso_cv_model.predict(X_test)

In [74]:
y_pred_value = pd.DataFrame(y_pred).rename(columns={0:"Predicted House Value"})
y_pred_value

Unnamed: 0,Predicted House Value
0,235506.347873
1,294530.547017
2,177697.450015
3,115848.433148
4,265646.660799
...,...
4123,182473.710192
4124,248373.196530
4125,132263.915133
4126,234983.967060


In [75]:
y_test_value = pd.DataFrame(y_test).rename(columns={0:"Median House Values"})
y_test_value

Unnamed: 0,Median House Values
0,136900
1,241300
2,200700
3,72500
4,460000
...,...
4123,169500
4124,204600
4125,128600
4126,259500


In [76]:
y_pred_value = pd.concat([y_pred_value, y_test_value], axis=1)
y_pred_value

Unnamed: 0,Predicted House Value,Median House Values
0,235506.347873,136900
1,294530.547017,241300
2,177697.450015,200700
3,115848.433148,72500
4,265646.660799,460000
...,...,...
4123,182473.710192,169500
4124,248373.196530,204600
4125,132263.915133,128600
4126,234983.967060,259500


In [77]:
MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)

In [78]:
MAE

55201.14697191569

In [79]:
RMSE

75041.08451873489

In [80]:
# Training set Performance

X_pred = lasso_cv_model.predict(X_train)
MAE = mean_absolute_error(y_train, X_pred)
MAE

54954.975027793545

In [81]:
lasso_cv_model.coef_

array([    -0.        , -31163.06045201,      0.        ,      0.        ,
            0.        ,     -0.        ,     -0.        ,   4460.24713936,
            0.        ,      0.        ,     -0.        ,      0.        ,
        64991.83513701])

## ElasticNet

In [82]:
from sklearn.linear_model import ElasticNetCV

In [83]:
elastic_model = ElasticNetCV(l1_ratio=[.1, .5, .7,.9, .95, .99, 1],tol=0.01)

In [84]:
elastic_model.fit(X_train, y_train)

In [85]:
elastic_model.l1_ratio_

1.0

In [86]:
y_pred = elastic_model.predict(X_test)

In [87]:
y_pred_value = pd.DataFrame(y_pred).rename(columns={0:"Predicted House Value"})
y_pred_value

Unnamed: 0,Predicted House Value
0,217119.668059
1,288072.521576
2,179232.560568
3,87603.425170
4,291280.283132
...,...
4123,171580.123081
4124,245539.653510
4125,90346.120994
4126,253951.949175


In [88]:
y_test_value = pd.DataFrame(y_test).rename(columns={0:"Median House Values"})
y_test_value

Unnamed: 0,Median House Values
0,136900
1,241300
2,200700
3,72500
4,460000
...,...
4123,169500
4124,204600
4125,128600
4126,259500


In [89]:
y_pred_value = pd.concat([y_pred_value, y_test_value], axis=1)
y_pred_value

Unnamed: 0,Predicted House Value,Median House Values
0,217119.668059,136900
1,288072.521576,241300
2,179232.560568,200700
3,87603.425170,72500
4,291280.283132,460000
...,...,...
4123,171580.123081,169500
4124,245539.653510,204600
4125,90346.120994,128600
4126,253951.949175,259500


In [90]:
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
RMSE = np.sqrt(MSE)

In [91]:
MAE

49697.286062123036

In [92]:
RMSE

68705.89294503543

In [93]:
# Training set Performace

X_pred = elastic_model.predict(X_train)
MAE = mean_absolute_error(y_train, X_pred)
MAE

49901.57537117464

In [94]:
elastic_model.coef_

array([ 1.62348653e+03, -1.72477849e+04,  2.15819285e+03, -4.76760854e+01,
        3.07339048e+03, -5.11139135e+04, -5.17516211e+04,  1.34808159e+04,
       -7.55088493e+03,  2.63831933e+04, -4.37397082e+04,  2.96361797e+04,
        7.34850793e+04])