<a href="https://colab.research.google.com/github/provincit/colab_public/blob/main/boston_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np, pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:


br = '\n'
boston = load_boston()
X = boston.data
y = boston.target
print ('feature shape', X.shape)
print ('target shape', y.shape, br)
keys = boston.keys()
rfr = RandomForestRegressor(random_state=0, n_estimators=100)
rfr.fit(X, y)
features = boston.feature_names
feature_importances = rfr.feature_importances_
importance = sorted(zip(feature_importances, features), reverse=True)

[print (row) for row in importance]
print ()
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=0)
rfr = RandomForestRegressor(random_state=0, n_estimators=100)
rfr.fit(X_train, y_train)
rfr_name = rfr.__class__.__name__
y_pred = rfr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print (rfr_name + ' (rmse):', rmse, br)
cols = list(features) + ['target']
data = pd.DataFrame(data=np.c_[X, y], columns=cols)
print ('boston dataset sample:')
print (data[['RM', 'LSTAT', 'DIS', 'CRIM', 'NOX', 'PTRATIO', 'target']].
head(3), br)
print ('data set before removing noise:', data.shape)
noise = data.loc[data['target'] >= 50]
data = data.drop(noise.index)
print ('data set without noise:', data.shape, br)
X = data.loc[:, data.columns != 'target'].values
y = data['target'].values
print ('cleansed feature shape:', X.shape)
print ('cleansed target shape:', y.shape, br)
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=0)
rfr = RandomForestRegressor(random_state=0, n_estimators=100)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print (rfr_name + ' (rmse):', rmse)

feature shape (506, 13)
target shape (506,) 

(0.45730362625767507, 'RM')
(0.3500866188568138, 'LSTAT')
(0.06518862820215897, 'DIS')
(0.040989617257000996, 'CRIM')
(0.02024797563034355, 'NOX')
(0.015576365835498521, 'PTRATIO')
(0.015524054184831325, 'TAX')
(0.01176430855604393, 'AGE')
(0.011324966974602937, 'B')
(0.005912139937999769, 'INDUS')
(0.003916064249793194, 'RAD')
(0.0011173446269339181, 'ZN')
(0.001048289430304092, 'CHAS')

RandomForestRegressor (rmse): 4.091149842219918 

boston dataset sample:
      RM  LSTAT     DIS     CRIM    NOX  PTRATIO  target
0  6.575   4.98  4.0900  0.00632  0.538     15.3    24.0
1  6.421   9.14  4.9671  0.02731  0.469     17.8    21.6
2  7.185   4.03  4.9671  0.02729  0.469     17.8    34.7 

data set before removing noise: (506, 14)
data set without noise: (490, 14) 

cleansed feature shape: (490, 13)
cleansed target shape: (490,) 

RandomForestRegressor (rmse): 3.37169151536684


Exploring boston data with regression algorithms

In [3]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [4]:
def get_scores(model, Xtest, ytest):
 y_pred = model.predict(Xtest)
 return np.sqrt(mean_squared_error(ytest, y_pred)), model.__class__.__name__

br = '\n'

boston = load_boston()
X = boston.data
y = boston.target

print ('feature shape', X.shape)
print ('target shape', y.shape, br)
X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=0)
print ('rmse:')
rfr = RandomForestRegressor(random_state=0, n_estimators=100)
rfr.fit(X_train, y_train)
rmse, rfr_name = get_scores(rfr, X_test, y_test)
print (rmse, '(' + rfr_name + ')')
lr = LinearRegression().fit(X_train, y_train)
rmse, lr_name = get_scores(lr, X_test, y_test)
print (rmse, '(' + lr_name + ')')
ridge = Ridge(random_state=0).fit(X_train, y_train)
rmse, ridge_name = get_scores(ridge, X_test, y_test)
print (rmse, '(' + ridge_name + ')')
lasso = Lasso(random_state=0).fit(X_train, y_train)
rmse, lasso_name = get_scores(lasso, X_test, y_test)
print (rmse, '(' + lasso_name + ')')
en = ElasticNet(random_state=0).fit(X_train, y_train)
rmse, en_name = get_scores(en, X_test, y_test)
print (rmse, '(' + en_name + ')')
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.fit_transform(X_test)
sgdr_std = SGDRegressor(random_state=0, max_iter=1000, tol=0.001)
sgdr_std.fit(X_train_std, y_train)
rmse, sgdr_name = get_scores(sgdr_std, X_test_std, y_test)
print (rmse, '(' + sgdr_name + ' - scaled)')

feature shape (506, 13)
target shape (506,) 

rmse:
4.091149842219918 (RandomForestRegressor)
5.457311159564069 (LinearRegression)
5.523126267867206 (Ridge)
6.052422661571132 (Lasso)
5.993473468736907 (ElasticNet)
5.614093877555352 (SGDRegressor - scaled)
