In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from keras.models import Model, load_model
from keras.layers import Dense, BatchNormalization, Activation, Input
from keras.activations import relu
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
from keras.utils import plot_model
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')
plt.rcParams['font.size'] = 12  # matplotlib font size used by the figures
sns.set_palette('Pastel1')  # seaborn colour palette

In [None]:
# Notebook-wide configuration
data_path = '../input/boston-housing-dataset/HousingData.csv'  # CSV loaded with pd.read_csv
random_state = 42  # seed handed to train_test_split and QuantileTransformer
default_scoring = 'neg_root_mean_squared_error'  # default `scoring` of getScores / doTuneCV

##### Description 
1. CRIM - per capita crime rate by town
2. ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
3. INDUS - proportion of non-retail business acres per town.
4. CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
5. NOX - nitric oxides concentration (parts per 10 million)
6. RM - average number of rooms per dwelling
7. AGE - proportion of owner-occupied units built prior to 1940
8. DIS - weighted distances to five Boston employment centres
9. RAD - index of accessibility to radial highways
10. TAX - full-value property-tax rate per 10,000 USD
11. PTRATIO - pupil-teacher ratio by town
12. B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13. LSTAT - % lower status of the population
14. MEDV - Median value of owner-occupied homes in USD 1000's

**Target to be predicted:** `MEDV`


In [None]:
# Import data: read the CSV at `data_path` into a DataFrame and preview its first rows
data = pd.read_csv(data_path)
data.head()

In [None]:
# data analysis & visualization: start with the column dtypes and non-null counts
data.info() # the data contains 506 rows, 14 columns in total

In [None]:
# categorical columns: cast CHAS and RAD to non-numeric dtypes so that the dtype-based
# selection further down treats them as categorical features.
# NOTE(review): CHAS becomes 'string' while RAD becomes 'category' — confirm that using
# two different dtypes is intentional.
data.CHAS = data.CHAS.astype('string')
data.RAD = data.RAD.astype('category')

In [None]:
data.describe() # summary statistics (count, mean, std, quartiles, ...) per column

In [None]:
# split the frame into the prediction target and the remaining feature columns
TARGET_COL = 'MEDV'
FEATURES = data.drop(TARGET_COL, axis=1)
TARGET = data[TARGET_COL]

In [None]:
# partition the features by dtype: everything that is not category/string is numeric
num_cols = FEATURES.select_dtypes(exclude=['category', 'string'])
cat_cols = FEATURES.select_dtypes(include=['category', 'string'])

# one kernel-density figure per numeric feature
for feature in num_cols.columns:
    axes = num_cols[[feature]].plot.kde()
    axes.set_title(feature)
    plt.show()

In [None]:
# correlation analysis: heatmap of the pairwise correlations between numeric features
sns.set_theme(style="white")

corr = num_cols.corr()
# mask the upper triangle (diagonal included) because the matrix is symmetric
mask = np.triu(np.ones(corr.shape, dtype=bool))

# NOTE(review): center=0.75 together with vmin=-1 / vmax=1 is unusual for a
# correlation matrix — confirm the colour scaling is what was intended.
heatmap_style = dict(cmap='OrRd', annot=True, fmt='.2f', linewidths=.5,
                     center=0.75, vmin=-1, vmax=1)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(corr, mask=mask, ax=ax, **heatmap_style)
ax.set_title('Feature correlation plot')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Select the top 5 features most correlated with the target.
# Observation: house price is lower for areas with a higher % of lower status population.

# Only numeric columns can be correlated: CHAS ('string') and RAD ('category') are
# excluded explicitly because DataFrame.corr() raises on non-numeric columns in
# recent pandas releases instead of silently dropping them.
corr_wtarget = data.select_dtypes(include='number').corr()[TARGET_COL]
df_corr = pd.DataFrame({'magnitude': np.abs(corr_wtarget)})
df_corr['direction'] = corr_wtarget.apply(lambda x: 'positive' if x > 0 else 'negative')
# the top row is the target's correlation with itself, so take 6 rows and skip the first
df_corr.sort_values('magnitude', ascending=False).head(6)[1:]

In [None]:
# Mean of B within each quintile of the per-capita crime rate (CRIM).
# NOTE(review): B is defined as 1000(Bk - 0.63)^2, which is not monotonic in Bk, so a
# lower mean B does not by itself mean a lower proportion Bk — interpret with care.
copy = FEATURES.copy()
copy['CRIM_BIN'] = pd.qcut(copy.CRIM, q=5)
# Aggregate only B: the grouping key itself is an interval category and cannot be
# averaged (selecting it alongside B fails on recent pandas).
copy.groupby('CRIM_BIN')['B'].mean().reset_index()

In [None]:
# Compare house prices (MEDV) between tracts that bound the river and those that do not.
# The minimum alone says little about typical prices, so the median and mean are
# reported next to it. String aggregator names avoid passing Python builtins to agg().
data.groupby('CHAS').MEDV.agg(['min', 'median', 'mean']).reset_index()

In [None]:
# count plot: number of rows per level of each categorical feature
for c in cat_cols.columns:
    # size() counts rows per group. count() would instead report the non-null values of
    # whichever column happens to come first, under-counting groups with missing data.
    df = data.groupby(c).size().reset_index(name='count')
    sns.barplot(x=df[c], y=df['count'], palette='Paired')
    plt.title(f'Row count by {c}')
    plt.show()

In [None]:
data.isna().sum() # number of missing values in each column

In [None]:
# fill missing values: numeric gaps get the column median, CHAS gets its own label
# NOTE(review): the medians are computed on all rows before the train/test split, so
# test rows influence the imputed values — consider imputing from the training split only.
na_cols = ['CRIM', 'ZN', 'INDUS', 'AGE', 'LSTAT']
FEATURES[na_cols] = FEATURES[na_cols].fillna(FEATURES[na_cols].median())
FEATURES['CHAS'] = FEATURES['CHAS'].fillna('2.0') # 2 = unknown CHAS

In [None]:
FEATURES.isna().sum() # sanity check after imputation: no missing values left

In [None]:
# data splitting: seeded train/test split using train_test_split's default split size
train_X, test_X, y_train, y_test = train_test_split(FEATURES, TARGET,random_state=random_state)

In [None]:
# preprocessing: standardise the numeric features and one-hot encode the categorical ones
scaler = StandardScaler()
# the scaler is fitted on the training split only; the test split reuses its statistics.
# Column names are kept so the frames stay readable after scaling.
train = pd.DataFrame(scaler.fit_transform(train_X[num_cols.columns]), columns=list(num_cols.columns))
test = pd.DataFrame(scaler.transform(test_X[num_cols.columns]), columns=list(num_cols.columns))

cat_train = pd.get_dummies(train_X[cat_cols.columns]).reset_index(drop=True)
# Encoding each split on its own can produce different dummy columns when a level is
# missing from one split, so the test dummies are aligned to the training columns
# (levels absent from the test split become all-zero columns).
cat_test = (
    pd.get_dummies(test_X[cat_cols.columns])
    .reindex(columns=cat_train.columns, fill_value=0)
    .reset_index(drop=True)
)

X_train = pd.concat([train, cat_train], axis=1)
X_test = pd.concat([test, cat_test], axis=1)

In [None]:
# reset columns: numeric feature names first, followed by the one-hot column names
X_train.columns = [*num_cols.columns, *cat_train.columns]
X_test.columns = [*num_cols.columns, *cat_test.columns]

In [None]:
def getScores(pipelines, X, y, cv=5, scoring=default_scoring):
    '''
    Cross-validate every estimator in `pipelines`, print a short report for each
    one and return the average scores.

    pipelines: dict whose values are (display name, estimator) tuples
    X, y: features and target forwarded to cross_val_score
    cv: cross-validation setting forwarded to cross_val_score
    scoring: scorer name forwarded to cross_val_score and echoed in the report

    returns: list of [display name, mean cross-validation score] entries,
             in the iteration order of `pipelines`
    '''
    results = []
    # the dict keys are not needed for the report, only the (name, estimator) pairs
    for label, estimator in pipelines.values():
        fold_scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)
        mean_score = np.mean(fold_scores)
        print()
        print(f"Algorithm : {label}")
        print(f"Average validation {scoring}: {mean_score}")
        print(f"Maximum validation {scoring}:  {np.max(fold_scores)}")
        results.append([label, mean_score])
    return results

In [None]:
# combine models so that we can loop through them
# Each entry maps a short key to a (display name, estimator) tuple.
# Estimators with internal randomness receive the notebook-wide `random_state`
# so that cross-validation scores are reproducible between runs.
pipelines = {
    'lr': ('linear regression', LinearRegression()),
    'lasso': ('lasso regression', Lasso()),
    'rd': ('ridge regression', Ridge()),
    'knn': ('knn regression', KNeighborsRegressor()),
    'svr': ('support vector regression', SVR()),
    'dt': ('decesion tree regression', DecisionTreeRegressor(random_state=random_state)),
    'rf': ('random forest regression', RandomForestRegressor(random_state=random_state)),
    'et': ('extra tree regression', ExtraTreeRegressor(random_state=random_state)),
    'gb': ('gradient boosting regression', GradientBoostingRegressor(random_state=random_state)),
    'bg': ('bagging regression', BaggingRegressor(random_state=random_state)),
    'adb': ('adaboost regression', AdaBoostRegressor(random_state=random_state)),
}

In [None]:
# cross-validate every model in `pipelines` on the preprocessed training data
scores = getScores(pipelines, X_train, y_train)

In [None]:
# display results of cross validation sorted by score descendingly
def show_scores(scores):
    """Return the [algorithm, score] entries as a DataFrame ranked from highest to lowest score.

    scores: iterable of [name, score] pairs
    returns: DataFrame with columns 'algorithm' and 'score' and a fresh 0..n-1 index
    """
    table = pd.DataFrame(scores, columns=['algorithm', 'score'])
    ranked = table.sort_values('score', ascending=False)
    return ranked.reset_index(drop=True)

In [None]:
# ranked table of the cross-validation scores collected above
show_scores(scores)

In [None]:
# help do hyper parameter tuning
def doTuneCV(pipeline, X, y, space, cv=5,  scoring=default_scoring):
    """Run a grid search over `space` for `pipeline` and report the best setting.

    pipeline: estimator to tune
    X, y: training features and target
    space: parameter grid handed to GridSearchCV
    cv, scoring: forwarded to GridSearchCV

    returns: the fitted GridSearchCV object
    """
    # fit() hands back the search object itself, so it can be built and fitted in one step
    result = GridSearchCV(pipeline, space, cv=cv, scoring=scoring).fit(X, y)
    print(f"Best params: {result.best_params_}")
    print(f"Best validation {scoring} : {result.best_score_}")
    return result

In [None]:
# search space of hyper parameters
space = {
    'n_estimators': range(50, 150, 10),
    # None (use all features) replaces 'auto': for gradient boosting 'auto' meant
    # "all features", and recent scikit-learn releases no longer accept 'auto'.
    'max_features': [None, 'sqrt', 'log2'],
}

In [None]:
# tune gradient boosting; the estimator is seeded so the grid search is reproducible
GB_tuned = doTuneCV(GradientBoostingRegressor(random_state=random_state), X_train, y_train, space)

In [None]:
# plot feature importance of the best estimator found by the grid search
importances = GB_tuned.best_estimator_.feature_importances_
fig = plt.figure(figsize=(10,8))
sns.barplot(x=importances, y=X_train.columns, palette='Pastel1')
plt.show()

In [None]:
def evaluate(model, best_params, X_train, y_train, X_test, y_test):
    """Configure `model`, fit it on the training data and report train/test metrics.

    model: estimator exposing set_params / fit / predict; it is modified in place
    best_params: dict of parameters applied through set_params (may be empty)
    X_train, y_train: data used for fitting and for the training-set RMSE
    X_test, y_test: held-out data used for the test RMSE and R^2

    returns: the fitted `model`
    """
    model.set_params(**best_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # RMSE is computed as sqrt(MSE) rather than through the `squared=False` flag,
    # which newer scikit-learn releases deprecate and remove.
    train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # the labels name the metric actually printed (RMSE, not MSE)
    print(f"rmse on training set: {train_rmse}")
    print(f"rmse on unseen test set: {test_rmse}")
    print(f"r2 on unseen test set: {r2_score(y_test, y_pred)}")
    return model

In [None]:
# refit gradient boosting with the tuned parameters; seeded so the reported metrics are reproducible
best_model = evaluate(GradientBoostingRegressor(random_state=random_state), GB_tuned.best_params_, X_train, y_train, X_test, y_test)

In [None]:
# plot the distribution of absolute prediction errors
def plot_error_dist(model, X_test, y_test):
    """Draw a histogram of |actual - predicted| for `model` on the given test data.

    model: fitted estimator exposing predict
    X_test, y_test: features and true targets to compare against the predictions
    """
    predictions = model.predict(X_test)
    abs_errors = [np.abs(actual - predicted) for actual, predicted in zip(y_test, predictions)]
    fig, ax = plt.subplots(figsize=(4,3))
    sns.histplot(abs_errors, alpha=0.5, ax=ax, color='#ff8400')
    plt.xlabel('error')
    plt.ylabel('count')
    plt.show()

In [None]:
# most of our errors have low values
plot_error_dist(best_model, X_test, y_test)

In [None]:
# ensemble that combines the predictions of several different regressors
# (stochastic members are seeded with `random_state` for reproducible scores)
reg = VotingRegressor([
    ('knn', KNeighborsRegressor()),
    ('ridge', Ridge()),
    # named 'gb' because the member is a gradient boosting model, not a bagging one
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=random_state)),
    ('extra', ExtraTreeRegressor(random_state=random_state)),
])
# cross validated score of the voting regressor
np.mean(cross_val_score(reg, X_train, y_train, cv=5, scoring=default_scoring))

In [None]:
# fit the ensemble as-is (empty parameter dict) and report train/test metrics
stacked_model = evaluate(reg, {}, X_train, y_train, X_test, y_test)

In [None]:
# absolute-error histogram of the ensemble on the test set
plot_error_dist(stacked_model, X_test, y_test)

In [None]:
# quantile transform some columns to have gaussian distribution
quantile_cols = ['DIS', 'CRIM', 'LSTAT']

# drop() returns new frames, so X_train / X_test themselves are left untouched
copy_train = X_train.drop(columns=['INDUS','AGE'])
copy_test = X_test.drop(columns=['INDUS','AGE'])

# n_quantiles cannot usefully exceed the number of samples the transformer is
# fitted on, so it is capped at the size of the training split
qt = QuantileTransformer(n_quantiles=min(500, len(train_X)), output_distribution='normal',
                         random_state=random_state)

st = StandardScaler()
# both transformers are fitted on the training split and only applied to the test split;
# their array output is written back by position
copy_train[quantile_cols] = st.fit_transform(qt.fit_transform(train_X[quantile_cols]))
copy_test[quantile_cols] = st.transform(qt.transform(test_X[quantile_cols]))

# inspect the transformed distributions
for i in quantile_cols:
    copy_train[[i]].plot.kde()
    plt.title(i)
    plt.show()

In [None]:
# repeat the model comparison on the quantile-transformed training features
Iscores = getScores(pipelines, copy_train, y_train)

In [None]:
# ranked table of the scores obtained on the transformed features
show_scores(Iscores)

In [None]:
# search space of hyper parameters
space = {
    'n_estimators': range(50, 150, 10),
    # None (use all features) replaces 'auto': for gradient boosting 'auto' meant
    # "all features", and recent scikit-learn releases no longer accept 'auto'.
    'max_features': [None, 'sqrt', 'log2'],
}

# tune on the quantile-transformed features; seeded for reproducibility
IGB_tuned = doTuneCV(GradientBoostingRegressor(random_state=random_state), copy_train, y_train, space)

In [None]:
# refit with the tuned parameters on the transformed features; seeded so the metrics are reproducible
IGB_model = evaluate(GradientBoostingRegressor(random_state=random_state), IGB_tuned.best_params_, copy_train, y_train, copy_test, y_test)

In [None]:
# feature importance of the model trained on the transformed features
igb_importances = IGB_model.feature_importances_
fig = plt.figure(figsize=(10,8))
sns.barplot(x=igb_importances, y=copy_train.columns, palette='Pastel1')
plt.show()

In [None]:
# using deep learning: small fully connected regression network
# Three hidden blocks of Dense -> BatchNormalization -> ReLU (28, 14, 28 units).
input_t = Input(shape=(X_train.shape[1],))
dense1 = Dense(28)(input_t)
batch_n = BatchNormalization()(dense1)
act_1 = Activation(relu)(batch_n)
dense2 = Dense(14)(act_1)
batch_2n = BatchNormalization()(dense2)
act_2 = Activation(relu)(batch_2n)
dense3 = Dense(28)(act_2)
batch_3n = BatchNormalization()(dense3)
act_3 = Activation(relu)(batch_3n)
# Linear output unit: the regression output is left unnormalised and unclipped.
# Batch-normalising and ReLU-ing the single output makes each prediction depend on
# batch statistics and can pin it at zero.
dense4 = Dense(1)(act_3)
model = Model(input_t, dense4)

model.summary()

In [None]:
# render the network graph with the tensor shapes of each layer
plot_model(model, show_shapes=True)

In [None]:
# `learning_rate` is the supported argument name; the old `lr` alias is deprecated and
# rejected by recent Keras releases. Loss is MSE; MSE and MAE are tracked as metrics.
model.compile(optimizer=Adam(learning_rate=0.01), loss='mse', metrics=['mse', 'mae'])

In [None]:
# save best model
# save_best_only keeps the epoch with the lowest validation loss. Without it the file
# is overwritten every epoch and ends up holding the last model, not the best one.
callbacks = [
    ModelCheckpoint('bh.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# train the network; the last 10% of the training rows are held out for validation.
# The target is converted to a float32 array like the features, so Keras receives
# plain arrays that it can split for `validation_split`.
history = model.fit(
    X_train.to_numpy().astype('float32'),
    y_train.to_numpy().astype('float32'),
    validation_split=0.1,
    verbose=0,
    epochs=2000,
    callbacks=callbacks)

In [None]:
# visualize training results: one curve per logged loss/metric
history_frame = pd.DataFrame(history.history)
_ = history_frame.plot(figsize=(10,8))

In [None]:
# the loss is MSE, so the square root of the lowest validation loss is the best validation RMSE
print(f"best validation rmse: {min(history.history['val_loss']) ** (1/2)}")

In [None]:
# reload the checkpoint written during training and score it on the held-out test set
best_deep_model = load_model('bh.h5')
# evaluate() returns the loss followed by the compiled metrics. The result is indexed
# instead of unpacked into `_` / `__`, which IPython uses for its output history.
test_metrics = best_deep_model.evaluate(
    X_test.to_numpy().astype('float32'),
    y_test.to_numpy().astype('float32'), verbose=2
)
loss = test_metrics[0]  # MSE loss; its square root is the RMSE
print("\nModel evaluation result")
print(f"rmse on test set: {loss ** (1/2)}")

In [None]:
!pip3 install -q autogluon
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
### Autogluon - automate ML

# create dataset to make it usable with Autogluon
# assign() builds new frames, so copy_train / copy_test never receive the target column
# (wrapping them with pd.DataFrame(...) may share data with the originals).
# to_numpy() attaches the target by position, independent of the shuffled index.
train_df = copy_train.assign(**{TARGET_COL: y_train.to_numpy()})
test_df = copy_test.assign(**{TARGET_COL: y_test.to_numpy()})

In [None]:
# train AutoGluon on the frame that includes the target column
time_limit = 600 # seconds
model_dir = 'bh_models'  # passed as the predictor's `path`
predictor = TabularPredictor(TARGET_COL, path=model_dir).fit(train_df, time_limit=time_limit, presets='best_quality')

In [None]:
# leaderboard of the trained AutoGluon models, evaluated on the test frame
predictor.leaderboard(test_df, silent=True)