In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence every FutureWarning for the rest of the session so library deprecation
# chatter does not clutter cell output.
# NOTE(review): this filter is global, so it also hides deprecation notices that
# signal upcoming breakage — consider narrowing it with `module=` or `message=`.
warnings.filterwarnings("ignore", category=FutureWarning)

Preparing the data

In [None]:
# The CSV has no header row and is whitespace-separated, so column names are supplied here.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
data = pd.read_csv(
    '/kaggle/input/boston-house-prices/housing.csv',
    sep=r"\s+",  # one or more whitespace characters between fields
    header=None,
    names=column_names,
)
data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

Plotting the data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
# One box plot per column on a 2 x 7 grid — one panel for each of the 14 columns.
fig, axs = plt.subplots(nrows=2, ncols=7, figsize=(20, 10))
axs = axs.flatten()
for panel, column in zip(axs, data.columns):
    sns.boxplot(y=column, data=data, ax=panel)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

In [None]:
# (rows, columns) before the MEDV filter in the next cell is applied.
data.shape

Removing data that has MEDV values greater than or equal to 50

In [None]:
# Keep only rows whose MEDV is not >= 50 (rows with a missing MEDV, if any, are kept).
# NOTE(review): presumably 50 is a cap/censoring value in this dataset — confirm.
# The result keeps the original row labels, so the index has gaps afterwards.
data = data.loc[~data['MEDV'].ge(50.0)]

In [None]:
# (rows, columns) after dropping the rows with MEDV >= 50.
data.shape

Finding correlations among all the features

In [None]:
# Heatmap of absolute pairwise correlations: strength only, sign discarded.
plt.figure(figsize=(20, 10))
abs_corr = data.corr().abs()
sns.heatmap(abs_corr, annot=True)

Finding correlations with MEDV

In [None]:
# Signed correlation of every column with the target, strongest positive first.
corr_matrix = data.corr()
medv_correlations = corr_matrix.loc[:, "MEDV"]
medv_correlations.sort_values(ascending=False)

Picking the most relevant features

In [None]:
# Feature columns used for modelling; MEDV is the regression target.
column_sets = [
    'LSTAT', 'INDUS', 'TAX', 'NOX',
    'PTRATIO', 'AGE', 'RAD', 'RM',
]
x = data[column_sets]
y = data.loc[:, 'MEDV']

Data Preprocessing
Scaling the data

In [None]:
from sklearn import preprocessing
# Rescale every selected feature to the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler()
# fit_transform returns a bare NumPy array. Re-attach the original row labels so
# that `x` stays index-aligned with `y`; without `index=` the frame would be
# relabelled 0..n-1 while `y` keeps the (gappy) labels left by the MEDV filter,
# and any index-aligned operation on the two would pair the wrong rows.
# NOTE(review): the scaler is fit on all rows before cross-validation, so fold
# statistics are not isolated from each other — consider a Pipeline if that matters.
x = pd.DataFrame(
    min_max_scaler.fit_transform(x),
    columns=column_sets,
    index=x.index,
)
x.head()

In [None]:
# Distribution (histogram + KDE) of each scaled feature on a 2 x 4 grid.
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for ax, k in zip(axs, column_sets):
    # sns.distplot is deprecated; histplot with kde=True and stat="density" is the
    # documented replacement (cut=3 extends the KDE past the data range as distplot did).
    sns.histplot(x[k], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

Handling the skewed data

In [None]:
# Report each feature's skewness and flag those whose magnitude exceeds 0.5.
for feature in column_sets:
    skewness = x[feature].skew()
    if abs(skewness) > 0.5:
        print(feature, skewness, '------------> highly skewed')
    else:
        print(feature, skewness)

In [None]:
# Apply log(1 + v) to every feature whose skewness magnitude exceeds 0.5.
# NOTE(review): log1p compresses the upper tail, the usual remedy for right skew;
# confirm it is intended for negatively skewed columns as well.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# transforms any column that is still past the threshold a second time.
skewed_features = [name for name in column_sets if abs(x[name].skew()) > 0.5]
for name in skewed_features:
    x[name] = np.log1p(x[name])

In [None]:
# Distribution (histogram + KDE) of each feature after the skew transform, 2 x 4 grid.
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for ax, k in zip(axs, column_sets):
    # sns.distplot is deprecated; histplot with kde=True and stat="density" is the
    # documented replacement (cut=3 extends the KDE past the data range as distplot did).
    sns.histplot(x[k], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

In [None]:
# Scatter of each feature against the target on a 2 x 4 grid.
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))
axs = axs.flatten()
# Pair the values by position, not by row label: `x` is rebuilt from a NumPy array
# when it is scaled while `y` keeps the labels of the filtered frame, and seaborn
# aligns Series inputs on their index, which would mispair or drop points.
target_values = y.to_numpy()
for i, k in enumerate(column_sets):
    sns.scatterplot(x=x[k].to_numpy(), y=target_values, ax=axs[i])
    # Plain arrays carry no names, so label the axes explicitly.
    axs[i].set(xlabel=k, ylabel=y.name)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

Applying the machine learning algorithms

In [None]:
def display_scores(scores):
    """Print a score array followed by its mean and standard deviation.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        Per-fold scores to summarise (the callers pass NumPy arrays).
    """
    # Each entry pairs a label with the value to show; values are computed lazily
    # so every line is printed before the next statistic is evaluated.
    summary = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    for label, compute in summary:
        print(label, compute(scores))

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, cross_val_predict
from sklearn.preprocessing import PolynomialFeatures
# Shared 10-fold splitter passed as `cv=` by the per-model cells. `shuffle` is left
# at its default (off), so folds are contiguous slices in row order.
# NOTE(review): if the rows are ordered in some meaningful way, unshuffled folds can
# bias the scores — consider shuffle=True with a fixed random_state; confirm.
kf = KFold(n_splits=10)
# Raw cross-validation score arrays keyed by model name.
scores_map = {}

Linear Regression

In [None]:
# Baseline: ordinary least squares, evaluated on the shared `kf` folds.
model1 = LinearRegression()
scores = cross_val_score(
    estimator=model1,
    X=x,
    y=y,
    scoring="neg_mean_squared_error",
    cv=kf,
)
# scores_map keeps the raw scores exactly as returned (negated MSE per fold).
scores_map.update(LinearRegression=scores)
# Undo the negation and take the root to get one RMSE per fold.
lin_rmse_scores = np.sqrt(np.negative(scores))
display_scores(lin_rmse_scores)

Ridge Regression

In [None]:
# Ridge regression with default settings, evaluated on the shared `kf` folds.
l_ridge = Ridge()
scores = cross_val_score(
    estimator=l_ridge,
    X=x,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
# Raw scores (negated MSE per fold) are stored as returned.
scores_map.update(Ridge=scores)
# Undo the negation and take the root to get one RMSE per fold.
rid_rmse_scores = np.sqrt(np.negative(scores))
display_scores(rid_rmse_scores)

Lasso Regression

In [None]:
# Lasso regression with default settings, evaluated on the shared `kf` folds.
# The estimator gets its own name: binding it to `l_ridge` would silently replace
# the Ridge model under a misleading name.
l_lasso = Lasso()
scores = cross_val_score(l_lasso, x, y, cv=kf, scoring='neg_mean_squared_error')
# Raw scores (negated MSE per fold) are stored as returned.
scores_map['Lasso'] = scores
# Undo the negation and take the root to get one RMSE per fold.
las_rmse_scores = np.sqrt(-scores)
display_scores(las_rmse_scores)

Polynomial Regression

In [None]:
# Expand the features with all degree-2 terms, then fit a plain linear model on them.
poly_regs = PolynomialFeatures(degree=2)
x_poly = poly_regs.fit_transform(x)
poly = LinearRegression()
scores = cross_val_score(
    estimator=poly,
    X=x_poly,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
# Raw scores (negated MSE per fold) are stored as returned.
scores_map.update(poly=scores)
# Undo the negation and take the root to get one RMSE per fold.
poly_rmse_scores = np.sqrt(np.negative(scores))
display_scores(poly_rmse_scores)

KNeighbors Regressor

In [None]:
# Search over the neighbour count for k-NN using the shared `kf` folds.
model4 = KNeighborsRegressor()
grid = {'n_neighbors': [3, 5, 7, 9, 10]}
grid_knn = GridSearchCV(model4, grid, cv=kf, scoring='neg_mean_squared_error')
results = grid_knn.fit(x, y)
# The scorer is negated mean squared error, so best_score_ is a negative MSE —
# not an MAE. Flip the sign and label it for what it is.
print('Best CV MSE: %.3f' % -results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
# k-NN with a fixed neighbour count of 9, evaluated on the shared `kf` folds.
# NOTE(review): 9 is hard-coded here rather than read from the grid search — confirm
# it matches the reported best configuration.
knn = KNeighborsRegressor(n_neighbors=9)
scores = cross_val_score(
    estimator=knn,
    X=x,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
# Raw scores (negated MSE per fold) are stored as returned.
scores_map.update(knn=scores)
# Undo the negation and take the root to get one RMSE per fold.
knn_rmse_scores = np.sqrt(np.negative(scores))
display_scores(knn_rmse_scores)

Support Vector Regressor

In [None]:
# Search over C and gamma for the support vector regressor using the shared `kf` folds.
svr = SVR()
grid_sv = GridSearchCV(
    svr,
    cv=kf,
    param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)},
    scoring='neg_mean_squared_error',
)
results = grid_sv.fit(x, y)
# The scorer is negated mean squared error, so best_score_ is a negative MSE —
# not an MAE. Flip the sign and label it for what it is.
print('Best CV MSE: %.3f' % -results.best_score_)
print('Config: %s' % results.best_params_)


In [None]:
# RBF-kernel SVR with fixed C and gamma, evaluated on the shared `kf` folds.
# NOTE(review): C and gamma are hard-coded rather than read from the grid search —
# confirm they match the reported best configuration.
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
scores = cross_val_score(
    estimator=svr_rbf,
    X=x,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
# Raw scores (negated MSE per fold) are stored as returned.
scores_map.update(SVR=scores)
# Undo the negation and take the root to get one RMSE per fold.
svr_rmse_scores = np.sqrt(np.negative(scores))
display_scores(svr_rmse_scores)

Comparing the scores obtained by different models

In [None]:
# Candidate regressors to compare, as (label, estimator) pairs.
models = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('KNN', KNeighborsRegressor(n_neighbors=9)),
    # Seeded so that tie-breaking between equally good splits is repeatable across runs.
    ('TREE', DecisionTreeRegressor(max_depth=5, random_state=30)),
    ('SVM', SVR(kernel='rbf', C=1e3, gamma=0.1)),
    ('GRAD', GradientBoostingRegressor(alpha=0.9, learning_rate=0.05, max_depth=2, min_samples_leaf=5, min_samples_split=2, n_estimators=100, random_state=30)),
]

# Evaluate each model in turn: one array of per-fold RMSE values per model.
results = []
names = []
scoring = 'neg_mean_squared_error'
# The splitter does not depend on the model, so build it once for all of them.
kfold = KFold(n_splits=10)
for name, model in models:
    cv_results = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
    # The scorer returns negated MSE; flip the sign and take the root for RMSE.
    rmse_scores = np.sqrt(-cv_results)
    results.append(rmse_scores)
    names.append(name)
    print("%s: %f +- (%f)" % (name, rmse_scores.mean(), rmse_scores.std()))

# Box plot of the per-fold RMSE distribution of every model (lower is better).
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel('RMSE')
plt.show()

Comparing the predictions of the different models

In [None]:
# Candidate regressors to compare, as (label, estimator) pairs.
# The former trailing 'LR1' entry was a second default LinearRegression and only
# reproduced the 'LR' figure, so it is omitted.
models = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('KNN', KNeighborsRegressor(n_neighbors=9)),
    # Seeded so that tie-breaking between equally good splits is repeatable across runs.
    ('TREE', DecisionTreeRegressor(max_depth=5, random_state=30)),
    ('SVM', SVR(kernel='rbf', C=1e3, gamma=0.1)),
    ('GRAD', GradientBoostingRegressor(alpha=0.9, learning_rate=0.05, max_depth=2, min_samples_leaf=5, min_samples_split=2, n_estimators=100, random_state=30)),
]

# The splitter does not depend on the model, so build it once for all of them.
kfold = KFold(n_splits=10)
for name, model in models:
    # Out-of-fold predictions: every row is predicted by a model that did not train on it.
    predicted = cross_val_predict(model, x, y, cv=kfold)
    fig, ax = plt.subplots()
    ax.scatter(y, predicted, edgecolors=(0, 0, 0))
    # Dashed diagonal marks perfect agreement between measured and predicted values.
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
    ax.set_title(name)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
