In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file found under the Kaggle input mount,
# then print them one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the housing CSV from the attached Kaggle dataset into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/boston-housing-dataset/HousingData.csv"
)

In [None]:
# Preview the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

In [None]:
# Print column dtypes and non-null counts; columns whose non-null count is
# below the row count contain missing values.
df.info()

In [None]:
import matplotlib.pyplot as plt

# One histogram per column, laid out on a 3x5 grid of axes.
hist_options = dict(figsize=(8, 8), layout=(3, 5))
df.hist(**hist_options)
plt.show()

In [None]:
# One box plot per column, each in its own subplot on a 3x5 grid.
df.plot.box(figsize=(8, 8), subplots=True, layout=(3, 5))
plt.show()

In [None]:
import seaborn as sns

# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Hold out a validation set. The fixed seed makes the split, and therefore
# every score computed from it further down, reproducible across
# Restart & Run All.
train, val = train_test_split(df, random_state=0)

# Model selection only ever sees the training portion: features X, target y (MEDV).
X, y = train.drop('MEDV', axis=1), train.MEDV

train.shape, val.shape, X.shape, y.shape

In [None]:
# Candidate regressors, all with default hyper-parameters, as (label, estimator) pairs.
models = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVR', SVR()),
]

# Baseline

In [None]:
from sklearn.impute import SimpleImputer

# Baseline: impute missing values, then fit each candidate model as-is.
imputer = SimpleImputer()
# The same seeded, shuffled 10-fold split is used for every model.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

results1, names = [], []
for name, model in models:
    pipe = Pipeline(steps=[('imputer', imputer), ('model', model)])
    # Scores are negated MAE, so values closer to zero are better.
    cv_results = cross_val_score(pipe, X, y, cv=kfold, scoring='neg_mean_absolute_error')
    results1.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Collapse each model's fold scores to their mean: one row per model.
baseline_means = [fold_scores.mean() for fold_scores in results1]
df1 = pd.DataFrame(baseline_means, index=names, columns=['Baseline'])
df1

# QuantileTransformer

In [None]:
from sklearn.preprocessing import QuantileTransformer

# NOTE(review): 341 presumably matches the size of a training fold, so that the
# transformer never requests more quantiles than samples - confirm.
qt = QuantileTransformer(n_quantiles=341)
imputer = SimpleImputer()
# The same seeded, shuffled 10-fold split is used for every model.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

results2, names = [], []
for name, model in models:
    # Impute, map each feature through the quantile transform, then fit the model.
    pipe = Pipeline(steps=[('imputer', imputer), ('qt', qt), ('model', model)])
    # Scores are negated MAE, so values closer to zero are better.
    cv_results = cross_val_score(pipe, X, y, cv=kfold, scoring='neg_mean_absolute_error')
    results2.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Mean cross-validation score per model for the QuantileTransformer pipelines.
quantile_means = [fold_scores.mean() for fold_scores in results2]
df2 = pd.DataFrame(quantile_means, index=names, columns=['QuantileTransformer'])
df2

In [None]:
# Put the baseline and QuantileTransformer columns side by side, aligned on
# the model label index.
df3 = df1.join(df2, how='left')
df3

# SelectKBest - 6

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif  # no longer used here; kept in case other cells reference it
from sklearn.feature_selection import f_regression

imputer = SimpleImputer()
# MEDV is a continuous target, so features are ranked with the univariate
# regression F-test. f_classif is an ANOVA test for class labels and would
# treat every distinct target value as its own class.
skb = SelectKBest(score_func=f_regression, k=6)
# The same seeded, shuffled 10-fold split is used for every model.
kfold = KFold(n_splits=10, random_state=0, shuffle=True)

results3 = []
names = []
for name, model in models:
    # Feature selection sits inside the pipeline so it is re-fit on each
    # training fold rather than on the full training set.
    pipe = Pipeline([('imputer', imputer), ('skb', skb), ('model', model)])
    # Scores are negated MAE, so values closer to zero are better.
    cv_results = cross_val_score(pipe, X, y, cv=kfold, scoring='neg_mean_absolute_error')
    results3.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Mean cross-validation score per model for the SelectKBest pipelines.
select_k_best_means = [fold_scores.mean() for fold_scores in results3]
df4 = pd.DataFrame(select_k_best_means, index=names, columns=['SelectKBest'])
df4

In [None]:
# Append the SelectKBest column to the running comparison table, aligned on
# the model label index.
df5 = df3.join(df4, how='left')
df5

# Tuned KNN

In [None]:
# Tune k for KNN. Imputation and scaling live inside the pipeline so their
# statistics are learned from each training fold only; fitting them on all of
# X beforehand would leak information from the fold being scored. X itself is
# left untouched, so re-running this cell gives the same result.
model = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor()),
])

# Odd neighbour counts from 1 to 21; the "knn__" prefix routes the parameter
# to the pipeline step of that name.
k_values = np.arange(1, 22, 2)
param_grid = {'knn__n_neighbors': k_values}
kfold = KFold(n_splits=10, random_state=0, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=kfold)
grid_result = grid.fit(X, y)

# Scores are negated MAE, so the best score is the one closest to zero.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Ensemble

In [None]:
# Ensemble regressors as (label, estimator) pairs. Each one is seeded so the
# cross-validation scores are reproducible from run to run.
ensembles = [
    ('AB', AdaBoostRegressor(random_state=0)),
    ('GBM', GradientBoostingRegressor(random_state=0)),
    ('RF', RandomForestRegressor(n_estimators=10, random_state=0)),
    ('ET', ExtraTreesRegressor(n_estimators=10, random_state=0)),
]

# The same seeded, shuffled 10-fold split is used for every model.
kfold = KFold(n_splits=10, random_state=0, shuffle=True)

results = []
names = []
for name, model in ensembles:
    # Impute inside the pipeline (as the baseline cells do) so the fill values
    # come from each training fold only, and X is not overwritten.
    pipe = Pipeline([('imputer', SimpleImputer()), ('model', model)])
    # Scores are negated MAE, so values closer to zero are better.
    cv_results = cross_val_score(pipe, X, y, cv=kfold, scoring='neg_mean_absolute_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# GradientBoostingRegressor - GridSearchCV

In [None]:

from sklearn.model_selection import GridSearchCV, RepeatedKFold

# Impute inside the pipeline so fill values are learned from each training
# fold only. The regressor is seeded because subsample < 1 makes fitting
# stochastic.
model = Pipeline([
    ('imputer', SimpleImputer()),
    ('gbm', GradientBoostingRegressor(random_state=0)),
])

# 4 * 5 * 3 * 3 = 180 parameter combinations; the "gbm__" prefix routes each
# parameter to the pipeline step of that name.
param_grid = {
    'gbm__n_estimators': [10, 50, 100, 500],
    'gbm__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'gbm__subsample': [0.5, 0.7, 1.0],
    'gbm__max_depth': [3, 7, 9],
}

# define the evaluation procedure: 10 folds repeated 3 times = 30 fits per combination
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# define the grid search procedure; score with negated MAE, the same metric
# used by the other cross-validation cells and by the final validation,
# instead of the estimator's default score
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
)

# execute the grid search
grid_result = grid_search.fit(X, y)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Validation 

In [None]:
from sklearn.metrics import mean_absolute_error

# Learn the fill values from the training features and apply them in one step.
imputer = SimpleImputer()
X = imputer.fit_transform(X)

# NOTE(review): these hyper-parameters are presumably the best configuration
# reported by the grid search - confirm they still match its output.
gbm_params = dict(
    random_state=0,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=500,
    subsample=0.7,
)
model = GradientBoostingRegressor(**gbm_params)
model.fit(X, y)

# Score on the held-out rows, reusing the imputer fitted on the training data.
y_val = val['MEDV']
X_val = imputer.transform(val.drop(columns='MEDV'))
predictions = model.predict(X_val)
print(mean_absolute_error(y_val, predictions))