In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Ignore every warning for the rest of the session. This also hides any
# warnings raised by the libraries used in the cells below.
warnings.filterwarnings('ignore')

Input features in order:
1) CRIM: per capita crime rate by town
2) ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
3) INDUS: proportion of non-retail business acres per town
4) CHAS: Charles River dummy variable (1 if tract bounds river; 0 otherwise)
5) NOX: nitric oxides concentration (parts per 10 million) [parts/10M]
6) RM: average number of rooms per dwelling
7) AGE: proportion of owner-occupied units built prior to 1940
8) DIS: weighted distances to five Boston employment centres
9) RAD: index of accessibility to radial highways
10) TAX: full-value property-tax rate per $10,000 [$/10k]
11) PTRATIO: pupil-teacher ratio by town
12) B: The result of the equation B=1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13) LSTAT: % lower status of the population

Output variable:
1) MEDV: Median value of owner-occupied homes in $1000's [k$]



In [None]:
# Load the Boston house-price CSV from the Kaggle input directory.
csv_path = '/kaggle/input/the-boston-houseprice-data/boston.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:
# Summary statistics of the loaded columns.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
%matplotlib inline

In [None]:
# Pairwise correlations between all columns, drawn as a heatmap.
corr_matrix = df.corr()
sns.heatmap(corr_matrix)

# 1. Baseline by LinearRegression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,LassoCV
from sklearn.metrics import mean_squared_error

In [None]:
# Baseline inputs: every column except MEDV is a feature, MEDV is the target.
target_col = 'MEDV'
x = df.drop(columns=[target_col])
y = df[target_col]

In [None]:
# 80/20 train/test split; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=0,
)

In [None]:
# Baseline: ordinary least squares on the untransformed data.
model_lr = LinearRegression().fit(x_train, y_train)
pred_train = model_lr.predict(x_train)
pred_test = model_lr.predict(x_test)

# RMSE on the training split first, then on the held-out split.
for actual, predicted in ((y_train, pred_train), (y_test, pred_test)):
    print(np.sqrt(mean_squared_error(actual, predicted)))

# 2. Logarithmic transformation

In [None]:
# Put the raw target next to its log(1 + x) transform for comparison.
MEDVS = pd.DataFrame({
    'medvs': df['MEDV'],
    'log(medvs+1)': np.log1p(df['MEDV']),
})
# The trailing '\n' leaves a blank line between the table and the skew values.
print(MEDVS, '\n')

# Skewness before and after the transform (0 means a symmetric distribution).
print('medvs skew        :', skew(MEDVS['medvs']))
print('log(medvs+1) skew:', skew(MEDVS['log(medvs+1)']))

# One histogram per column.
MEDVS.hist()

# The skewness of MEDV is 1.1; after the log(1 + x) transformation it is -0.24, so the distribution is closer to symmetric.

In [None]:
# Replace MEDV with log(1 + MEDV) directly in `df`.
# NOTE: this overwrites the raw target, so running this cell a second time
# applies the transform again; reload `df` before re-running it.
df['MEDV']=np.log1p(df['MEDV'])

# Find the features which have high (>0.75) skewness.

In [None]:
# Feature frame (target removed) and the skewness of each feature column.
df1 = df.drop(columns=['MEDV'])
df1_skew = df1.apply(skew)
print(df1_skew)

In [None]:
# Features whose skewness exceeds the 0.75 threshold.
high_skew = df1_skew[df1_skew > 0.75]
print('-----Skewness greater than 0.75-----')
print(high_skew)

# Keep the column labels under their own name: `df1_skew` stays a Series of
# skew values, so this cell can be run again without a type error.
skewed_cols = high_skew.index

# Rebuild the feature frame from `df` before transforming, so that re-running
# this cell never applies log1p twice to the same columns.
df1 = df.drop('MEDV', axis=1)
df1[skewed_cols] = np.log1p(df1[skewed_cols])
df1[skewed_cols]

In [None]:
# Show the feature frame after the log1p step as this cell's output.
df1

In [None]:
# Summary statistics of the feature frame after the log1p step.
df1.describe()

# 3. Prediction Model

# 1)LinearRegression

In [None]:
# Modelling inputs: X is the feature frame `df1` (same object, not a copy);
# Y is the MEDV column of `df`, which an earlier cell replaced with log1p(MEDV).
X, Y = df1, df['MEDV']

In [None]:
# Same 80/20 split and seed as the baseline, now on the transformed data.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=0,
)

In [None]:
# Fresh, unfitted linear regression; rebinds the `model_lr` name used by the baseline.
model_lr=LinearRegression()

In [None]:
# Fit on the transformed training data and predict both splits.
model_lr.fit(X_train, Y_train)
pred_train = model_lr.predict(X_train)
pred_test = model_lr.predict(X_test)

# RMSE on the log1p(MEDV) scale: training split first, then held-out split.
rmse_train = np.sqrt(mean_squared_error(Y_train, pred_train))
rmse_test = np.sqrt(mean_squared_error(Y_test, pred_test))
print(rmse_train)
print(rmse_test)

# 2) Ridge

In [None]:
def rmse_cv(model, X=None, y=None, cv=5):
    """Return the per-fold cross-validated RMSE of ``model``.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X, y : optional
        Features and target to cross-validate on. When omitted, the
        notebook-level ``X_train`` / ``Y_train`` are used, which is what the
        one-argument form ``rmse_cv(model)`` has always done.
    cv : int, default 5
        Number of folds handed to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    # Resolve the defaults at call time so the current notebook split is used.
    if X is None:
        X = X_train
    if y is None:
        y = Y_train
    neg_mse = cross_val_score(
        model, X, y,
        scoring="neg_mean_squared_error",
        cv=cv)
    # The scorer returns negated MSE; flip the sign before taking the root.
    return np.sqrt(-neg_mse)

In [None]:
# Candidate regularization strengths and the mean 5-fold CV RMSE of each.
alphas = [0,0.05, 0.1, 0.5]
cv_rg = pd.Series(
    [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas],
    index=alphas,
)

print('Ridge RMSE loss:')
print(cv_rg, '\n')

print('Ridge RMSE loss Mean:')
print(cv_rg.mean())

# Build the final Ridge model with the alpha that had the lowest CV RMSE.
# A bare Ridge() would use alpha=1.0, which is not one of the candidates
# evaluated above, so the search result would be ignored.
model_rg = Ridge(alpha=float(cv_rg.idxmin()))

# Plain ASCII `figure` / `figsize` (no "fi" ligature characters).
plt.figure(figsize=(10, 5))
plt.plot(cv_rg)
plt.grid()
plt.title('Validation - by regularization strength')
plt.xlabel('Alpha')
plt.ylabel('RMSE')
plt.show()

In [None]:
# Fit Ridge on the training split and report the RMSE on the held-out split.
pred1 = model_rg.fit(X_train, Y_train).predict(X_test)
rmse_rg = np.sqrt(mean_squared_error(Y_test, pred1))
rmse_rg

# 3) LassoCV

In [None]:
# LassoCV chooses its alpha from this grid by internal cross-validation.
model_ls = LassoCV(
    alphas = [1, 0.1, 0.001, 0.0005]).fit(X_train, Y_train)

# Run the outer cross-validation once and reuse the per-fold scores, instead
# of repeating the whole nested CV for every printed statistic.
ls_rmse = rmse_cv(model_ls)

print('Lasso regression RMSE loss:')
print(ls_rmse)

print('Average loss:', ls_rmse.mean())
print('Minimum loss:', ls_rmse.min())
print('Best alpha  :', model_ls.alpha_)

In [None]:
# Fit LassoCV on the training split and report the RMSE on the held-out split.
pred2 = model_ls.fit(X_train, Y_train).predict(X_test)
rmse_ls = np.sqrt(mean_squared_error(Y_test, pred2))
rmse_ls

# Among Linear Regression, Ridge and LassoCV, Linear Regression seems to perform better than the others.

# 4) XGBoost

In [None]:
import xgboost as xgb

# Training data in XGBoost's DMatrix format.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)

# Booster settings: shallow trees with a 0.1 learning rate.
params = dict(max_depth=3, eta=0.1)

# Cross-validate up to 1000 boosting rounds, stopping once the validation
# metric has not improved for 50 consecutive rounds.
cross_val = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
)
cross_val

In [None]:
# Learning curves from the XGBoost CV run; rows before index 10 are skipped.
# `figure` / `figsize` are spelled in plain ASCII: the previous spelling used
# the single-character "fi" ligature, which only resolved to the right names
# because Python NFKC-normalizes identifiers.
plt.figure(figsize=(8, 6))
plt.plot(cross_val.loc[10:,["test-rmse-mean", "train-rmse-mean"]])
plt.grid()
plt.xlabel('num_boost_round')
plt.ylabel('RMSE')
plt.show()

In [None]:
# Gradient-boosted trees on the full feature set.
model_xgb = xgb.XGBRegressor(n_estimators=225, max_depth=3, learning_rate=0.1)
model_xgb.fit(X_train, Y_train)
pred3 = model_xgb.predict(X_test)

# Mean 5-fold CV RMSE on the training split, then RMSE on the held-out split.
cv_rmse_xgb = rmse_cv(model_xgb).mean()
test_rmse_xgb = np.sqrt(mean_squared_error(Y_test, pred3))
print('xgboost RMSE loss:')
print(cv_rmse_xgb)
print(test_rmse_xgb)

# XGBoost seems to perform better than Linear Regression.

In [None]:
# Plot the feature importances of the fitted XGBoost model.
xgb.plot_importance(model_xgb)

# Drop the columns 'ZN' and 'CHAS', in which more than 50% of the values are 0.

In [None]:
# Reduced feature sets without the two mostly-zero columns.
sparse_cols = ['ZN', 'CHAS']
X_train1 = X_train.drop(columns=sparse_cols)
X_test1 = X_test.drop(columns=sparse_cols)

In [None]:
# Reduced-feature training data in XGBoost's DMatrix format.
dtrain = xgb.DMatrix(data=X_train1, label=Y_train)

# Same booster settings as for the full feature set.
params = dict(max_depth=3, eta=0.1)

# Cross-validate up to 1000 boosting rounds, stopping once the validation
# metric has not improved for 50 consecutive rounds.
cross_val = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
)
cross_val

In [None]:
# Learning curves from the reduced-feature CV run; rows before index 10 are skipped.
# `figure` / `figsize` are spelled in plain ASCII: the previous spelling used
# the single-character "fi" ligature, which only resolved to the right names
# because Python NFKC-normalizes identifiers.
plt.figure(figsize=(8, 6))
plt.plot(cross_val.loc[10:,["test-rmse-mean", "train-rmse-mean"]])
plt.grid()
plt.xlabel('num_boost_round')
plt.ylabel('RMSE')
plt.show()

In [None]:
# Gradient-boosted trees on the reduced feature set (ZN and CHAS removed).
model_xgb = xgb.XGBRegressor(
    n_estimators=170,
    max_depth=3,
    learning_rate=0.1)
model_xgb.fit(X_train1, Y_train)
pred4=model_xgb.predict(X_test1)

# Cross-validate on the same reduced features the model is trained on.
# Calling rmse_cv(model_xgb) would score against X_train (all features), so
# the printed CV RMSE would not reflect the dropped columns.
cv_rmse_reduced = np.sqrt(
    -cross_val_score(
        model_xgb, X_train1, Y_train,
        scoring="neg_mean_squared_error",
        cv=5))

print('xgboost RMSE loss:')
print(cv_rmse_reduced.mean())
print(np.sqrt(mean_squared_error(Y_test, pred4)))