In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## <center> Boston House Price Prediction </center>

<center> <img src="https://media.thestar.com.my/Prod/D4A838DE-5A4E-4A3B-B970-C7E9E6A8EB7B" > </center>

#### The following algorithms are used in this notebook

|     Algorithms     |
| ------------------ | 
| Linear Regression  |
| Decision Tree      | 
| Random Forest      |
| XGBoost            | 

In [None]:
# To prevent from warnings

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing Libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from statsmodels.graphics.gofplots import qqplot
import seaborn as sns
%matplotlib inline

<!-- Attribute Information

Input features in order:
1) CRIM: per capita crime rate by town
2) ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
3) INDUS: proportion of non-retail business acres per town
4) CHAS: Charles River dummy variable (1 if tract bounds river; 0 otherwise)
5) NOX: nitric oxides concentration (parts per 10 million) [parts/10M]
6) RM: average number of rooms per dwelling
7) AGE: proportion of owner-occupied units built prior to 1940
8) DIS: weighted distances to five Boston employment centres
9) RAD: index of accessibility to radial highways
10) TAX: full-value property-tax rate per $10,000 [$/10k]
11) PTRATIO: pupil-teacher ratio by town
12) B: The result of the equation B=1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13) LSTAT: % lower status of the population

Output variable:
1) MEDV: Median value of owner-occupied homes in $1000's [k$] -->

In [None]:
# Load the Boston house-price CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/the-boston-houseprice-data/boston.csv')

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

In [None]:
# Shape of the dataset as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

In [None]:
# Column names, non-null counts and dtypes of the DataFrame
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Number of distinct values in each column (a low count hints at a categorical feature)
df.nunique()

* Here we can see that a few variables are categorical even though they are stored in integer format.

In [None]:
# Heatmap of the boolean missing-value mask; any null entry would stand out as a differently coloured cell
null_mask = df.isna()
sns.heatmap(null_mask)

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns
plt.figure(figsize=(10, 8))
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)

In [None]:
# Rank column pairs by the absolute value of their correlation and show the
# strongest ones.

corr = df.corr()
c1 = corr.abs().unstack()  # kept: full (col_a, col_b) -> |corr| series, including mirrored pairs

# Keep only the strict upper triangle of the matrix so that every unordered
# pair appears exactly once and the diagonal (self-correlation of 1.0) is
# excluded. This replaces a positional slice that hard-coded the number of
# columns and relied on mirrored pairs sorting next to each other.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pair_corr = corr.abs().where(upper_triangle).stack().dropna()

TOP_N_PAIRS = 7  # same number of pairs the original slice displayed
pair_corr.sort_values(ascending=False).head(TOP_N_PAIRS)

In [None]:
# Left panel: histogram + KDE of LSTAT; right panel: normal Q-Q plot of LSTAT.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 4))
# sns.distplot is deprecated; histplot with kde=True and a density-scaled
# y-axis draws the equivalent histogram-plus-density figure.
sns.histplot(df['LSTAT'], kde=True, stat='density', color='red', ax=ax1)
ax1.set(title='LSTAT distribution')
# line='s' draws the standardized reference line on the Q-Q plot.
qqplot(df['LSTAT'], ax=ax2, line='s')
ax2.set(title='LSTAT quantile plot')
plt.show()

* In a similar manner, we can check the normality and skewness of each variable.

In [None]:
# Per-column skewness, listed from the most positive value to the most negative
skewness = df.skew()
skew_val = skewness.sort_values(ascending=False)
skew_val

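* Shortcut for getting a rough indication of normality by checking skewness with the pandas `skew` function.
* If the skewness value is between -0.5 and 0.5, the distribution is approximately symmetric; a larger positive value means the data is right-skewed and a larger negative value means it is left-skewed. Note that low skewness alone does not guarantee a normal distribution.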

In [None]:
# Draw one box plot per column on a 2 x 7 grid to eyeball outliers

fig, ax = plt.subplots(nrows=2, ncols=7, figsize=(20, 10))
ax = ax.flatten()  # 1-D array of axes so a single running index addresses each panel

for index, col in enumerate(df.columns):
    sns.boxplot(data=df, y=col, ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

<!-- As we can see in the box plot visualization above, "CRIM", "ZN" and "B" have many outliers -->

In [None]:
# Individual box plot for a single feature
def Box(df):
    """Draw a horizontal box plot of one feature and display it.

    Parameters
    ----------
    df : pandas.Series or other 1-D array-like
        Values of the single feature to plot. The parameter keeps its
        original name ``df`` for backward compatibility, although callers
        pass one column rather than a whole DataFrame.
    """
    plt.title("Box Plot")
    # Pass the values explicitly as ``x``: a bare positional argument is
    # interpreted differently across seaborn versions (``x`` vs ``data``),
    # which changes the orientation of the box.
    sns.boxplot(x=df)
    plt.show()
Box(df['CRIM'])

* The data above is clearly right-skewed, as the outliers lie on the right side.

In [None]:
# Histogram helper for a single feature
def hist_plots(df):
    """Show a matplotlib histogram (default binning) of the values passed in ``df``."""
    plt.hist(df)
    plt.title("Histogram")
    plt.show()
hist_plots(df['CRIM'])

In [None]:
# Individual distribution plot for a single feature
def dist_plots(df):
    """Draw a histogram with a KDE overlay for one feature and display it.

    Parameters
    ----------
    df : pandas.Series or other 1-D array-like
        Values of the single feature to plot (name kept for backward
        compatibility with existing callers).
    """
    plt.title("Distribution Plot")
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # produces the equivalent histogram-plus-density figure.
    sns.histplot(df, kde=True, stat='density')
    plt.show()
dist_plots(df['CRIM'])

In [None]:
# Re-display the per-column distinct-value counts ahead of the categorical plots that follow
df.nunique()

In [None]:
# Box plot of MEDV for each value of CHAS, one colour per CHAS value
fig = px.box(
    data_frame=df,
    x="CHAS",
    y="MEDV",
    color="CHAS",
    width=800,
    height=400,
)
fig.show()

In [None]:
# Box plot of MEDV for each value of RAD, one colour per RAD value
rad_box_options = dict(x="RAD", y="MEDV", color="RAD")
fig = px.box(df, **rad_box_options)
fig.show()

In [None]:
# Donut chart (pie with a 0.4 hole) of the share of rows per CHAS value
fig = px.pie(
    data_frame=df,
    names="CHAS",
    hole=0.4,
    template="plotly_dark",
)
fig.show()

In [None]:
# Scatter of RM (y) against MEDV (x), coloured by CHAS, with an OLS trend line
fig = px.scatter(
    data_frame=df,
    x="MEDV",
    y="RM",
    color="CHAS",
    template="plotly_dark",
    trendline="ols",
)
fig.show()

In [None]:
# Scatter of DIS (y) against MEDV (x), coloured by ZN, with an OLS trend line
dis_zn_options = dict(x="MEDV", y="DIS", color="ZN", template="plotly_dark", trendline="ols")
fig = px.scatter(df, **dis_zn_options)
fig.show()

In [None]:
# Scatter of AGE (y) against MEDV (x), coloured by ZN, with a LOWESS trend line
fig = px.scatter(
    data_frame=df,
    x="MEDV",
    y="AGE",
    color="ZN",
    template="plotly_dark",
    trendline="lowess",
)
fig.show()

In [None]:
# Scatter of DIS (y) against MEDV (x), coloured by RAD, with a LOWESS trend line
dis_rad_options = dict(x="MEDV", y="DIS", color="RAD", template="plotly_dark", trendline="lowess")
fig = px.scatter(df, **dis_rad_options)
fig.show()

In [None]:
# Min-max normalization rescales each selected column to the range 0 to 1:
#     x_scaled = (x - min) / (max - min)

# Only these four columns are rescaled because their values are much larger
# than those of the other columns.
# NOTE(review): the minimum and maximum are computed on the full dataset,
# before the train/test split below, so test rows influence the scaling.
# Consider deriving them from the training rows only.

cols = ['CRIM', 'ZN', 'TAX', 'B']
# Vectorised pandas min/max (NaN-aware) instead of looping with the Python
# builtins, which iterate element by element and are not NaN-safe.
col_min = df[cols].min()
col_max = df[cols].max()
df[cols] = (df[cols] - col_min) / (col_max - col_min)
    

In [None]:
# Same 2 x 7 grid of box plots as before; the rescaled columns
# ('CRIM', 'ZN', 'TAX', 'B') now range from 0 to 1

fig, ax = plt.subplots(nrows=2, ncols=7, figsize=(20, 10))
ax = ax.flatten()

for index, col in enumerate(df):  # iterating a DataFrame yields its column labels
    sns.boxplot(data=df, y=col, ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# CRIM histogram after min-max scaling: the values now lie between 0 and 1.
# hist_plots() is already defined in the earlier histogram cell, so it is
# reused here instead of re-defining an identical copy.
hist_plots(df['CRIM'])

In [None]:
# Preview the first five rows: the rescaled columns ('CRIM', 'ZN', 'TAX', 'B') now range from 0 to 1
df.head(n=5)

In [None]:
# Feature matrix X: every column of the dataframe except the target 'MEDV'
X = df.drop(columns='MEDV')
X.head()

In [None]:
# (rows, columns) of the feature matrix, i.e. the dataframe without the 'MEDV' column
X.shape

In [None]:
# Target vector y: the 'MEDV' column on its own
y = df.loc[:, 'MEDV']
y.shape

## Linear Regression

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Split into 80% training rows and the remaining rows for testing;
# the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Instantiate a LinearRegression estimator with its default settings
model_lrg = LinearRegression()

In [None]:
# Fit the linear regression on the training split
model_lrg.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features with the linear regression
pred_tst = model_lrg.predict(X_test)

In [None]:
# Evaluation metric: root mean squared error (RMSE), i.e. sqrt of the mean squared error
rmse_lrg = np.sqrt(mean_squared_error(y_test, pred_tst))
# Kept under the original name because the model-comparison cell reads
# `mae_lrg`; despite the prefix it holds RMSE, not mean absolute error.
mae_lrg = rmse_lrg
print(f"Linear Regression RMSE: {rmse_lrg}")

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Instantiate the Random Forest regressor.
# A fixed random_state (same seed as the train/test split) makes the
# bootstrap sampling and feature selection repeatable, so the reported
# score does not change from run to run.
model_rf = RandomForestRegressor(random_state=42)

In [None]:
# Fit the random forest on the training split
model_rf.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features with the random forest
pred_rf = model_rf.predict(X_test)

In [None]:
# Evaluation metric: root mean squared error (RMSE), i.e. sqrt of the mean squared error
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
# Kept under the original name because the model-comparison cell reads
# `mae_rf`; despite the prefix it holds RMSE, not mean absolute error.
mae_rf = rmse_rf
print(f"Random Forest RMSE: {rmse_rf}")

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Instantiate the Decision Tree regressor.
# A fixed random_state (same seed as the train/test split) makes the
# tie-breaking between equally good splits repeatable, so the reported
# score does not change from run to run.
model_dt = DecisionTreeRegressor(random_state=42)

In [None]:
# Fit the decision tree on the training split
model_dt.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features with the decision tree
pred_dt = model_dt.predict(X_test)

In [None]:
# Evaluation metric: root mean squared error (RMSE), i.e. sqrt of the mean squared error
rmse_dt = np.sqrt(mean_squared_error(y_test, pred_dt))
# Kept under the original name because the model-comparison cell reads
# `mae_dt`; despite the prefix it holds RMSE, not mean absolute error.
mae_dt = rmse_dt
print(f"Decision Tree RMSE: {rmse_dt}")

### XGBoost

In [None]:
from xgboost import XGBRFRegressor

In [None]:
# Instantiate the xgboost estimator with a maximum tree depth of 8 and 10 estimators.
# NOTE(review): XGBRFRegressor is xgboost's random-forest-style estimator, not the
# gradient-boosted XGBRegressor; confirm this is the intended model, since the
# notebook labels it simply "XGBoost".
model_xgb = XGBRFRegressor(max_depth=8, n_estimators = 10)

In [None]:
# Fit the xgboost estimator on the training split
model_xgb.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features with the xgboost estimator
pred_xgb = model_xgb.predict(X_test)

In [None]:
# Evaluation metric: root mean squared error (RMSE), i.e. sqrt of the mean squared error
rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))
# Kept under the original name because the model-comparison cell reads
# `mae_xgb`; despite the prefix it holds RMSE, not mean absolute error.
mae_xgb = rmse_xgb
print(f"XGBoost RMSE: {rmse_xgb}")

In [None]:
# Side-by-side comparison of the test-set error of each model.
# The mae_* variables hold sqrt(mean_squared_error(...)), i.e. RMSE rather
# than mean absolute error, so the column is labelled 'RMSE'.
models = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest', 'XGBoost'],
    'RMSE': [mae_lrg, mae_dt, mae_rf, mae_xgb]
})
models

##### Conclusion: after changing a few hyperparameters, XGBoost gives the lowest error on the test set among the models compared here.