In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# First look at the raw file using pandas' default parsing options.
df = pd.read_csv('../input/boston-house-prices/housing.csv')
df.head()

In [None]:
# Dimensions (rows, columns) of the frame as loaded with default options.
df.shape

In [None]:
# Load the file without treating any row as a header: column names are
# supplied explicitly and fields are split on runs of whitespace.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
df = pd.read_csv(
    '../input/boston-house-prices/housing.csv',
    header=None,
    names=column_names,
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    sep=r'\s+',
)
df.head()

In [None]:
# Dimensions (rows, columns) after re-reading with explicit column names.
df.shape

Here's what our data columns represent:

- **CRIM** - per capita crime rate by town
- **ZN** - proportion of residential land zoned for lots over 25,000 sq.ft.
- **INDUS** - proportion of non-retail business acres per town.
- **CHAS** - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
- **NOX** - nitric oxides concentration (parts per 10 million)
- **RM** - average number of rooms per dwelling
- **AGE** - proportion of owner-occupied units built prior to 1940
- **DIS** - weighted distances to five Boston employment centres
- **RAD** - index of accessibility to radial highways
- **TAX** - full-value property-tax rate per 10,000 dollars
- **PTRATIO** - pupil-teacher ratio by town
- **B** - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- **LSTAT** - % lower status of the population
- **MEDV** - Median value of owner-occupied homes in $1000's

We are trying to predict the last column, MEDV (the median value of owner-occupied homes, in thousands of dollars).

In [None]:
# Summary statistics for the columns of the frame.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corr, annot=True, linewidths=0.2, ax=ax);

In [None]:
# Scatter plot with MEDV on the x-axis and CRIM on the y-axis.
sns.scatterplot(data=df, x='MEDV', y='CRIM');

In [None]:
# Switch seaborn to the 'darkgrid' style for the plots that follow.
sns.set_style('darkgrid')

In [None]:
# Same MEDV vs CRIM scatter, redrawn after the style change.
sns.scatterplot(data=df, x='MEDV', y='CRIM');

In [None]:
# Pairwise view of every column: histograms on the diagonal,
# scatter plots everywhere else.
pg = sns.PairGrid(df).map_diag(sns.histplot)
pg.map_offdiag(sns.scatterplot)

Variable #14 (MEDV) seems to be censored at 50.00 (corresponding to a median price of $50,000). Based on that, rows with MEDV at or above 50.00 may not help to predict MEDV, so we drop them.

In [None]:
# Keep only the rows whose MEDV is not at or above the 50.00 cap.
at_or_above_cap = df['MEDV'] >= 50.00
df = df.loc[~at_or_above_cap]
df.shape

In [None]:
# Box plot of the CRIM column.
sns.boxplot(data=df, y='CRIM');

In [None]:
def var_boxplot(df):
    """Draw one box plot per column of ``df`` on a two-row grid of axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; each column gets its own subplot.
    """
    n_columns = df.shape[1]
    # Round up so an odd number of columns still gets enough axes; floor
    # division left the last column without a slot (IndexError) and produced
    # zero columns of axes for a single-column frame.
    per_row = max(1, (n_columns + 1) // 2)
    fig, axs = plt.subplots(ncols=per_row, nrows=2, figsize=(20, 15))
    axs = axs.flatten()

    for ax, column in zip(axs, df.columns):
        sns.boxplot(y=column, data=df, ax=ax)

    # Hide leftover axes when the grid has more slots than there are columns.
    for ax in axs[n_columns:]:
        ax.set_visible(False)
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

In [None]:
# Box plots for every column of the filtered frame.
var_boxplot(df)

In [None]:
def var_distplot(df):
    """Draw the distribution of each column of ``df`` on a two-row grid of axes.

    Each subplot shows a density-scaled histogram with a KDE curve overlaid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; each column gets its own subplot.
    """
    n_columns = df.shape[1]
    # Round up so an odd number of columns still gets enough axes.
    per_row = max(1, (n_columns + 1) // 2)
    fig, axs = plt.subplots(ncols=per_row, nrows=2, figsize=(20, 10))
    axs = axs.flatten()

    for ax, (_, values) in zip(axs, df.items()):
        # sns.distplot is deprecated; histplot with a KDE overlay on the
        # density scale is its documented replacement.
        sns.histplot(values, kde=True, stat='density', ax=ax)

    # Hide leftover axes when the grid has more slots than there are columns.
    for ax in axs[n_columns:]:
        ax.set_visible(False)
    plt.tight_layout(pad=0.4, w_pad=1.0, h_pad=5.0)

In [None]:
import warnings

# Silence library warnings only while these plots are drawn, rather than
# switching warnings off for every later cell in the notebook.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    var_distplot(df)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Separate the target (MEDV) from the predictors, then hold out 20% of the
# rows as a test set with a fixed seed.
y = df['MEDV']
X = df.drop(columns='MEDV')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Fit a linear regression on the unscaled training features, then display it.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# One row per feature with its fitted weight. Building the frame from named
# columns keeps 'Coefficient' numeric; stacking a list and transposing it
# turned both columns into object dtype.
coefficients = pd.DataFrame({
    'Attribute': X_train.columns,
    'Coefficient': lin_reg.coef_,
})
coefficients

In [None]:
# In-sample predictions: the linear model applied to its own training features.
y_pred_lin_reg = lin_reg.predict(X_train) 

In [None]:
# Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Training-set error metrics for the linear regression.
print('Mean Absolute Error of Linear Regression: {}'.format(mean_absolute_error(y_train, y_pred_lin_reg)))
print('Mean Squared Error of Linear Regression: {}'.format(mean_squared_error(y_train, y_pred_lin_reg)))
print('R^2 Score of Linear Regression: {}'.format(r2_score(y_train, y_pred_lin_reg)))
# RMSE is the square root of the mean *squared* error (it was sqrt of the MAE).
print('Root Mean Squared Error of Linear Regression: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_lin_reg))))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training data, then rescale
# the training features with the fitted scaler.
min_max_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = min_max_scaler.transform(X_train)

In [None]:
# lin_reg was fitted on the unscaled features, so feeding it min-max-scaled
# inputs produces meaningless predictions. Fit a separate model on the scaled
# features and predict with that one instead.
lin_reg_scaled = LinearRegression().fit(X_train_scaled, y_train)
y_preds_scaled = lin_reg_scaled.predict(X_train_scaled)

In [None]:
# Training-set error metrics for the predictions made on scaled features.
print('Mean Absolute Error of Linear Regression: ', mean_absolute_error(y_train, y_preds_scaled))
print('Mean Squared Error of Linear Regression: ', mean_squared_error(y_train, y_preds_scaled))
print('R^2 Score of Linear Regression: ', r2_score(y_train, y_preds_scaled))
# RMSE is the square root of the mean *squared* error (it was sqrt of the MAE).
print('Root Mean Squared Error of Linear Regression: ', np.sqrt(mean_squared_error(y_train, y_preds_scaled)))

In [None]:
# Actual vs. predicted prices on the training set (linear regression).
fig, ax = plt.subplots(figsize=(15, 12))
ax.scatter(y_train, y_pred_lin_reg)
ax.set_xlabel('Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Prices vs Predicted Prices')
plt.show();

In [None]:
# Residuals (actual minus predicted) against predictions, training set.
residuals_lin_reg = y_train - y_pred_lin_reg
fig, ax = plt.subplots(figsize=(15, 12))
ax.scatter(y_pred_lin_reg, residuals_lin_reg)
ax.set_title("Predicted vs residuals")
ax.set_xlabel("Predicted")
ax.set_ylabel("Residuals")
plt.show();

In [None]:
# Distribution of the training residuals of the linear regression.
plt.figure(figsize=(12, 10))
# sns.distplot is deprecated; histplot with a KDE overlay replaces it. Its
# default count scale also matches the "Frequency" axis label below.
sns.histplot(y_train - y_pred_lin_reg, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show();

In [None]:
# Predictions of the linear model on the held-out test features.
y_test_pred_lin_reg = lin_reg.predict(X_test)

In [None]:
# Test-set error metrics for the linear regression.
print('Mean Absolute Error of Linear Regression on Test Set: {}'.format(mean_absolute_error(y_test, y_test_pred_lin_reg)))
print('Mean Squared Error of Linear Regression on Test Set: {}'.format(mean_squared_error(y_test, y_test_pred_lin_reg)))
print('R^2 Score of Linear Regression on Test Set: {}'.format(r2_score(y_test, y_test_pred_lin_reg)))
# RMSE is the square root of the mean *squared* error (it was sqrt of the MAE).
print('Root Mean Squared Error of Linear Regression on Test Set: {}'.format(np.sqrt(mean_squared_error(y_test, y_test_pred_lin_reg))))

### **Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so its randomised training gives the same model on every
# run; 42 matches the seed used for the train/test split.
rf_reg_model = RandomForestRegressor(random_state=42)

rf_reg_model.fit(X_train, y_train)

In [None]:
# In-sample predictions of the random forest on the training features.
y_pred_rf_reg_model = rf_reg_model.predict(X_train)

In [None]:
# Training-set error metrics for the random forest.
print('Mean Absolute Error of Random Forest Regression: ', mean_absolute_error(y_train, y_pred_rf_reg_model))
print('Mean Squared Error of Random Forest Regression: ', mean_squared_error(y_train, y_pred_rf_reg_model))
print('R^2 Score of Random Forest Regression: ', r2_score(y_train, y_pred_rf_reg_model))
# RMSE is the square root of the mean *squared* error (it was sqrt of the MAE).
print('Root Mean Squared Error of Random Forest Regression: ', np.sqrt(mean_squared_error(y_train, y_pred_rf_reg_model)))

In [None]:
# Actual vs. predicted prices on the training set (random forest).
fig, ax = plt.subplots(figsize=(15, 12))
ax.scatter(y_train, y_pred_rf_reg_model)
ax.set_xlabel('Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Prices vs Predicted Prices')
plt.show();

In [None]:
# Residuals (actual minus predicted) against predictions for the random
# forest on the training set.
residuals_rf_reg = y_train - y_pred_rf_reg_model
fig, ax = plt.subplots(figsize=(15, 12))
ax.scatter(y_pred_rf_reg_model, residuals_rf_reg)
ax.set_xlabel('Predicted')
ax.set_ylabel('Residuals')
ax.set_title('Predicted vs Residuals')
plt.show();

In [None]:
# Predictions of the random forest on the held-out test features.
y_test_pred_rf_reg_model = rf_reg_model.predict(X_test)

In [None]:
# Test-set error metrics for the random forest.
print('Mean Absolute Error of Random Forest Regression on Test Set: ', mean_absolute_error(y_test, y_test_pred_rf_reg_model))
print('Mean Squared Error of Random Forest Regression on Test Set: ', mean_squared_error(y_test, y_test_pred_rf_reg_model))
print('R^2 Score of Random Forest Regression on Test Set: ', r2_score(y_test, y_test_pred_rf_reg_model))
# RMSE is the square root of the mean *squared* error (it was sqrt of the MAE).
print('Root Mean Squared Error of Random Forest Regression on Test Set: ', np.sqrt(mean_squared_error(y_test, y_test_pred_rf_reg_model)))

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor created with the library's default hyper-parameters.
xgb_reg = XGBRegressor()

In [None]:
# Train the XGBoost model on the unscaled training split.
xgb_reg.fit(X_train, y_train)

In [None]:
# In-sample predictions of the XGBoost model on the training features.
y_pred_xgb_reg = xgb_reg.predict(X_train)

In [None]:
# Training-set error metrics for the XGBoost model.
print('Mean Absolute Error of XGBoost Regression: ', mean_absolute_error(y_train, y_pred_xgb_reg))
print('Mean Squared Error of XGBoost Regression: ', mean_squared_error(y_train, y_pred_xgb_reg))
print('R^2 Score of XGBoost Regression: ', r2_score(y_train, y_pred_xgb_reg))
# RMSE is the square root of the mean *squared* error (it was sqrt of the MAE).
print('Root Mean Squared Error of XGBoost Regression: ', np.sqrt(mean_squared_error(y_train, y_pred_xgb_reg)))

In [None]:
# Actual vs. predicted prices on the training set (XGBoost).
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(y_train, y_pred_xgb_reg)
ax.set_xlabel('Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Prices vs Predicted Prices')
plt.show();

In [None]:
# Residuals (actual minus predicted) against predictions for XGBoost on the
# training set.
residuals_xgb_train = y_train - y_pred_xgb_reg
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(y_pred_xgb_reg, residuals_xgb_train)
ax.set_xlabel('Predicted')
ax.set_ylabel('Residuals')
ax.set_title('Predicted vs Residuals')
plt.show();

In [None]:
# Predictions of the XGBoost model on the held-out test features.
y_test_pred_xgb_reg = xgb_reg.predict(X_test)

In [None]:
# Test-set error metrics for the XGBoost model.
print('Mean Absolute Error of XGBoost Regression on Test Set: ', mean_absolute_error(y_test, y_test_pred_xgb_reg))
print('Mean Squared Error of XGBoost Regression on Test Set: ', mean_squared_error(y_test, y_test_pred_xgb_reg))
print('R^2 Score of XGBoost Regression on Test Set: ', r2_score(y_test, y_test_pred_xgb_reg))
# RMSE is the square root of the mean *squared* error (it was sqrt of the MAE).
print('Root Mean Squared Error of XGBoost Regression on Test Set: ', np.sqrt(mean_squared_error(y_test, y_test_pred_xgb_reg)))

In [None]:
# Actual vs. predicted prices on the test set (XGBoost).
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(y_test, y_test_pred_xgb_reg)
ax.set_xlabel('Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Prices vs Predicted Prices')
plt.show();

In [None]:
# Residuals (actual minus predicted) against predictions for XGBoost on the
# test set.
residuals_xgb_test = y_test - y_test_pred_xgb_reg
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(y_test_pred_xgb_reg, residuals_xgb_test)
ax.set_xlabel('Predicted')
ax.set_ylabel('Residuals')
ax.set_title('Predicted vs Residuals')
plt.show();

In [None]:
# Test-set R^2 for each of the three fitted models, in the order
# linear regression, random forest, XGBoost.
acc_lin_reg, acc_rf_reg, acc_xgb_reg = (
    r2_score(y_test, test_predictions)
    for test_predictions in (
        y_test_pred_lin_reg,
        y_test_pred_rf_reg_model,
        y_test_pred_xgb_reg,
    )
)

In [None]:
# Comparison table: test-set R^2 as a percentage per model, best model first.
r2_percent_by_model = {
    'Linear Regression': acc_lin_reg*100,
    'Random Forest Regression': acc_rf_reg*100,
    'XGBoost Regression': acc_xgb_reg*100,
}
models = pd.DataFrame({
    'Model': list(r2_percent_by_model.keys()),
    'R^2 Score': list(r2_percent_by_model.values()),
})
models.sort_values(by='R^2 Score', ascending=False)