In [None]:
# Importing the libraries 

import pandas as pd #Data Processing
import numpy as np # Linear algebra functions
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Reached only if every import above succeeded.
print("Setup Complete")



In [None]:
# Importing the Boston Housing dataset.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import itself raises ImportError. In that case rebuild the same arrays from
# the dataset's original source, following the recipe given in the
# scikit-learn deprecation notice (needs network access).
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # Skip the 22-line header; each record is then spread over two consecutive
    # rows, which the slicing below stitches back together.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
            "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
        ]),
        DESCR="Boston house prices dataset (rebuilt from http://lib.stat.cmu.edu/datasets/boston)",
    )
boston.keys()

In [None]:
# Other fields available on the loaded dataset object (uncomment to inspect):
#boston.data # predictors
#boston.target # target
#boston.DESCR # description of dataset
boston.feature_names # predictor names or column names for X's

In [None]:
# Wrap the raw predictor matrix in a DataFrame
data = pd.DataFrame(data=boston.data)
# Preview the first five rows
data.head(n=5)


In [None]:
# Preview the last five rows
data.tail(n=5)

In [None]:
# Label each predictor column with its feature name, then show the labels
data.columns = boston.feature_names
data.columns

In [None]:
# Attach the target values as a "price" column
data = data.assign(price=boston.target)

In [None]:
# (rows, columns) of the dataset
print(data.shape)


In [None]:
# First five rows again, now with named columns and the price column attached
data.head(n=5)

In [None]:
# Number of missing values in each column
data.isna().sum()


In [None]:
# Summary statistics for every column. Leaving the frame as the cell's last
# expression lets Jupyter render it as a table instead of wrapped plain text.
data.describe()

In [None]:
# Pairwise Pearson correlation between all columns (predictors and price)
corr = data.corr(method="pearson")
corr


In [None]:
# Heatmap of the correlation matrix, each cell annotated to one decimal place
plt.figure(figsize=(20, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 10},
    cmap='Blues',
    cbar=True,
    square=False,
)


In [None]:
# Separate the target (price) from the independent variables
y = data['price']
X = data.drop(columns=['price'])

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [None]:
#X_train 
#X_test
#y_train
#y_test

In [None]:
# Linear regression estimator from scikit-learn
from sklearn.linear_model import LinearRegression

# Build the regressor and train it on the training split in one step
# (fit returns the estimator itself), then echo it as the cell output
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Intercept (constant term) learned by the fitted model
model.intercept_

In [None]:
# Fitted coefficients, one per predictor column of X_train
model.coef_

In [None]:
# Pair every predictor name with its fitted coefficient.
# Building the frame column-wise (instead of transposing a two-row frame) keeps
# the coefficient column numeric rather than object dtype. The column labels
# stay 0 and 1, so the rename step later in the notebook still applies.
coefficients = pd.DataFrame({0: X_train.columns, 1: model.coef_})


In [None]:
# Show the name/coefficient table (columns are still labelled 0 and 1 at this point)
coefficients

In [None]:
# Replace the positional column labels with descriptive headers
column_labels = {0: 'Attribute', 1: 'Coefficients'}
coefficients = coefficients.rename(columns=column_labels)
coefficients

In [None]:
# Fitted values: the model's predictions for the same rows it was trained on
y_pred = model.predict(X_train)

In [None]:
# Goodness of fit on the training split
n_train, n_features = len(y_train), X_train.shape[1]
train_r2 = metrics.r2_score(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)

print('R^2:', train_r2)
# Adjusted R^2 penalises R^2 for the number of predictors used
print('Adjusted R^2:', 1 - (1 - train_r2) * (n_train - 1) / (n_train - n_features - 1))
print('MSE:', train_mse)
print('RMSE of train:', np.sqrt(train_mse))

# Observation: R^2 and adjusted R^2 are almost the same.

In [None]:
# Actual training prices against the model's fitted prices
fig, ax = plt.subplots()
ax.scatter(y_train, y_pred)
ax.set_xlabel("Prices")
ax.set_ylabel("Predicted prices")
ax.set_title("Prices vs Predicted prices")
plt.show()

In [None]:
# Residuals (actual minus fitted) against the fitted values
train_residuals = y_train - y_pred
plt.scatter(y_pred, train_residuals)
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Predicted vs residuals")
plt.show()

# Observation from the figure this cell produces: the errors follow no pattern,
# so the linear-regression assumption is satisfied.

In [None]:
# Distribution of the training residuals.
# sns.distplot is deprecated (since seaborn 0.11); histplot with kde=True is the
# documented replacement. It plots counts, which matches the "Frequency" label.
sns.histplot(y_train - y_pred, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()
# Here the errors look normally distributed, so that assumption is satisfied.

In [None]:
# Predictions for the held-out test rows, using the model fitted on the training split
y_test_pred = model.predict(X_test)


In [None]:
# Model evaluation on the held-out test split
n_test, n_test_features = len(y_test), X_test.shape[1]
test_r2 = metrics.r2_score(y_test, y_test_pred)
test_mse = metrics.mean_squared_error(y_test, y_test_pred)

print('R^2:', test_r2)
# Adjusted R^2 penalises R^2 for the number of predictors used
print('Adjusted R^2:', 1 - (1 - test_r2) * (n_test - 1) / (n_test - n_test_features - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))
print('MSE:', test_mse)
print('RMSE of Test:', np.sqrt(test_mse))

In [None]:
# RMSE on both splits, side by side
rmse_train = np.sqrt(metrics.mean_squared_error(y_train, y_pred))
rmse_test = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
print('RMSE of train:', rmse_train)
print('RMSE of Test:', rmse_test)
# The train and test RMSE are almost the same, therefore the model is not overfitting.