In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from warnings import simplefilter

# Hide FutureWarning messages for the rest of the session so library
# deprecation notices do not clutter the cell outputs below.
simplefilter("ignore", FutureWarning)

In [None]:
# Load the Boston housing data into `boston`, a Bunch exposing `data`,
# `target`, `feature_names` and `DESCR` (the fields this notebook reads).
#
# scikit-learn deprecated `load_boston` in 1.0 and removed it in 1.2, where the
# import below raises ImportError. In that case fall back to the original
# StatLib file, using the replacement recipe from scikit-learn's own
# deprecation notice, and wrap the arrays in an equivalent Bunch.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # Requires network access. Each record in the raw file is split across two
    # physical lines: 11 values on the first, then the last 2 features plus
    # the target (MEDV) on the second.
    _boston_raw = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+", skiprows=22, header=None,
    ).values
    boston = Bunch(
        data=np.hstack([_boston_raw[::2, :], _boston_raw[1::2, :2]]),
        target=_boston_raw[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
        DESCR=(
            "Boston house-prices dataset: 13 numeric/categorical predictors "
            "per town; target MEDV is the median value of owner-occupied "
            "homes in $1000's.\n"
            "Loaded from http://lib.stat.cmu.edu/datasets/boston because "
            "sklearn.datasets.load_boston is not available in this "
            "scikit-learn version."
        ),
    )

In [None]:
# List the fields available on the loaded dataset object.
boston.keys()

In [None]:
# Print the dataset's bundled description text.
print(boston.DESCR)

In [None]:
# Raw feature matrix; it becomes the DataFrame columns further down.
print(boston.data)

In [None]:
# Target values; stored later as the `price` column.
boston.target

In [None]:
# Column names that go with `boston.data`.
boston.feature_names

In [None]:
# Wrap the feature matrix in a DataFrame, labelling columns with the feature names.
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)

In [None]:
# Preview the features only (the target has not been attached yet).
dataset.head()

In [None]:
# Append the target as a new last column named `price`.
dataset['price'] = boston.target

In [None]:
# Preview again, now including the `price` column.
dataset.head()

In [None]:
# Column dtypes and non-null counts.
dataset.info()

In [None]:
# Summarizing the stats of the data
dataset.describe()

In [None]:
# Count of missing values per column.
dataset.isnull().sum()

In [None]:
# Pairwise correlation between all columns, including `price`.
dataset.corr()

In [None]:
# Grid of pairwise plots over every column of `dataset`.
sns.pairplot(dataset)

In [None]:
plt.scatter(dataset['CRIM'],dataset['price'])
plt.xlabel("CRIME RATE")
plt.ylabel("PRICE")
plt.show()

In [None]:
plt.scatter(dataset['ZN'],dataset['price'])
plt.xlabel("proportion of residential land zoned for lots over 25,000 sq.ft.")
plt.ylabel("PRICE")
plt.show()

In [None]:
plt.scatter(dataset['INDUS'],dataset['price'])
plt.xlabel("proportion of non-retail business acres per town")
plt.ylabel("PRICE")
plt.show()

In [None]:
plt.figure()
sns.barplot(x='CHAS', y='price', data=dataset)
plt.xlabel("Charles River dummy variable")

In [None]:
sns.regplot(x='NOX',y='price', data=dataset)
plt.xlabel("nitric oxides concentration (parts per 10 million)")
plt.ylabel("PRICE")
plt.show()

In [None]:
sns.regplot(x='RM',y='price', data=dataset)
plt.xlabel("Average number of rooms per dwelling")
plt.ylabel("PRICE")
plt.show()

In [None]:
plt.scatter(dataset['AGE'],dataset['price'])
plt.xlabel("Proportion of owner-occupied units built prior to 1940")
plt.ylabel("PRICE")
plt.show()

In [None]:
plt.scatter(dataset['DIS'],dataset['price'])
plt.xlabel("Weighted distances to five Boston employment centres")
plt.ylabel("PRICE")
plt.show()

In [None]:
plt.scatter(dataset['RAD'],dataset['price'])
plt.xlabel("index of accessibility to radial highways")
plt.ylabel("PRICE")
plt.show()

In [None]:
plt.scatter(dataset['TAX'],dataset['price'])
plt.xlabel("Full-value property-tax rate per $10,000")
plt.ylabel("PRICE")
plt.show()

In [None]:
sns.regplot(x='PTRATIO',y='price', data=dataset)
plt.xlabel("pupil-teacher ratio by town")
plt.ylabel("PRICE")
plt.show()

In [None]:
plt.scatter(dataset['B'],dataset['price'])
plt.xlabel("1000(Bk - 0.63)^2 where Bk is the proportion of black people by town")
plt.ylabel("PRICE")
plt.show()

In [None]:
sns.regplot(x='LSTAT',y='price', data=dataset)
plt.xlabel("% lower status of the population")
plt.ylabel("PRICE")
plt.show()

In [None]:
x=dataset.iloc[:,:-1]
y=dataset.iloc[:,-1]

In [None]:
x.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Training predictors (still unscaled at this point).
x_train

In [None]:
# Test predictors (still unscaled at this point).
x_test

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardizes each feature using statistics learned in the fit step below.
scaler=StandardScaler()

In [None]:
# Fit the scaler on the training split only, then scale it.
# NOTE(review): this rebinds `x_train` to the scaled result, so re-running this
# cell alone would refit the scaler on already-scaled data. Re-run from the
# train/test split cell instead.
x_train = scaler.fit_transform(x_train)

In [None]:
# Apply the training-set statistics to the test split (no refit), so no
# information from the test data leaks into the scaling.
x_test = scaler.transform(x_test)

In [None]:
# Scaled training predictors.
x_train

In [None]:
# Scaled test predictors.
x_test

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit an ordinary least-squares linear model on the scaled training data.
regression = LinearRegression()
regression.fit(x_train, y_train)

In [None]:
# Print the coefficients and intercept

In [None]:
# Learned coefficients, one per feature column of x_train.
print(regression.coef_)

In [None]:
# Learned intercept term.
print(regression.intercept_)

In [None]:
# Hyperparameters the estimator was configured with (these are the
# constructor settings, not the learned coefficients).
regression.get_params()

In [None]:
# Predict prices for the scaled test split; `reg_pred` is reused by the
# residual plots and the error metrics below.
reg_pred = regression.predict(x_test)
print(reg_pred)

In [None]:
#plot a scatter plot for the prediction
plt.scatter(y_test, reg_pred)

In [None]:
residuals = y_test-reg_pred
residuals

In [None]:
#plot residials
sns.displot(residuals, kind='kde')

In [None]:
#scatter plot w.r.t prediction and residuals
plt.scatter(reg_pred, residuals)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error metrics on the held-out test split. RMSE is the square root of MSE,
# so the MSE is computed once and reused.
mae = mean_absolute_error(y_test, reg_pred)
mse = mean_squared_error(y_test, reg_pred)
rmse = np.sqrt(mse)

print("mean_absolute_error    : ",mae)
print("mean_squared_error     : ",mse)
print("Root Mean squared error: ",rmse)

In [None]:
# R-Square and adjusted R-Square

In [None]:
# Formula :  R^2 = 1 - SSR/SST
# R^2: Co-efficient of determination, SSR: Sum of squares of residuals, SST: Total sum of squares

In [None]:
from sklearn.metrics import r2_score
score = r2_score(y_test, reg_pred)
print("r2 score: ",score)

In [None]:
# Adjusted r^2 = 1-[(1-r^2)*(n-1)/(n-k-1)]
# where r^2 of the model n: the number of observations k: the number of predictor variable

In [None]:
print("Adjusted r^2 : ",1-(1-score)*(len(y_test)-1)/(len(y_test)-x_test.shape[1]-1))

In [None]:
# New data prediction
# NOTE(review): the "new" sample used below is row 0 of the same dataset the
# model was built from, so it is a smoke test of the predict path rather than
# genuinely unseen data.

In [None]:
# Shape of the full feature matrix.
boston.data.shape

In [None]:
# A single row is 1-D.
boston.data[0].shape

In [None]:
# Reshape the row to 2-D (one row, columns inferred), the layout passed to
# scaler.transform / regression.predict below.
boston.data[0].reshape(1,-1)

In [None]:
# Scale the row with the already-fitted scaler (transform only, no refit).
# NOTE(review): the scaler was fitted on a DataFrame but receives a bare array
# here; recent scikit-learn versions may warn about missing feature names.
scaler.transform(boston.data[0].reshape(1,-1))

In [None]:
# Predict the price for the scaled row.
regression.predict(scaler.transform(boston.data[0].reshape(1,-1)))

In [None]:
#pickling the file for Deployment
import pickle

In [None]:
pickle.dump(regression, open('regmodel.pkl', 'wb'))

In [None]:
pickled_model = pickle.load(open('regmodel.pkl', 'rb'))

In [None]:
#prediction
pickled_model.predict(scaler.transform(boston.data[0].reshape(1,-1)))