In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

: 

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Load the Boston House Pricing Dataset

In [None]:
from sklearn.datasets import load_boston

In [None]:
# Load the Boston housing data into `boston`; its type and keys are inspected below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably requires scikit-learn < 1.2 — confirm the pinned version.
boston = load_boston()

In [None]:
# Show the Python type of the object returned by load_boston()
type(boston)

In [None]:
# List the fields available on the loaded dataset object
boston.keys()

In [None]:
# Print the description text (DESCR) that ships with the dataset
print(boston.DESCR)

In [None]:
# Print the raw feature matrix
print(boston.data)

In [None]:
# Print the feature names (used as DataFrame column labels below)
print(boston.feature_names)

In [None]:
# Print the target values (stored as the 'Price' column below)
print(boston.target)

In [None]:
# Print the `filename` attribute of the loaded dataset object
# (presumably the name/path of the underlying data file — verify)
print(boston.filename)

In [None]:
# Print the `data_module` attribute of the loaded dataset object
# NOTE(review): this attribute may not exist on older scikit-learn releases — confirm.
print(boston.data_module)

## Preparing the dataset

In [None]:
# Build a DataFrame from the feature matrix, labelling columns with the feature names
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)

In [None]:
# Preview the first rows of the feature DataFrame
dataset.head()

In [None]:
# Add the target as a 'Price' column (appended as the last column)
dataset['Price'] = boston.target

In [None]:
# Preview again to confirm the 'Price' column is present
dataset.head()

In [None]:
# Preview the last rows as well
dataset.tail()

In [None]:
# (number of rows, number of columns)
dataset.shape

In [None]:
# Column dtypes and non-null counts
dataset.info()

In [None]:
# Summary statistics for every column
dataset.describe()

In [None]:
## Check the missing values

In [None]:
# Number of missing values in each column
dataset.isnull().sum()

In [None]:
import missingno as mn

In [None]:
# Visual check for missing values: missingno bar chart over the columns of `dataset`
mn.bar(dataset)

From the above visualisation we can observe that there are no missing values in the dataframe.

### Exploratory Data Analysis

Check the correlation among the features.

In [None]:
# Pairwise correlation between all columns (features and Price)
dataset.corr()

In [None]:
import seaborn as sns

In [None]:
# Open a large figure first so the annotated cells stay readable.
plt.figure(figsize=(12, 10))
# Correlations rounded to two decimals keep the in-cell annotations short.
correlation_matrix = dataset.corr().round(decimals=2)
# annot=True writes each correlation value inside its square.
sns.heatmap(correlation_matrix, annot=True)

To fit a linear regression model, we select those features which have a high correlation with our target variable Price. By looking at the correlation matrix we can see that RM has a strong positive correlation with Price (0.7), whereas LSTAT has a high negative correlation with Price (-0.74).

An important point in selecting features for a linear regression model is to check for multicollinearity. The features RAD and TAX have a correlation of 0.91, i.e. this pair is strongly correlated with each other, so we should not select both of these features together for training the model. Check this for an explanation. The same goes for the features DIS and AGE, which have a correlation of -0.75.

### Analysing Correlated Features

In [None]:
# Raw relationship between the crime rate (CRIM) and the house price.
plt.scatter(x=dataset.CRIM, y=dataset.Price)
plt.xlabel(xlabel="Crime Rate")
plt.ylabel(ylabel="Price")

In [None]:
# Scatter of CRIM vs Price with a fitted regression line
sns.regplot(x='CRIM', y='Price', data=dataset)

As the crime rate increases, the price of the house decreases, i.e. the two are negatively correlated.

In [None]:
# Raw relationship between the average number of rooms (RM) and the house price.
plt.scatter(dataset['RM'], dataset['Price'])
# Axis label typo fixed ("Avgerage" -> "Average") so the rendered figure reads correctly.
plt.xlabel("Average Room")
plt.ylabel("Price")

In [None]:
# Scatter of RM vs Price with a fitted regression line
sns.regplot(x='RM', y='Price', data=dataset)

As the average number of rooms increases, the price of the house increases, i.e. the two are positively correlated.

In [None]:
# Raw relationship between LSTAT and the house price.
plt.scatter(x=dataset.LSTAT, y=dataset.Price)
plt.xlabel(xlabel="% lower status of the population")
plt.ylabel(ylabel="Price")

In [None]:
# Scatter of LSTAT vs Price with a fitted regression line
sns.regplot(x='LSTAT', y='Price', data=dataset)

As the % lower status of the population increases, the price of the house decreases, i.e. the two are negatively correlated.

In [None]:
# Scatter of CHAS vs Price with a fitted regression line
sns.regplot(x='CHAS', y='Price', data=dataset)

Here there is no clear correlation.

In [None]:
# Scatter of PTRATIO vs Price with a fitted regression line
sns.regplot(x='PTRATIO', y='Price', data=dataset)

As PTRATIO increases, Price decreases, and vice versa, i.e. the two are negatively correlated.

### Split the dataset into Independent and Dependent features.

In [None]:
# Independent features: every column except the target. Dropping 'Price' by name
# (instead of slicing off the last column by position) stays correct even if more
# columns are appended to `dataset` after 'Price'.
x = dataset.drop(columns=['Price'])
# Dependent feature (target).
y = dataset['Price']

In [None]:
# Display the independent features
x

In [None]:
# Display the dependent feature (Price)
y

### Split the dataset into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Shapes of the four splits, one per line: train/test features, then train/test targets.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep="\n")

### Before model training we need to do standard scaling.

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the training features in the next cell
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and replace x_train with its scaled version.
# NOTE(review): x_train is overwritten, so re-running this cell alone would re-fit the
# scaler on already-scaled data — re-run from the train/test split cell instead.
x_train = scaler.fit_transform(x_train)

In [None]:
# Apply the already-fitted scaler to the test features (transform only, no re-fit)
x_test = scaler.transform(x_test) 

In [None]:
import pickle

# Persist the fitted scaler so the same transformation can be re-applied later.
# The context manager guarantees the file is flushed and closed even if pickling
# raises; the previous bare open() left the handle open.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

Here we only call `transform` (not `fit`) on x_test: the scaling parameters learned from the training set must be reused unchanged, so that the test data is transformed in exactly the same way as the training data. We do this because fitting on the test set would leak information about it, and the model should not know anything about the test dataset.

In [None]:
# Display the scaled training features
x_train

In [None]:
# Display the scaled test features
x_test

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with default settings
regression = LinearRegression()

In [None]:
# Train the model on the scaled training features and the training targets
regression.fit(x_train,y_train)

In [None]:
## Print the learned coefficients (the intercept is printed in the next cell)
print(regression.coef_)

In [None]:
# Print the learned intercept
print(regression.intercept_)

In [None]:
## Show the estimator's configuration parameters (its settings, not the learned coefficients)
regression.get_params()

In [None]:
### Predict prices for the scaled test features
reg_pred = regression.predict(x_test)

In [None]:
# Display the predicted values
reg_pred

### Assumptions

In [None]:
## Scatter plot of actual test prices (x-axis) against predicted prices (y-axis)
plt.scatter(y_test, reg_pred)

In [None]:
# Residual = actual price minus predicted price, for each test row
residuals = y_test - reg_pred

In [None]:
# Display the residuals
residuals

In [None]:
## Plot the residuals

In [None]:
# Kernel density estimate of the residual distribution
sns.displot(residuals, kind="kde")

In [None]:
## Scatter plot of predictions (x-axis) against residuals (y-axis)
## The points should look uniformly scattered, without any visible pattern
plt.scatter(reg_pred, residuals)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, reg_pred)

print(mse)                                      # mean squared error
print(mean_absolute_error(y_test, reg_pred))    # mean absolute error
print(np.sqrt(mse))                             # root mean squared error

### R Square and Adjusted R Square

In [None]:
from sklearn.metrics import r2_score

# R squared of the predictions on the test set.
score = r2_score(y_true=y_test, y_pred=reg_pred)
score

In [None]:
# Adjusted R squared: 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n = number of test samples and p = number of features.
n_samples = len(y_test)
n_features = x_test.shape[1]
1 - (1 - score) * (n_samples - 1) / (n_samples - n_features - 1)

### New Data Prediction

In [None]:
# Display the raw (unscaled) feature matrix; one row is used below as "new" data
boston.data

In [None]:
boston.data[0].shape # Take the first data point for the prediction; check its shape

In [None]:
# The input must be a 2D array, so reshape the single sample into one row
boston.data[0].reshape(1,-1)

In [None]:
## Scale the new sample with the scaler fitted on the training data
scaler.transform(boston.data[0].reshape(1,-1))

In [None]:
# Predict the price for the scaled sample
regression.predict(scaler.transform(boston.data[0].reshape(1,-1)))

### Pickling The Model file For Deployment

In [None]:
import pickle

In [None]:
# Serialise the trained regression model to disk for deployment.
# The context manager closes the file handle even if pickling raises; the previous
# bare open() left the handle open.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

In [None]:
# Reload the serialised model to check that it round-trips.
# The context manager closes the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('regmodel.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
## Prediction with the reloaded model, on the same scaled first sample as above
pickled_model.predict(scaler.transform(boston.data[0].reshape(1,-1)))