In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston


In [6]:
boston = load_boston()

In [14]:
x = pd.DataFrame(data = boston.data, columns = boston.feature_names)

In [16]:
y =  pd.DataFrame(data = boston.target, columns = ['price'])
y

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


In [18]:
x.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64

In [19]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


In [20]:
# every data is float so there is no need to do any categorical handling variable.

In [23]:
x.duplicated().sum()

0

In [24]:
y.isnull().sum()

price    0
dtype: int64

In [27]:
# This is very small dataset so we don't check for outliers.

In [29]:
x.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [32]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
arr = sc.fit_transform(x)

In [33]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()

In [34]:
vif['vif'] = [variance_inflation_factor(arr, i) for i in range(arr.shape[1])]


In [36]:
vif['features'] = x.columns
vif

Unnamed: 0,vif,features
0,1.792192,CRIM
1,2.298758,ZN
2,3.991596,INDUS
3,1.073995,CHAS
4,4.39372,NOX
5,1.933744,RM
6,3.100826,AGE
7,3.955945,DIS
8,7.484496,RAD
9,9.008554,TAX


### So We can see here there is no any variable which has more than 10% of variance inflation facotr.

### Split the data into train and test set

In [38]:
from sklearn.model_selection import train_test_split
X_train ,x_test ,y_train ,y_test = train_test_split(arr, y, test_size = 0.2, random_state = 32)

### First of trainning with simle linear regression

In [44]:
from sklearn.linear_model import LinearRegression
simpleLinear = LinearRegression()
simpleLinear.fit(X_train ,y_train)

LinearRegression()

In [45]:
simpleLinear.score(x_test, y_test)

0.6846868305630175

In [48]:
from sklearn.linear_model import Ridge, RidgeCV
ridgecv =  RidgeCV(alphas = np.random.uniform(0,10,50), cv = 10)
ridgecv.fit(X_train, y_train)

RidgeCV(alphas=array([1.53552242, 9.02126456, 9.66427065, 9.97812326, 0.27860219,
       8.51584679, 5.82336759, 5.71088617, 5.72857718, 0.80496591,
       7.10384947, 2.97391975, 2.02333199, 3.63915652, 5.53333642,
       5.00130167, 0.9985606 , 5.91641518, 5.92253799, 8.13955051,
       1.57286991, 1.43352784, 7.09686283, 6.77655866, 7.51150674,
       1.28252651, 7.04257603, 6.23192952, 4.83188098, 7.15934013,
       6.96148475, 6.97364605, 5.95616736, 1.82101831, 7.17989042,
       3.77743294, 5.54208907, 0.8240315 , 9.32502808, 2.64163437,
       9.68734123, 7.80716574, 9.29140991, 0.31641689, 1.54938263,
       1.20697555, 7.12888555, 1.63936822, 1.88827748, 5.09659158]),
        cv=10)

In [49]:
ridgecv.score(x_test ,y_test)

0.6788201119963948

In [50]:
#  Linear regression with using stocastic linear regression
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor()
sgd.fit(X_train ,y_train)
sgd.score(x_test ,y_test)

  y = column_or_1d(y, warn=True)


0.678832749887837

In [51]:
# there is no any changes . so we will use grid search to find the best parameters in the model.

In [53]:
simpleLinear.score(x_test ,y_test)

0.6846868305630175

In [54]:
# we are saving this for future refrence.

In [56]:
import pickle
pickle.dump(simpleLinear, open('lr.pkl', 'wb'))