In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston


In [2]:
boston = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [3]:
x = pd.DataFrame(data = boston.data, columns = boston.feature_names)

In [4]:
y =  pd.DataFrame(data = boston.target, columns = ['price'])
y

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


In [5]:
x.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64

In [6]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


In [7]:
# every data is float so there is no need to do any categorical handling variable.

In [8]:
x.duplicated().sum()

0

In [9]:
y.isnull().sum()

price    0
dtype: int64

In [10]:
# This is very small dataset so we don't check for outliers.

In [11]:
x.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [12]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
arr = sc.fit_transform(x)

In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()

In [14]:
vif['vif'] = [variance_inflation_factor(arr, i) for i in range(arr.shape[1])]


In [15]:
vif['features'] = x.columns
vif

Unnamed: 0,vif,features
0,1.792192,CRIM
1,2.298758,ZN
2,3.991596,INDUS
3,1.073995,CHAS
4,4.39372,NOX
5,1.933744,RM
6,3.100826,AGE
7,3.955945,DIS
8,7.484496,RAD
9,9.008554,TAX


### So We can see here there is no any variable which has more than 10% of variance inflation facotr.

### Split the data into train and test set

In [16]:
from sklearn.model_selection import train_test_split
X_train ,x_test ,y_train ,y_test = train_test_split(arr, y, test_size = 0.2, random_state = 32)

### First of trainning with simle linear regression

In [17]:
from sklearn.linear_model import LinearRegression
simpleLinear = LinearRegression()
simpleLinear.fit(X_train ,y_train)

LinearRegression()

In [18]:
simpleLinear.score(x_test, y_test)

0.6846868305630175

In [19]:
from sklearn.linear_model import Ridge, RidgeCV
ridgecv =  RidgeCV(alphas = np.random.uniform(0,10,50), cv = 10)
ridgecv.fit(X_train, y_train)

RidgeCV(alphas=array([9.33357539, 2.03438949, 8.31003641, 1.61383031, 0.26242415,
       7.67542519, 4.718474  , 6.20435434, 1.92515515, 6.80416137,
       5.86978522, 8.60621851, 7.25860653, 1.45030438, 8.53573896,
       4.96164654, 7.44099997, 2.45850848, 8.63448662, 9.2168792 ,
       9.67982544, 5.51049179, 6.92518634, 0.79643929, 1.68332388,
       2.04473109, 4.59554775, 0.37497415, 3.31246755, 9.05614962,
       6.11947756, 2.74121888, 3.96891363, 3.81046783, 2.12068029,
       0.87755164, 9.25868928, 6.42281987, 0.82783663, 9.80228823,
       8.33899747, 5.7159696 , 8.66217693, 7.03807369, 0.40706158,
       7.98499453, 3.17162463, 6.18048145, 0.92317736, 8.59774886]),
        cv=10)

In [20]:
ridgecv.score(x_test ,y_test)

0.6789265970960716

In [21]:
#  Linear regression with using stocastic linear regression
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor()
sgd.fit(X_train ,y_train)
sgd.score(x_test ,y_test)

  y = column_or_1d(y, warn=True)


0.6832720432770775

In [22]:
# there is no any changes . so we will use grid search to find the best parameters in the model.

In [23]:
simpleLinear.score(x_test ,y_test)

0.6846868305630175

In [24]:
# we are saving this for future refrence.

In [25]:
import pickle
pickle.dump(simpleLinear, open('lr.pkl', 'wb'))