## Linear Regression

In [6]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

In [7]:
from sklearn.datasets import load_boston

In [8]:
data = load_boston()

In [9]:
df = pd.DataFrame(data.data,columns=data.feature_names)

In [10]:
df['target'] = data.target

In [11]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  target   506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [13]:
df.rename(columns = {'target':'price'}, inplace = True)

In [14]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [15]:
X = df.iloc[:,:-1]
Y = df.iloc[:,-1]

In [16]:
X.shape
print(type(X))

<class 'pandas.core.frame.DataFrame'>


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [18]:
lin_reg = LinearRegression()
cross_score = cross_val_score(lin_reg,X,Y,scoring='neg_mean_squared_error',cv = 6)
np.mean(cross_score)

-39.117136301064455

In [19]:
lin_reg.fit(X,Y)

LinearRegression()

In [20]:
lin_reg.predict([[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98]])

array([30.00384338])

## Ridge Regression (used to avoid overfiiteing)

In [21]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [22]:
ridge_reg = Ridge()
params = {'alpha' : [1e-10,1e-7,1e-4,1e-1,1,10,50,100]}  # alpha is lambda in ridge regression notes
grid_cv = GridSearchCV(ridge_reg,params,scoring='neg_mean_squared_error',cv=5)
grid_cv.fit(X,Y)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [1e-10, 1e-07, 0.0001, 0.1, 1, 10, 50, 100]},
             scoring='neg_mean_squared_error')

In [23]:
grid_cv.best_params_

{'alpha': 100}

In [24]:
grid_cv.best_score_

-29.90570194754033

In [25]:
# let's do scaling and then figure out what happens 

from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit(df.drop('price',axis=1))
scale_values = scale.transform(df.drop('price',axis=1))

In [26]:
scaled_df = pd.DataFrame(scale_values,columns=df.columns[0:-1])

In [27]:
scaled_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


In [28]:
ridge_reg = Ridge()
params = {'alpha' : [1e-10,1e-7,1e-4,1e-1,1,10,50,100]}  # alpha is lambda in ridge regression notes
grid_cv = GridSearchCV(ridge_reg,params,scoring='neg_mean_squared_error',cv=5)
grid_cv.fit(scaled_df,df['price'])

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [1e-10, 1e-07, 0.0001, 0.1, 1, 10, 50, 100]},
             scoring='neg_mean_squared_error')

In [29]:
grid_cv.best_params_

{'alpha': 100}

In [30]:
grid_cv.best_score_

-32.341768593919575

In [31]:
#as ridge is performing bad than simple linear regression because higher minus value is more lesser

## Lasso Regressioin

In [32]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso()
params = {'alpha': [1e-10,1e-7,1e-4,1e-1,1,10,50,100]}
grid_cv = GridSearchCV(lasso_reg,params,scoring='neg_mean_squared_error',cv=5)
grid_cv.fit(scaled_df,df['price'])

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [1e-10, 1e-07, 0.0001, 0.1, 1, 10, 50, 100]},
             scoring='neg_mean_squared_error')

In [33]:
grid_cv.best_params_

{'alpha': 0.1}

In [34]:
grid_cv.best_score_

-35.62739011281294

In [35]:
# so , therefore the simple linear regression model performs best for the given dataset 

In [36]:
#let se the accuracy of model using r^2 square and adjusted r^2

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [39]:
from sklearn.metrics import r2_score
lin_reg.fit(x_train,y_train)
prediction = lin_reg.predict(x_test)

In [40]:
print(r2_score(prediction,y_test))

0.6301863321720909
