### Model Building

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


In [4]:
os.getcwd()
os.chdir('../')

In [27]:
df= pd.read_csv('data/Boston.csv',index_col='Unnamed: 0')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 1 to 506
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  black    506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 59.3 KB


In [6]:
pred_col= ['indus', 'nox', 'rm', 'tax', 'ptratio', 'lstat', 'dis' , 'age' ]
target_var=['medv']
## using log transformation for lstat, nox, ptratio, dis
df['lstat']= np.log(df['lstat'])

df['ptratio']= np.log(df['ptratio'])
df['dis']= np.log(df['dis'])

In [8]:
from sklearn.model_selection import train_test_split

x= df[pred_col]
y=df[target_var]

x_train, x_test, y_train, y_test= train_test_split(x, y , test_size=0.2)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((404, 8), (102, 8), (404, 1), (102, 1))

In [11]:
transformer= StandardScaler()
transformer.fit(x_train)
x_train_trans= transformer.transform(x_train)
x_test_trans= transformer.transform(x_test)

cv_scores= cross_val_score(LinearRegression(), X=x_train_trans, y=y_train)
cv_scores

array([0.74848541, 0.72946562, 0.7600978 , 0.75870403, 0.80919565])

### Regression Analysis

In [22]:
import statsmodels.api as sm
x_train_trans_2= pd.DataFrame(x_train_trans, columns= x_train.columns)
X= sm.add_constant(x_train_trans)
model= sm.OLS(y_train,X).fit()
model.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.775
Model:,OLS,Adj. R-squared:,0.77
Method:,Least Squares,F-statistic:,169.7
Date:,"Tue, 18 Jul 2023",Prob (F-statistic):,9.59e-123
Time:,22:30:03,Log-Likelihood:,-1166.7
No. Observations:,404,AIC:,2351.0
Df Residuals:,395,BIC:,2387.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,22.4168,0.219,102.548,0.000,21.987,22.847
x1,-0.2229,0.417,-0.534,0.594,-1.043,0.598
x2,-2.4191,0.511,-4.735,0.000,-3.424,-1.415
x3,1.9852,0.309,6.417,0.000,1.377,2.593
x4,-0.3863,0.354,-1.092,0.276,-1.082,0.309
x5,-1.5145,0.267,-5.677,0.000,-2.039,-0.990
x6,-5.8726,0.388,-15.136,0.000,-6.635,-5.110
x7,-3.3603,0.484,-6.946,0.000,-4.311,-2.409
x8,0.4617,0.407,1.134,0.258,-0.339,1.263

0,1,2,3
Omnibus:,112.315,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,462.28
Skew:,1.162,Prob(JB):,4.14e-101
Kurtosis:,7.697,Cond. No.,6.14


In [28]:
from sklearn.base import BaseEstimator, TransformerMixin

class LogTransformer(BaseEstimator, TransformerMixin) :
    def __init__(self):
        self.cols= ['lstat','nox','ptratio','dis']
    
    def fit(self,x):
        return self
    
    def transform(self,x):
        for var in self.cols :
            x[var]= np.log(x[var])
        return x
    
trans=LogTransformer()
x_df= df[0:5][pred_col]

In [29]:
x_df

Unnamed: 0,indus,nox,rm,tax,ptratio,lstat,dis,age
1,2.31,0.538,6.575,296,15.3,4.98,4.09,65.2
2,7.07,0.469,6.421,242,17.8,9.14,4.9671,78.9
3,7.07,0.469,7.185,242,17.8,4.03,4.9671,61.1
4,2.18,0.458,6.998,222,18.7,2.94,6.0622,45.8
5,2.18,0.458,7.147,222,18.7,5.33,6.0622,54.2


In [30]:
trans.fit(x_df)
trans.transform(x_df)

Unnamed: 0,indus,nox,rm,tax,ptratio,lstat,dis,age
1,2.31,-0.619897,6.575,296,2.727853,1.60543,1.408545,65.2
2,7.07,-0.757153,6.421,242,2.879198,2.21266,1.602836,78.9
3,7.07,-0.757153,7.185,242,2.879198,1.393766,1.602836,61.1
4,2.18,-0.780886,6.998,222,2.928524,1.07841,1.802073,45.8
5,2.18,-0.780886,7.147,222,2.928524,1.673351,1.802073,54.2
