# Chapter 6. Multiple Regression Analysis: Further Analysis 
[Home](http://solomonegash.com/) | [Stata](http://solomonegash.com/woodridge1/index.html) | [R](http://solomonegash.com/econometrics/rbook1/index.html)

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss

import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

from wooldridge import *

### Table 6.1  Determinants of College GPA

In [2]:
df = dataWoo('bwght')

bwght_ols1 = smf.ols(formula='bwght  ~ cigs  + faminc + 1', data=df).fit()
bwght_ols2 = smf.ols(formula='bwghtlbs  ~ cigs  + faminc + 1', data=df).fit()
bwght_ols3 = smf.ols(formula='bwght  ~ packs  + faminc + 1', data=df).fit()

print(summary_col([bwght_ols1, bwght_ols2, bwght_ols3],stars=True,float_format='%0.3f',
                  model_names=['bwght_ols1','bwght_ols2', 'bwght_ols3'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared)}))


               bwght_ols1 bwght_ols2 bwght_ols3
-----------------------------------------------
Intercept      116.974*** 7.311***   116.974***
               (1.049)    (0.066)    (1.049)   
R-squared      0.030      0.030      0.030     
R-squared Adj. 0.028      0.028      0.028     
cigs           -0.463***  -0.029***            
               (0.092)    (0.006)              
faminc         0.093***   0.006***   0.093***  
               (0.029)    (0.002)    (0.029)   
packs                                -9.268*** 
                                     (1.832)   
N              1388       1388       1388      
R2             0.030      0.030      0.030     
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


### Example6.1. Effects of pollution on housing prices

In [3]:
df = dataWoo('hprice2')
df1 = df[['price', 'nox', 'crime', 'rooms', 'dist', 'stratio']]

zprice = pd.DataFrame({"zprice":ss.zscore(df1.loc[:,"price"])})
znox = pd.DataFrame({"znox":ss.zscore(df1.loc[:,"nox"])})
zcrime = pd.DataFrame({"zcrime":ss.zscore(df1.loc[:,"crime"])})
zrooms = pd.DataFrame({"zrooms":ss.zscore(df1.loc[:,"rooms"])})
zdist = pd.DataFrame({"zdist":ss.zscore(df1.loc[:,"dist"])})
zstratio = pd.DataFrame({"zstratio":ss.zscore(df1.loc[:,"stratio"])})

df2 = pd.concat([zprice,znox,zcrime,zrooms,zdist,zstratio],axis=1) 


In [4]:
hprice_std = smf.ols(formula='zprice ~ znox + zcrime + zrooms + zdist + zstratio + 1', data=df2).fit()
print(hprice_std.summary())

                            OLS Regression Results                            
Dep. Variable:                 zprice   R-squared:                       0.636
Model:                            OLS   Adj. R-squared:                  0.632
Method:                 Least Squares   F-statistic:                     174.5
Date:                Sun, 30 Jun 2024   Prob (F-statistic):          3.61e-107
Time:                        18:52:56   Log-Likelihood:                -462.53
No. Observations:                 506   AIC:                             937.1
Df Residuals:                     500   BIC:                             962.4
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   8.674e-17      0.027   3.21e-15      1.0

In [5]:
hprice_std = smf.ols(formula='zprice ~ znox + zcrime + zrooms + zdist + zstratio + 1', data=df2).fit()
print(hprice_std.summary())

                            OLS Regression Results                            
Dep. Variable:                 zprice   R-squared:                       0.636
Model:                            OLS   Adj. R-squared:                  0.632
Method:                 Least Squares   F-statistic:                     174.5
Date:                Sun, 30 Jun 2024   Prob (F-statistic):          3.61e-107
Time:                        18:52:56   Log-Likelihood:                -462.53
No. Observations:                 506   AIC:                             937.1
Df Residuals:                     500   BIC:                             962.4
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   8.674e-17      0.027   3.21e-15      1.0

### Compare the result in Example 4.5.

In [6]:
import math
df['ldist'] = np.log(df['dist'])
hprice_log = smf.ols(formula='lprice ~ lnox + ldist + rooms + stratio + 1', data=df).fit()
print(hprice_log.summary())

                            OLS Regression Results                            
Dep. Variable:                 lprice   R-squared:                       0.584
Model:                            OLS   Adj. R-squared:                  0.581
Method:                 Least Squares   F-statistic:                     175.9
Date:                Sun, 30 Jun 2024   Prob (F-statistic):           5.53e-94
Time:                        18:52:56   Log-Likelihood:                -43.495
No. Observations:                 506   AIC:                             96.99
Df Residuals:                     501   BIC:                             118.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     11.0839      0.318     34.843      0.0

### Equation (6.7)

In [7]:
hprice_eq6_7 = smf.ols(formula='lprice ~ lnox + rooms + 1', data=df).fit()
print(hprice_eq6_7.summary())

                            OLS Regression Results                            
Dep. Variable:                 lprice   R-squared:                       0.514
Model:                            OLS   Adj. R-squared:                  0.512
Method:                 Least Squares   F-statistic:                     265.7
Date:                Sun, 30 Jun 2024   Prob (F-statistic):           1.79e-79
Time:                        18:52:56   Log-Likelihood:                -83.009
No. Observations:                 506   AIC:                             172.0
Df Residuals:                     503   BIC:                             184.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      9.2337      0.188     49.184      0.0

### Equation (6.12)

In [8]:
df = dataWoo('wage1')
wage_exp = smf.ols(formula='wage ~ exper + expersq + 1', data=df).fit()
print(wage_exp.summary())

                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.093
Model:                            OLS   Adj. R-squared:                  0.089
Method:                 Least Squares   F-statistic:                     26.74
Date:                Sun, 30 Jun 2024   Prob (F-statistic):           8.77e-12
Time:                        18:52:56   Log-Likelihood:                -1407.5
No. Observations:                 526   AIC:                             2821.
Df Residuals:                     523   BIC:                             2834.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.7254      0.346     10.769      0.0

### Example6.2. Effects of pollution on housing prices

In [9]:
df = dataWoo('hprice2')
df['ldist'] = np.log(df['dist'])
df['roomsq'] = np.square(df['rooms'])
hprice_roomsq = smf.ols(formula='lprice ~ lnox + ldist + rooms + roomsq + stratio + 1', data=df).fit()
print(hprice_roomsq.summary())

                            OLS Regression Results                            
Dep. Variable:                 lprice   R-squared:                       0.603
Model:                            OLS   Adj. R-squared:                  0.599
Method:                 Least Squares   F-statistic:                     151.8
Date:                Sun, 30 Jun 2024   Prob (F-statistic):           7.89e-98
Time:                        18:52:56   Log-Likelihood:                -31.806
No. Observations:                 506   AIC:                             75.61
Df Residuals:                     500   BIC:                             101.0
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     13.3855      0.566     23.630      0.0

### Example6.3. Effects of attendance on final exam performance

In [10]:
df = dataWoo('attend')
df['priGPAsq'] = np.square(df['priGPA'])
df['ACTsq'] = np.square(df['ACT'])
df['priGPA_atndrte'] = df['priGPA']*df['atndrte']
attned_perf = smf.ols(formula='stndfnl ~ atndrte + priGPA + ACT + priGPAsq + ACTsq + priGPA_atndrte + 1', 
                      data=df).fit()
print(attned_perf.summary())

                            OLS Regression Results                            
Dep. Variable:                stndfnl   R-squared:                       0.229
Model:                            OLS   Adj. R-squared:                  0.222
Method:                 Least Squares   F-statistic:                     33.25
Date:                Sun, 30 Jun 2024   Prob (F-statistic):           3.49e-35
Time:                        18:52:56   Log-Likelihood:                -868.90
No. Observations:                 680   AIC:                             1752.
Df Residuals:                     673   BIC:                             1783.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          2.0503      1.360      1.

### Example 6.4. CEO compensation and frim perfromance

In [11]:
df = dataWoo('ceosal1')

salary_lin = smf.ols(formula='salary ~ sales + roe + 1', data=df).fit()
salary_log = smf.ols(formula='lsalary ~ lsales + roe + 1', data=df).fit()

print(summary_col([salary_lin, salary_log],stars=True,float_format='%0.3f',
                  model_names=['salary_lin','salary_log'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared)}))



               salary_lin salary_log
------------------------------------
Intercept      830.631*** 4.362***  
               (223.905)  (0.294)   
R-squared      0.029      0.282     
R-squared Adj. 0.020      0.275     
lsales                    0.275***  
                          (0.033)   
roe            19.631*    0.018***  
               (11.077)   (0.004)   
sales          0.016*               
               (0.009)              
N              209        209       
R2             0.029      0.282     
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


### Example 6.5. Confidence interval for predicted college GPA

In [12]:
df = dataWoo('gpa2')
df['hsizesq'] = np.square(df['hsize'])

gpa_lin = smf.ols(formula='colgpa ~ sat + hsperc + hsize + hsizesq + 1', data=df).fit()
print(gpa_lin.summary())

                            OLS Regression Results                            
Dep. Variable:                 colgpa   R-squared:                       0.278
Model:                            OLS   Adj. R-squared:                  0.277
Method:                 Least Squares   F-statistic:                     398.0
Date:                Sun, 30 Jun 2024   Prob (F-statistic):          2.13e-290
Time:                        18:52:56   Log-Likelihood:                -3467.9
No. Observations:                4137   AIC:                             6946.
Df Residuals:                    4132   BIC:                             6978.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.4927      0.075     19.812      0.0

In [13]:
df['sat0'] = df['sat']-1200
df['hsize0'] = df['hsize']-5
df['hsperc0'] = df['hsperc']-30
df['hsize0sq'] = np.square(df['hsize0'])

gpa_predict = smf.ols(formula='colgpa ~ sat0 + hsperc0 + hsize0 + hsize0sq + 1', data=df).fit()
print(gpa_predict.summary())

                            OLS Regression Results                            
Dep. Variable:                 colgpa   R-squared:                       0.278
Model:                            OLS   Adj. R-squared:                  0.277
Method:                 Least Squares   F-statistic:                     398.0
Date:                Sun, 30 Jun 2024   Prob (F-statistic):          2.13e-290
Time:                        18:52:56   Log-Likelihood:                -3467.9
No. Observations:                4137   AIC:                             6946.
Df Residuals:                    4132   BIC:                             6978.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.7001      0.020    135.833      0.0

In [14]:
print(summary_col([gpa_lin, gpa_predict],stars=True,float_format='%0.3f',
                  model_names=['salary_lin','gpa_predict'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared)}))



               salary_lin gpa_predict
-------------------------------------
Intercept      1.493***   2.700***   
               (0.075)    (0.020)    
R-squared      0.278      0.278      
R-squared Adj. 0.277      0.277      
hsize          -0.061***             
               (0.017)               
hsize0                    -0.006     
                          (0.009)    
hsize0sq                  0.005**    
                          (0.002)    
hsizesq        0.005**               
               (0.002)               
hsperc         -0.014***             
               (0.001)               
hsperc0                   -0.014***  
                          (0.001)    
sat            0.001***              
               (0.000)               
sat0                      0.001***   
                          (0.000)    
N              4137       4137       
R2             0.278      0.278      
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


### Example 6.6. Confidence Interval for Future Collage GPA

In [15]:
gpa_lin = smf.ols(formula='colgpa ~ sat + hsperc + hsize + hsizesq + 1', data=df).fit()
print(gpa_lin.summary())

                            OLS Regression Results                            
Dep. Variable:                 colgpa   R-squared:                       0.278
Model:                            OLS   Adj. R-squared:                  0.277
Method:                 Least Squares   F-statistic:                     398.0
Date:                Sun, 30 Jun 2024   Prob (F-statistic):          2.13e-290
Time:                        18:52:56   Log-Likelihood:                -3467.9
No. Observations:                4137   AIC:                             6946.
Df Residuals:                    4132   BIC:                             6978.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.4927      0.075     19.812      0.0

In [16]:
predicted_value= 1200*.0015 + 30 * -(.0139) + 5*-(.0609) + 5*5*.0055 + 1.4927
predicted_value

2.7087

### Example 6.7. Predicting CEO log(salary)

In [17]:
df = dataWoo('ceosal2')
ceo_step1 = smf.ols(formula='lsalary ~ lsales + lmktval + ceoten + 1', data=df).fit()
print(ceo_step1.summary())

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.318
Model:                            OLS   Adj. R-squared:                  0.306
Method:                 Least Squares   F-statistic:                     26.91
Date:                Sun, 30 Jun 2024   Prob (F-statistic):           2.47e-14
Time:                        18:52:56   Log-Likelihood:                -128.12
No. Observations:                 177   AIC:                             264.2
Df Residuals:                     173   BIC:                             276.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.5038      0.257     17.509      0.0

In [18]:
uhat = df.lsalary - ceo_step1.predict()
ehat=np.exp(uhat)
ehat.mean()

1.135661326663072

In [19]:
mhat=np.exp(ceo_step1.predict())
ceo_step2 = smf.ols(formula='salary ~ mhat + 0', data=df).fit()
ceo_step2.params #The coef. as in equation 46.44

mhat    1.116857
dtype: float64

In [20]:
ceo_step3= smf.ols(formula='lsalary ~ lsales + lmktval + ceoten + 1', data=df).fit()
ceo_step3.params

Intercept    4.503795
lsales       0.162854
lmktval      0.109243
ceoten       0.011705
dtype: float64

In [21]:
ceo_step3_pred = 4.5038 + .1629*np.log(5000) + .1092*np.log(10000) + .0117*10
ceo_step3_pred

7.014019939501504

In [22]:
ceo_step4 = smf.ols(formula='salary ~ mhat + 0', data=df).fit()
ceo_step4.params

mhat    1.116857
dtype: float64

In [23]:
ceo_step4_pred = 1.117*np.exp(7.013)
ceo_step4_pred

1240.9674054171805

### Example 6.8. Predicting CEO salary

In [24]:
corr = ss.pearsonr(df.salary, mhat)
print(corr) # Returns correlation coeficient and pvalue.

PearsonRResult(statistic=0.49303222976474653, pvalue=3.1363578178118685e-12)


In [25]:
ceo_sal= smf.ols(formula='salary ~ sales + mktval + ceoten + 1', data=df).fit()
print(ceo_sal.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.201
Model:                            OLS   Adj. R-squared:                  0.187
Method:                 Least Squares   F-statistic:                     14.53
Date:                Sun, 30 Jun 2024   Prob (F-statistic):           1.74e-08
Time:                        18:52:57   Log-Likelihood:                -1359.3
No. Observations:                 177   AIC:                             2727.
Df Residuals:                     173   BIC:                             2739.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    613.4361     65.237      9.403      0.0