# Chapter 7. Multiple Regression Analysis with Qualitative Information 
[Home](http://solomonegash.com/) | [Stata](http://solomonegash.com/woodridge1/index.html) | [R](http://solomonegash.com/econometrics/rbook1/index.html)


In [1]:
import numpy as np
import statsmodels.formula.api as smf

from wooldridge import *

### Example 7.1. Hourly wage equation

In [2]:
df = dataWoo('wage1')

In [3]:
wage_female = smf.ols(formula='wage ~ female + educ + exper + tenure + 1', data=df).fit()
print(wage_female.summary())
# b1 measures the average wage difference between men and women who have the same level of educ, exper and tenure.

                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.364
Model:                            OLS   Adj. R-squared:                  0.359
Method:                 Least Squares   F-statistic:                     74.40
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           7.30e-50
Time:                        17:51:17   Log-Likelihood:                -1314.2
No. Observations:                 526   AIC:                             2638.
Df Residuals:                     521   BIC:                             2660.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.5679      0.725     -2.164      0.0

In [4]:
wage_female = smf.ols(formula='wage ~ female + 1', data=df).fit()
print(wage_female.summary())
# b0 is the average wage for men in the sample.

                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.116
Model:                            OLS   Adj. R-squared:                  0.114
Method:                 Least Squares   F-statistic:                     68.54
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           1.04e-15
Time:                        17:51:17   Log-Likelihood:                -1400.7
No. Observations:                 526   AIC:                             2805.
Df Residuals:                     524   BIC:                             2814.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      7.0995      0.210     33.806      0.0

In [5]:
#Mean wage for male and female
from pandas.api.types import CategoricalDtype
df = df.melt(['wage', 'educ','exper', 'tenure', 'female'])
df.groupby('female').mean(numeric_only=True) 

Unnamed: 0_level_0,wage,educ,exper,tenure,value
female,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,7.099489,12.788321,17.558394,6.474453,32.003058
1,4.587659,12.31746,16.428571,3.615079,26.502281


### Example 7.2. Effect of computer ownership on collage GPA

In [6]:
df = dataWoo('gpa1')
colGPA_ur = smf.ols(formula='colGPA ~ PC + hsGPA + ACT + 1', data=df).fit()
print(colGPA_ur.summary())

                            OLS Regression Results                            
Dep. Variable:                 colGPA   R-squared:                       0.219
Model:                            OLS   Adj. R-squared:                  0.202
Method:                 Least Squares   F-statistic:                     12.83
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           1.93e-07
Time:                        17:51:17   Log-Likelihood:                -42.796
No. Observations:                 141   AIC:                             93.59
Df Residuals:                     137   BIC:                             105.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.2635      0.333      3.793      0.0

In [7]:
colGPA_r = smf.ols(formula='colGPA ~ PC + 1', data=df).fit()
print(colGPA_r.summary())

                            OLS Regression Results                            
Dep. Variable:                 colGPA   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     7.314
Date:                Tue, 02 Jul 2024   Prob (F-statistic):            0.00770
Time:                        17:51:17   Log-Likelihood:                -56.641
No. Observations:                 141   AIC:                             117.3
Df Residuals:                     139   BIC:                             123.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.9894      0.040     75.678      0.0

In [8]:
df = df.melt(['PC', 'colGPA'])
df.groupby('PC').mean(numeric_only=True) 

Unnamed: 0_level_0,colGPA,value
PC,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2.989412,2.256078
1,3.158929,2.250529


### Example 7.3. Effect of Training Grants on hours of training

In [9]:
df = dataWoo('jtrain')
df = df[(df['year']==1988)]
jobb_reg = smf.ols(formula='hrsemp ~ grant + lsales + lemploy + 1', data=df).fit()
print(jobb_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                 hrsemp   R-squared:                       0.237
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                     10.44
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           4.80e-06
Time:                        17:51:17   Log-Likelihood:                -482.29
No. Observations:                 105   AIC:                             972.6
Df Residuals:                     101   BIC:                             983.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     46.6651     43.412      1.075      0.2

### Example 7.4. Housing price regression

In [10]:
df = dataWoo('hprice1')
hrpice_reg = smf.ols(formula='lprice ~ llotsize + lsqrft  + bdrms + colonial + 1', data=df).fit()
print(hrpice_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                 lprice   R-squared:                       0.649
Model:                            OLS   Adj. R-squared:                  0.632
Method:                 Least Squares   F-statistic:                     38.38
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           3.74e-18
Time:                        17:51:17   Log-Likelihood:                 26.619
No. Observations:                  88   AIC:                            -43.24
Df Residuals:                      83   BIC:                            -30.85
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.3496      0.651     -2.073      0.0

### Example7.5. Hourly wage equation

In [11]:
df = dataWoo('wage1')
wage_reg = smf.ols(formula='lwage ~ female + educ  + exper + expersq + tenure + tenursq + 1', data=df).fit()
print(wage_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.441
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     68.18
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           2.11e-62
Time:                        17:51:17   Log-Likelihood:                -260.59
No. Observations:                 526   AIC:                             535.2
Df Residuals:                     519   BIC:                             565.0
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4167      0.099      4.212      0.0

In [12]:
level = np.exp(-.2965) - 1 
level

-0.2565843727358956

### Example 7.6. Hourly wage equation 

In [13]:
df = dataWoo('wage1')
df = df.melt(['wage','lwage', 'educ','exper', 'expersq', 'tenure', 'tenursq', 'female', 'married' ])
wage_reg = smf.ols('lwage ~ female * married + educ  + exper + expersq + tenure + tenursq + 1', data=df).fit()
print(wage_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.461
Model:                            OLS   Adj. R-squared:                  0.460
Method:                 Least Squares   F-statistic:                     842.1
Date:                Tue, 02 Jul 2024   Prob (F-statistic):               0.00
Time:                        17:51:18   Log-Likelihood:                -3764.3
No. Observations:                7890   AIC:                             7547.
Df Residuals:                    7881   BIC:                             7609.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.3214      0.026     12.

### Example 7.7. Effects of physical attractiveness on wage

In [14]:
df = dataWoo('beauty')
df = df[(df['female']==0)]
wage_reg = smf.ols('lwage ~ belavg + abvavg + educ  + exper + expersq + union + married + black + south + goodhlth + 1', 
                   data=df).fit()
print(wage_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.255
Model:                            OLS   Adj. R-squared:                  0.246
Method:                 Least Squares   F-statistic:                     27.82
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           5.64e-46
Time:                        17:51:18   Log-Likelihood:                -540.02
No. Observations:                 824   AIC:                             1102.
Df Residuals:                     813   BIC:                             1154.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4778      0.120      3.970      0.0

In [15]:
df = dataWoo('beauty')
df = df[(df['female']==1)]
beauty_reg = smf.ols('lwage ~ belavg + abvavg + educ  + exper + expersq + union + married + black + south + goodhlth + 1',
                     data=df).fit()
print(beauty_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.278
Model:                            OLS   Adj. R-squared:                  0.261
Method:                 Least Squares   F-statistic:                     16.40
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           4.31e-25
Time:                        17:51:18   Log-Likelihood:                -265.56
No. Observations:                 436   AIC:                             553.1
Df Residuals:                     425   BIC:                             598.0
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0773      0.144     -0.536      0.5

### Example 7.8. Effects of law school rankings on starting salaries

In [16]:
df = dataWoo('lawsch85')

df['r61_100'] = 0
df.loc[(df['rank'] > 60) & (df['rank'] <= 100), 'r61_100'] = 1

r_reg = smf.ols(formula = 
                     'lsalary ~ top10 + r11_25  + r26_40 + r41_60 + r61_100 + LSAT + GPA + llibvol + lcost + 1', 
                     data=df).fit()
print(r_reg.summary())


                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.911
Model:                            OLS   Adj. R-squared:                  0.905
Method:                 Least Squares   F-statistic:                     143.2
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           9.45e-62
Time:                        17:51:18   Log-Likelihood:                 146.45
No. Observations:                 136   AIC:                            -272.9
Df Residuals:                     126   BIC:                            -243.8
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      9.1653      0.411     22.277      0.0

In [17]:
hypotheses = '(LSAT = GPA = llibvol = lcost = 0)'
f_test = r_reg.f_test(hypotheses)
print(f_test)

<F test: F=2.3853161323547907, p=0.05470437645682231, df_denom=126, df_num=4>


In [18]:
display = np.exp(0.6996)-1
display

1.0129473674662273

### Example 7.9. Effects of computer usage on wages 
#### Data not available

### Example 7.10. Log hourly wage equation

In [19]:
df=dataWoo('wage1')
wage_reg = smf.ols('lwage ~ female*educ  + exper + expersq + tenure + tenursq + 1', data=df).fit()
print(wage_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.441
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     58.37
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           1.67e-61
Time:                        17:51:18   Log-Likelihood:                -260.49
No. Observations:                 526   AIC:                             537.0
Df Residuals:                     518   BIC:                             571.1
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.3888      0.119      3.276      

In [20]:
hypotheses = '(female:educ = female = 0)'
f_test = wage_reg.f_test(hypotheses)
print(f_test)

<F test: F=34.32554911448261, p=1.0023439572216499e-14, df_denom=518, df_num=2>


### Example 7.11. Effects of race on baseball player salaries

In [21]:
df=dataWoo('mlb1')
df = df[(df['percblck']!=0)]
mlb1_reg = smf.ols('lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr + runsyr + fldperc + allstar + black + hispan + black:percblck + hispan:perchisp  + 1', data=df).fit()
print(mlb1_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.638
Model:                            OLS   Adj. R-squared:                  0.624
Method:                 Least Squares   F-statistic:                     46.48
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           1.31e-62
Time:                        17:51:18   Log-Likelihood:                -350.12
No. Observations:                 330   AIC:                             726.2
Df Residuals:                     317   BIC:                             775.6
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          10.3437      2.183     

In [22]:
hypotheses = '(black = hispan = black:percblck = hispan:perchisp = 0)'
f_test = mlb1_reg.f_test(hypotheses)
print(f_test)

<F test: F=2.647888304563965, p=0.033476147356758516, df_denom=317, df_num=4>


In [23]:
df=dataWoo('mlb1')
df = df[(df['percblck']!=0)]
mlb1_reg_r = smf.ols('lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr + runsyr + fldperc + allstar + 1', data=df).fit()
print(mlb1_reg_r.summary())

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.644
Model:                            OLS   Adj. R-squared:                  0.636
Method:                 Least Squares   F-statistic:                     77.94
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           1.35e-72
Time:                        17:51:18   Log-Likelihood:                -377.03
No. Observations:                 353   AIC:                             772.1
Df Residuals:                     344   BIC:                             806.9
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     10.3277      2.002      5.159      0.0

### Equation [7.22]  (Page 222)

In [24]:
df=dataWoo('gpa3')
df = df[(df['spring']==1)]
gpa3_reg = smf.ols('cumgpa ~ female*sat + hsperc + female:hsperc + tothrs + female:tothrs + 1', data=df).fit()
print(gpa3_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                 cumgpa   R-squared:                       0.406
Model:                            OLS   Adj. R-squared:                  0.394
Method:                 Least Squares   F-statistic:                     34.95
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           4.66e-37
Time:                        17:51:18   Log-Likelihood:                -237.26
No. Observations:                 366   AIC:                             490.5
Df Residuals:                     358   BIC:                             521.7
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         1.4808      0.207      7.142

In [25]:
hypotheses = '(female:tothrs = female:sat = female:hsperc = 0)'
f_test = gpa3_reg.f_test(hypotheses)
print(f_test)

<F test: F=1.5338978108616452, p=0.20537335628140385, df_denom=358, df_num=3>


### Equation [7.25] (Page 224)

In [26]:
df=dataWoo('gpa3')
df = df[(df['spring']==1)]
gpa3_reg_r = smf.ols('cumgpa ~ female + sat + hsperc + tothrs + 1', data=df).fit()
print(gpa3_reg_r.summary())

                            OLS Regression Results                            
Dep. Variable:                 cumgpa   R-squared:                       0.398
Model:                            OLS   Adj. R-squared:                  0.392
Method:                 Least Squares   F-statistic:                     59.74
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           1.10e-38
Time:                        17:51:18   Log-Likelihood:                -239.59
No. Observations:                 366   AIC:                             489.2
Df Residuals:                     361   BIC:                             508.7
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3285      0.180      7.388      0.0

### Equation [7.29] (Page 225)

In [27]:
df = dataWoo('mroz')
mroz_reg = smf.ols('inlf ~ nwifeinc + educ + exper + expersq + age + kidslt6 + kidsge6 + 1', data=df).fit()
print(mroz_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                   inlf   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.257
Method:                 Least Squares   F-statistic:                     38.22
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           6.90e-46
Time:                        17:51:18   Log-Likelihood:                -423.89
No. Observations:                 753   AIC:                             863.8
Df Residuals:                     745   BIC:                             900.8
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5855      0.154      3.798      0.0

### Example7.12. A linear probability model of arrest 

In [28]:
df = dataWoo('crime1')

df['arr86'] = 0
df.loc[(df['narr86'] > 0), 'arr86'] = 1

crime_reg = smf.ols(' df.arr86 ~ pcnv + avgsen + tottime + ptime86 + qemp86 + 1', data=df).fit()
print(crime_reg.summary())


                            OLS Regression Results                            
Dep. Variable:               df.arr86   R-squared:                       0.047
Model:                            OLS   Adj. R-squared:                  0.046
Method:                 Least Squares   F-statistic:                     27.03
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           9.09e-27
Time:                        17:51:18   Log-Likelihood:                -1609.7
No. Observations:                2725   AIC:                             3231.
Df Residuals:                    2719   BIC:                             3267.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4406      0.017     25.568      0.0

In [29]:
hypotheses = '(avgsen = tottime = 0)'
f_test = crime_reg.f_test(hypotheses)
print(f_test)

<F test: F=1.0597004440363245, p=0.346702695391494, df_denom=2.72e+03, df_num=2>


### Equation [7.32] (Page 228)

In [30]:
crime_reg_2 = smf.ols(' df.arr86 ~ pcnv + avgsen + tottime + ptime86 + qemp86 + black + hispan + 1', data=df).fit()
print(crime_reg_2.summary())

                            OLS Regression Results                            
Dep. Variable:               df.arr86   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.066
Method:                 Least Squares   F-statistic:                     28.41
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           5.46e-38
Time:                        17:51:18   Log-Likelihood:                -1579.6
No. Observations:                2725   AIC:                             3175.
Df Residuals:                    2717   BIC:                             3222.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3804      0.019     20.314      0.0

### Equation [7.33] (Page 229)

In [31]:
df = dataWoo('jtrain')
df = df[(df['year']==1988)]
jobb_reg = smf.ols(' lscrap ~ grant + lsales + lemploy + 1', data=df).fit()
print(jobb_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                 lscrap   R-squared:                       0.072
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     1.182
Date:                Tue, 02 Jul 2024   Prob (F-statistic):              0.327
Time:                        17:51:18   Log-Likelihood:                -85.161
No. Observations:                  50   AIC:                             178.3
Df Residuals:                      46   BIC:                             186.0
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.9868      4.656      1.071      0.2

### Equation [7.35] (Page 231)

In [32]:
df = dataWoo('fertil2')
fert_reg = smf.ols(' children ~ age + educ + 1', data=df).fit()
print(fert_reg.summary())

                            OLS Regression Results                            
Dep. Variable:               children   R-squared:                       0.560
Model:                            OLS   Adj. R-squared:                  0.559
Method:                 Least Squares   F-statistic:                     2768.
Date:                Tue, 02 Jul 2024   Prob (F-statistic):               0.00
Time:                        17:51:18   Log-Likelihood:                -7881.7
No. Observations:                4361   AIC:                         1.577e+04
Df Residuals:                    4358   BIC:                         1.579e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.9967      0.094    -21.249      0.0

### Equation [7.37] (Page 232)

In [33]:
df = dataWoo('fertil2')
fert_reg_2 = smf.ols(' children ~ age + educ + electric + 1', data=df).fit()
print(fert_reg_2.summary())

                            OLS Regression Results                            
Dep. Variable:               children   R-squared:                       0.562
Model:                            OLS   Adj. R-squared:                  0.562
Method:                 Least Squares   F-statistic:                     1863.
Date:                Tue, 02 Jul 2024   Prob (F-statistic):               0.00
Time:                        17:51:18   Log-Likelihood:                -7863.3
No. Observations:                4358   AIC:                         1.573e+04
Df Residuals:                    4354   BIC:                         1.576e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -2.0711      0.095    -21.861      0.0