### 2024-05-07 Categorical variables

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the tab-separated, bz2-compressed income/height data set
# (pandas infers the compression from the file extension).
# NOTE(review): the displayed frame carries an `Unnamed: 0` column, which
# looks like a row index saved with the file -- consider `index_col=0`.
ih = pd.read_csv("income-height.csv.bz2", sep="\t")
ih.head(3)

Unnamed: 0,income,height,weight,age,marital,sex,education,afqt
0,19000,60,155.0,53,married,female,13.0,6.841
1,35000,70,156.0,51,married,female,10.0,49.444
2,105000,65,195.0,52,married,male,16.0,99.393


In [4]:
# Number of observations in each sex category.
ih["sex"].value_counts()

sex
female    3604
male      3402
Name: count, dtype: int64

In [5]:
# Average income within each sex group; the regressions further down
# reproduce these two group means through their coefficients.
ih.groupby("sex")["income"].mean()

sex
female    29587.564095
male      53510.055850
Name: income, dtype: float64

In [6]:
# Build a 0/1 indicator column: 1 where sex is "female", 0 otherwise.
ih["female"] = ih["sex"].eq("female").astype(int)
ih.head(3)

Unnamed: 0,income,height,weight,age,marital,sex,education,afqt,female
0,19000,60,155.0,53,married,female,13.0,6.841,1
1,35000,70,156.0,51,married,female,10.0,49.444,1
2,105000,65,195.0,52,married,male,16.0,99.393,0


In [7]:
import statsmodels.formula.api as smf

In [8]:
# Regress income on the hand-made female dummy.
# With a single 0/1 regressor, the intercept is the mean income of the
# female == 0 group and the slope is the difference in mean income between
# the female == 1 and female == 0 groups.
m = smf.ols("income ~ female", data=ih).fit()
m.summary()

0,1,2,3
Dep. Variable:,income,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.046
Method:,Least Squares,F-statistic:,335.9
Date:,"Tue, 07 May 2024",Prob (F-statistic):,2.5000000000000002e-73
Time:,14:59:41,Log-Likelihood:,-86360.0
No. Observations:,7006,AIC:,172700.0
Df Residuals:,7004,BIC:,172700.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.351e+04,936.142,57.160,0.000,5.17e+04,5.53e+04
female,-2.392e+04,1305.222,-18.328,0.000,-2.65e+04,-2.14e+04

0,1,2,3
Omnibus:,4908.306,Durbin-Watson:,1.731
Prob(Omnibus):,0.0,Jarque-Bera (JB):,73316.159
Skew:,3.259,Prob(JB):,0.0
Kurtosis:,17.446,Cond. No.,2.65


In [9]:
# Same regression, but passing the string column directly: the formula
# interface dummy-codes it automatically. The summary reports `sex[T.male]`,
# i.e. "female" serves as the reference level, so the intercept is the
# female mean and the coefficient has the opposite sign to the cell above.
m = smf.ols("income ~ sex", data=ih).fit()
m.summary()

0,1,2,3
Dep. Variable:,income,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.046
Method:,Least Squares,F-statistic:,335.9
Date:,"Tue, 07 May 2024",Prob (F-statistic):,2.5000000000000002e-73
Time:,14:59:41,Log-Likelihood:,-86360.0
No. Observations:,7006,AIC:,172700.0
Df Residuals:,7004,BIC:,172700.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.959e+04,909.529,32.531,0.000,2.78e+04,3.14e+04
sex[T.male],2.392e+04,1305.222,18.328,0.000,2.14e+04,2.65e+04

0,1,2,3
Omnibus:,4908.306,Durbin-Watson:,1.731
Prob(Omnibus):,0.0,Jarque-Bera (JB):,73316.159
Skew:,3.259,Prob(JB):,0.0
Kurtosis:,17.446,Cond. No.,2.59


In [10]:
# Share of respondents whose income is exactly zero, by sex:
# flag zero incomes first, then average the boolean flag within each group.
ih["income"].eq(0).groupby(ih["sex"]).mean()

sex
female    0.276082
male      0.218989
Name: income, dtype: float64

In [11]:
# Number of observations in each marital-status category.
ih["marital"].value_counts()

marital
married      3806
divorced     1549
single       1124
separated     366
widowed       161
Name: count, dtype: int64

In [12]:
# Average income within each marital-status category.
ih.groupby("marital")["income"].mean()

marital
divorced     33441.393802
married      51273.746978
separated    21276.453552
single       26719.613879
widowed      24261.881988
Name: income, dtype: float64

In [13]:
# NOTE(review): this repeats the previous cell's computation (mean income
# by marital status); kept so the group means sit directly above the
# regression that reproduces them.
ih.groupby("marital")["income"].agg("mean")

marital
divorced     33441.393802
married      51273.746978
separated    21276.453552
single       26719.613879
widowed      24261.881988
Name: income, dtype: float64

In [14]:
# Regress income on the five-level marital category.
# The summary lists coefficients for married/separated/single/widowed only,
# so "divorced" is the reference level: the intercept is the divorced mean
# and every other coefficient is that group's mean minus the divorced mean.
m = smf.ols("income ~ marital", data=ih).fit()
m.summary()

0,1,2,3
Dep. Variable:,income,R-squared:,0.041
Model:,OLS,Adj. R-squared:,0.041
Method:,Least Squares,F-statistic:,75.65
Date:,"Tue, 07 May 2024",Prob (F-statistic):,6.88e-63
Time:,15:01:00,Log-Likelihood:,-86376.0
No. Observations:,7006,AIC:,172800.0
Df Residuals:,7001,BIC:,172800.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.344e+04,1390.787,24.045,0.000,3.07e+04,3.62e+04
marital[T.married],1.783e+04,1649.704,10.809,0.000,1.46e+04,2.11e+04
marital[T.separated],-1.216e+04,3181.299,-3.824,0.000,-1.84e+04,-5928.631
marital[T.single],-6721.7799,2144.751,-3.134,0.002,-1.09e+04,-2517.418
marital[T.widowed],-9179.5118,4532.583,-2.025,0.043,-1.81e+04,-294.277

0,1,2,3
Omnibus:,5063.277,Durbin-Watson:,1.784
Prob(Omnibus):,0.0,Jarque-Bera (JB):,80910.389
Skew:,3.381,Prob(JB):,0.0
Kurtosis:,18.213,Cond. No.,8.41
