In [3]:
import pandas as pd

In [4]:
cpa_data = pd.read_excel('cpa-simulations.xlsx')
cpa_data

Unnamed: 0,CPA,skill,earnings
0,1,1,96980.64
1,0,1,66755.36
2,1,1,94581.53
3,1,1,91481.91
4,0,1,73923.35
...,...,...,...
4995,0,0,50389.67
4996,1,0,84060.51
4997,0,0,45083.38
4998,0,0,58762.40


### Was ist der Effekt eines WP-Examens auf das Einkommen?

**Alternative 1:** Vergleich des durchschnittlichen Einkommens von WPs mit nicht WPs

In [13]:
mean_cpa        = cpa_data[cpa_data['CPA'] == 1]['earnings'].mean()
mean_non_cpa    = cpa_data[cpa_data['CPA'] == 0]['earnings'].mean()

mean_cpa, mean_non_cpa, mean_cpa-mean_non_cpa

(90467.48747603834, 54362.88702323718, 36104.60045280116)

**Alternative 2:** Berechnung eines linearen Regressionsmodells.

In [6]:
import statsmodels.api as sm

X = cpa_data['CPA']
Y = cpa_data['earnings']

model   = sm.OLS(Y, sm.add_constant(X))
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               earnings   R-squared:                       0.689
Model:                            OLS   Adj. R-squared:                  0.689
Method:                 Least Squares   F-statistic:                 1.105e+04
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        11:27:57   Log-Likelihood:                -54115.
No. Observations:                5000   AIC:                         1.082e+05
Df Residuals:                    4998   BIC:                         1.082e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.436e+04    243.006    223.710      0.0

In [22]:
results.params

const    49947.843393
CPA      30347.939562
skill    14595.958809
dtype: float64

In [16]:
results.params['CPA']

36104.60045280153

### Was ist der kausale Effekt eines WP-Examens auf das Einkommen?

**Alternative 1:** Vergleich des durchschnittlichen Einkommens von WPs mit nicht WPs, die gleiche Fähigkeiten besitzen. Der kausale Effekt gibt sich dann als eine gewichtete Summe der Mittelwert-Differenzen.

In [17]:
# Vergleich der Accountants mit hohen Fähigkeiten (`skill==1`)
mean_cpa_skilled        = cpa_data[(cpa_data['CPA'] == 1) & (cpa_data['skill'] == 1)]['earnings'].mean()
mean_non_cpa_skilled    = cpa_data[(cpa_data['CPA'] == 0) & (cpa_data['skill'] == 1)]['earnings'].mean()
weight_skilled          = len(cpa_data[cpa_data['skill'] == 1]) / len(cpa_data)

mean_cpa_skilled, mean_non_cpa_skilled, weight_skilled

(94903.57060171918, 64516.46270198676, 0.5)

In [18]:
# Vergleich der Accountants mit geringen Fähigkeiten (`skill==0`)
mean_cpa_unskilled      = cpa_data[(cpa_data['CPA'] == 1) & (cpa_data['skill'] == 0)]['earnings'].mean()
mean_non_cpa_unskilled  = cpa_data[(cpa_data['CPA'] == 0) & (cpa_data['skill'] == 0)]['earnings'].mean()
weight_unskilled        = len(cpa_data[cpa_data['skill'] == 0]) / len(cpa_data)

mean_cpa_unskilled, mean_non_cpa_unskilled, weight_unskilled

(80268.58753623188, 49959.699408386, 0.5)

In [19]:
# Ermittlung des kausalen Effektes durch Gewichtung der Gruppen 
(mean_cpa_skilled - mean_non_cpa_skilled) * weight_skilled + \
    (mean_cpa_unskilled - mean_non_cpa_unskilled) * weight_unskilled

30347.99801378915

**Alternative 2:** Berechnung eines multiplen linearen Regressionsmodells (unter Berücksichtigung von Kontrollvariablen).

In [20]:
import statsmodels.api as sm

X = cpa_data[['CPA', 'skill']]
Y = cpa_data['earnings']

model   = sm.OLS(Y, sm.add_constant(X))
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               earnings   R-squared:                       0.784
Model:                            OLS   Adj. R-squared:                  0.784
Method:                 Least Squares   F-statistic:                     9052.
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        11:39:21   Log-Likelihood:                -53205.
No. Observations:                5000   AIC:                         1.064e+05
Df Residuals:                    4997   BIC:                         1.064e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.995e+04    223.409    223.571      0.0

In [21]:
results.params['CPA']

30347.9395615021

### Was ist der Effekt der individuellen Fähigkeiten auf das Einkommen?

**Alternative 1:** Vergleich des durchschnittlichen Einkommens von Accountants mit hohen und geringen Fähigkeiten.

In [23]:
mean_skilled    = cpa_data[cpa_data['skill'] == 1]['earnings'].mean()
mean_unskilled  = cpa_data[cpa_data['skill'] == 0]['earnings'].mean()

mean_skilled, mean_unskilled, mean_skilled-mean_unskilled

(85726.664016, 59161.47784400001, 26565.186171999987)

**Alternative 2:** Berechnung eines linearen Regressionsmodells.

In [25]:
import statsmodels.api as sm

X = cpa_data['skill']
Y = cpa_data['earnings']

model   = sm.OLS(Y, sm.add_constant(X))
results = model.fit()

print(results.summary())

<statsmodels.regression.linear_model.OLS object at 0x0000024D5ED39190>
                            OLS Regression Results                            
Dep. Variable:               earnings   R-squared:                       0.373
Model:                            OLS   Adj. R-squared:                  0.373
Method:                 Least Squares   F-statistic:                     2971.
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        11:43:06   Log-Likelihood:                -55866.
No. Observations:                5000   AIC:                         1.117e+05
Df Residuals:                    4998   BIC:                         1.117e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------

In [24]:
results.params['CPA']

30347.9395615021

### Was ist der direkte Effekt der individuellen Fähigkeiten auf das Einkommen?

**Alternative 1:** Vergleich des durchschnittlichen Einkommens von Accountants mit hohen versus geringen Fähigkeiten, die den gleiche Abschluss besitzen. Der direkte Effekt gibt sich dann als eine gewichtete Summe der Mittelwert-Differenzen.

In [21]:
# Vergleich der Accountants mit WP (`CPA==1`)
mean_skilled_cpa        = cpa_data[(cpa_data['skill'] == 1) & (cpa_data['CPA'] == 1)]['earnings'].mean()
mean_skilled_non_cpa    = cpa_data[(cpa_data['skill'] == 0) & (cpa_data['CPA'] == 1)]['earnings'].mean()
weight_cpa              = len(cpa_data[cpa_data['CPA'] == 1]) / len(cpa_data)

# Vergleich der Accountants ohne WP (`CPA==0`)
mean_unskilled_cpa      = cpa_data[(cpa_data['skill'] == 1) & (cpa_data['CPA'] == 0)]['earnings'].mean()
mean_unskilled_non_cpa  = cpa_data[(cpa_data['skill'] == 0) & (cpa_data['CPA'] == 0)]['earnings'].mean()
weight_non_cpa          = len(cpa_data[cpa_data['CPA'] == 0]) / len(cpa_data)

# Ermittlung des kausalen Effektes durch Gewichtung der Gruppen 
(mean_skilled_cpa - mean_skilled_non_cpa) * weight_cpa + \
    (mean_unskilled_cpa - mean_unskilled_non_cpa) * weight_non_cpa

14595.935755361545

**Alternative 2:** Berechnung eines multiplen linearen Regressionsmodells (unter Berücksichtigung von Kontrollvariablen).

In [20]:
import statsmodels.api as sm

X = cpa_data[['skill', 'CPA']]
Y = cpa_data['earnings']

model   = sm.OLS(Y, sm.add_constant(X))
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               earnings   R-squared:                       0.784
Model:                            OLS   Adj. R-squared:                  0.784
Method:                 Least Squares   F-statistic:                     9052.
Date:                Wed, 06 Apr 2022   Prob (F-statistic):               0.00
Time:                        13:58:30   Log-Likelihood:                -53205.
No. Observations:                5000   AIC:                         1.064e+05
Df Residuals:                    4997   BIC:                         1.064e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.995e+04    223.409    223.571      0.0