In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import ttest_1samp
from linearmodels import PanelOLS
from scipy.stats.mstats import zscore

In [2]:
# Dataset: read the Stata file into a DataFrame
df = pd.read_stata(filepath_or_buffer='student_test_data.dta')

# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,pupilid,schoolid,bungoma,tracking,sbm,girl,agetest,etpteacher,lowstream,stream_meanpercentile,...,r2_spellscore,r2_sentscore24,r2_letterscore24,r2_spellscore24,r2_litscore,r2_mathscoreraw,r2_additions_score,r2_substractions_score,r2_multiplications_score,r2_totalscore
count,7022.0,7022.0,7022.0,7022.0,7022.0,6995.0,6500.0,7022.0,7022.0,6462.0,...,5495.0,5495.0,5494.0,5495.0,5495.0,5489.0,5495.0,5495.0,5495.0,5489.0
mean,7831203.0,783.000977,0.241811,0.514526,0.508402,0.495354,9.274769,0.496297,0.257477,50.640678,...,3.703367,5.827371,11.951511,8.88808,9.106643,10.750592,5.112648,4.287352,1.338853,19.864565
std,1714448.0,171.44278,0.428211,0.499825,0.499965,0.500014,1.470396,0.500022,0.437275,18.528595,...,3.610104,6.967761,8.223339,8.66425,7.301884,5.204099,2.376443,2.268766,1.433964,11.312814
min,4301001.0,430.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,23.811319,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6921029.0,692.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,26.941175,...,0.0,0.0,4.457143,0.0,2.485714,7.0,4.0,3.0,0.0,10.7
50%,7951019.0,795.0,0.0,1.0,1.0,0.0,9.0,0.0,0.0,51.094471,...,3.0,3.0,12.0,7.2,6.7,11.0,5.0,5.0,1.0,18.514286
75%,9381056.0,938.0,0.0,1.0,1.0,1.0,10.0,1.0,1.0,73.787064,...,7.0,9.6,19.542856,16.799999,16.014286,15.0,7.0,6.0,3.0,29.700001
max,10201060.0,1020.0,1.0,1.0,1.0,1.0,19.0,1.0,1.0,76.718063,...,10.0,24.0,24.0,24.0,24.0,24.0,9.0,9.0,6.0,46.099998


In [3]:
# Preview the first five rows to check column names and value formats
df.head(n=5)

Unnamed: 0,pupilid,schoolid,district,bungoma,division,zone,tracking,sbm,girl,agetest,...,r2_spellscore,r2_sentscore24,r2_letterscore24,r2_spellscore24,r2_litscore,r2_mathscoreraw,r2_additions_score,r2_substractions_score,r2_multiplications_score,r2_totalscore
0,4301001,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,,...,0.0,4.8,0.0,0.0,2.2,5.0,3.0,2.0,0.0,7.2
1,4301002,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,1.0,12.0,...,,,,,,,,,,
2,4301003,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,1.0,8.0,...,0.0,4.2,0.0,0.0,1.05,2.0,1.0,1.0,0.0,3.05
3,4301004,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,14.0,...,,,,,,,,,,
4,4301005,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,11.0,...,,,,,,,,,,


In [4]:
# 7a
# Drop rows missing any variable used below so that the cluster labels
# line up row-for-row with the estimation sample.
df_reduced = df.dropna(subset=['schoolid', 'tracking', 'totalscore'])

# OLS of the z-scored total score on the tracking indicator.
# `zscore` is resolved by the formula engine from the notebook namespace.
baseline_model = smf.ols(
    formula='zscore(totalscore) ~ C(tracking)',
    data=df_reduced,
)

# Cluster the standard errors on school id.
baseline_fit = baseline_model.fit(
    cov_type='cluster',
    cov_kwds={'groups': df_reduced['schoolid']},
)
print(baseline_fit.summary())

                            OLS Regression Results                            
Dep. Variable:     zscore(totalscore)   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     3.197
Date:                Tue, 03 Dec 2019   Prob (F-statistic):             0.0763
Time:                        18:41:16   Log-Likelihood:                -8208.9
No. Observations:                5795   AIC:                         1.642e+04
Df Residuals:                    5793   BIC:                         1.644e+04
Df Model:                           1                                         
Covariance Type:              cluster                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             -0.0710      0

In [5]:
# 7a
# NOTE(review): this label repeats the "7a" of the previous regression cell —
# confirm the intended exercise number.

# Drop rows missing any variable used below so that the cluster labels
# line up row-for-row with the estimation sample.
df_reduced = df.dropna(
    subset=[
        'schoolid', 'tracking', 'totalscore', 'agetest', 'etpteacher',
        'bottomhalf', 'bottomquarter', 'secondquarter', 'topquarter',
        'girl', 'percentile',
    ]
)

# Tracking effect with pupil controls plus tracking interactions for the
# bottom-half / quarter indicators. `a * b` expands to a + b + a:b, so the
# repeated C(tracking) main effect is only entered once.
# NOTE(review): if bottomhalf == bottomquarter + secondquarter the design is
# rank-deficient and these coefficients are not separately identified — confirm.
controls_formula = (
    'zscore(totalscore) ~ C(tracking) + agetest + C(etpteacher) + C(girl) + percentile'
    ' + C(bottomhalf) * C(tracking)'
    ' + C(bottomquarter) * C(tracking)'
    ' + C(secondquarter) * C(tracking)'
    ' + C(topquarter) * C(tracking)'
)
controls_model = smf.ols(formula=controls_formula, data=df_reduced)

# Cluster the standard errors on school id.
controls_fit = controls_model.fit(
    cov_type='cluster',
    cov_kwds={'groups': df_reduced['schoolid']},
)
print(controls_fit.summary())

                            OLS Regression Results                            
Dep. Variable:     zscore(totalscore)   R-squared:                       0.254
Model:                            OLS   Adj. R-squared:                  0.252
Method:                 Least Squares   F-statistic:                     51.85
Date:                Tue, 03 Dec 2019   Prob (F-statistic):           9.65e-41
Time:                        18:41:17   Log-Likelihood:                -6705.8
No. Observations:                5269   AIC:                         1.344e+04
Df Residuals:                    5255   BIC:                         1.353e+04
Df Model:                          13                                         
Covariance Type:              cluster                                         
                                                 coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------