In [1]:
#importing libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.formula.api as smf

__Q1: Descriptive analysis__

__Q1.1: 1.1 Summary statistics__

In [2]:
#Read the data
data = pd.read_csv('progresa-sample.csv.bz2')

In [3]:
#Checking all the columns of the data
data.columns

Index(['year', 'sex', 'indig', 'dist_sec', 'sc', 'grc', 'fam_n', 'min_dist',
       'dist_cap', 'poor', 'progresa', 'hohedu', 'hohwag', 'welfare_index',
       'hohsex', 'hohage', 'age', 'village', 'folnum', 'grc97', 'sc97'],
      dtype='object')

In [4]:
#Validating the data
data.head()

Unnamed: 0,year,sex,indig,dist_sec,sc,grc,fam_n,min_dist,dist_cap,poor,...,hohedu,hohwag,welfare_index,hohsex,hohage,age,village,folnum,grc97,sc97
0,97,0.0,0.0,4.473,1.0,7.0,7,21.168384,21.168384,pobre,...,6,0.0,583.0,1.0,35.0,13,163,1,7,1.0
1,98,0.0,0.0,4.473,1.0,8.0,7,21.168384,21.168384,pobre,...,6,0.0,583.0,1.0,35.0,14,163,1,7,1.0
2,97,1.0,0.0,4.473,1.0,6.0,7,21.168384,21.168384,pobre,...,6,0.0,583.0,1.0,35.0,12,163,2,6,1.0
3,98,1.0,0.0,4.473,1.0,7.0,7,21.168384,21.168384,pobre,...,6,0.0,583.0,1.0,35.0,13,163,2,6,1.0
4,97,0.0,0.0,4.473,1.0,2.0,7,21.168384,21.168384,pobre,...,6,0.0,583.0,1.0,35.0,8,163,3,2,1.0


In [5]:
#Verifying the shape
data.shape

(77250, 21)

In [6]:
#Recoding the progresa variable
data['progresa'] = np.where(data['progresa'] == 'basal' ,1.0,0.0) 

In [7]:
#Checking the validity of the variable
data['progresa'].nunique()

2

In [8]:
#Dropping the null variables
data = data.dropna()

In [9]:
data['progresa'].nunique()

2

In [10]:
#Shape after dropping the null variables.
data.shape

(67122, 21)

_There were totally 10128 rows affected by null values_

In [11]:
#Putting all the necessary variables in the data to check the mean, median etc
data2 = pd.DataFrame(data[['sex', 'indig', 'dist_sec','sc','grc','fam_n', 'min_dist',
       'dist_cap', 'poor', 'progresa', 'hohedu', 'hohwag', 'welfare_index',
       'hohsex', 'hohage', 'age', 'grc97', 'sc97']].describe())

In [12]:
#Validating the data
data2.head()

Unnamed: 0,sex,indig,dist_sec,sc,grc,fam_n,min_dist,dist_cap,progresa,hohedu,hohwag,welfare_index,hohsex,hohage,age,grc97,sc97
count,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0
mean,0.513572,0.291305,2.34699,0.821102,4.089494,7.224666,103.595249,148.747707,0.618545,2.853908,592.782484,690.196267,0.927744,44.262224,11.141369,3.762656,0.839263
std,0.499819,0.454367,2.111281,0.38327,2.447608,2.329421,42.025512,77.134043,0.485747,2.655106,791.378223,139.489218,0.258914,11.457772,2.995374,2.449568,0.367291
min,0.0,0.0,0.0,0.0,0.0,1.0,9.465392,9.465392,0.0,0.0,0.0,180.0,0.0,15.0,6.0,0.0,0.0
25%,0.0,0.0,0.289,1.0,2.0,6.0,70.719487,91.69062,0.0,0.0,160.0,597.0,1.0,36.0,9.0,2.0,1.0


In [13]:
#Resetting index
data2 = data2.reset_index()

In [14]:
#Validating the data
data2.head()

Unnamed: 0,index,sex,indig,dist_sec,sc,grc,fam_n,min_dist,dist_cap,progresa,hohedu,hohwag,welfare_index,hohsex,hohage,age,grc97,sc97
0,count,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0,67122.0
1,mean,0.513572,0.291305,2.34699,0.821102,4.089494,7.224666,103.595249,148.747707,0.618545,2.853908,592.782484,690.196267,0.927744,44.262224,11.141369,3.762656,0.839263
2,std,0.499819,0.454367,2.111281,0.38327,2.447608,2.329421,42.025512,77.134043,0.485747,2.655106,791.378223,139.489218,0.258914,11.457772,2.995374,2.449568,0.367291
3,min,0.0,0.0,0.0,0.0,0.0,1.0,9.465392,9.465392,0.0,0.0,0.0,180.0,0.0,15.0,6.0,0.0,0.0
4,25%,0.0,0.0,0.289,1.0,2.0,6.0,70.719487,91.69062,0.0,0.0,160.0,597.0,1.0,36.0,9.0,2.0,1.0


In [15]:
#Pivoting the table
data2.pivot_table(columns=['index'])

index,25%,50%,75%,count,max,mean,min,std
age,9.0,11.0,14.0,67122.0,17.0,11.141369,6.0,2.995374
dist_cap,91.69062,132.540597,188.574177,67122.0,359.774457,148.747707,9.465392,77.134043
dist_sec,0.289,2.26,3.539,67122.0,14.879,2.34699,0.0,2.111281
fam_n,6.0,7.0,9.0,67122.0,24.0,7.224666,1.0,2.329421
grc,2.0,4.0,6.0,67122.0,14.0,4.089494,0.0,2.447608
grc97,2.0,4.0,6.0,67122.0,14.0,3.762656,0.0,2.449568
hohage,36.0,42.0,51.0,67122.0,98.0,44.262224,15.0,11.457772
hohedu,0.0,3.0,4.0,67122.0,20.0,2.853908,0.0,2.655106
hohsex,1.0,1.0,1.0,67122.0,1.0,0.927744,0.0,0.258914
hohwag,160.0,500.0,750.0,67122.0,14000.0,592.782484,0.0,791.378223


_All the variables are displayed in a neat tabular format_

_The variables appear in alphabetic order_

__1.2 : Differences at baseline?__

In [16]:
from scipy import stats
from scipy.stats import t

__Q1.2.1 and Q1.2.2__

In [17]:
#Reading the data again to make changes to the data and not copying it to the original
data2 = pd.read_csv('progresa-sample.csv.bz2')

#Mapping poor and progresa
data2['poor'] = data2['poor'].map({'pobre': 1, 'no pobre': 0})
data2['progresa'] = data2['progresa'].map({'basal' : 1, '0':0})

In [18]:
# Segregating into treatment and Control data
treatment_97 = pd.DataFrame(data2[(data2.year == 97) & (data2.poor == 1) & (data2.progresa == 1)])
control_97 = pd.DataFrame(data2[(data2.year == 97) & (data2.poor == 1) & (data2.progresa == 0)])

In [19]:
# Selecting rows where poor=1 and the year=97, and then grouping by 'progresa' column
new_data = data2[(data2.year == 97) & (data2.poor == 1)].groupby('progresa').mean()
new_data.drop(new_data.columns[[0,9,16,17]], axis =1,inplace=True)

new_data = new_data.transpose()

In [20]:
# swapping columns to match the structure of the required table
new_data = new_data[[1.0,0.0]]

# Resetting Index
new_data.reset_index(level=0, inplace=True)
new_data.rename(columns={'index' : 'Variable name', 0: 'Average value (Control villages)', 1: 'Average value (Treatment villages)'}, inplace=True)

# List of all Variables
var_list = list(new_data['Variable name'])

# Calculating T test for the Treatment, Control
tt = list(stats.ttest_ind(treatment_97[var_list], control_97[var_list], nan_policy='omit'))

# Adding the remaining two columns.
new_data['Difference (Treat - Control)'] = tt[0]
new_data['p-value'] = tt[1]

# for a better look at the insignificant data with respect to the value of p
new_data['p<0.05'] = new_data['p-value'] < 0.05

new_data.sort_values('Variable name')

progresa,Variable name,Average value (Treatment villages),Average value (Control villages),Difference (Treat - Control),p-value,p<0.05
13,age,10.716991,10.742023,-0.70863,0.4785594,False
7,dist_cap,150.829074,153.76973,-3.339081,0.0008415005,True
2,dist_sec,2.453122,2.507662,-2.100433,0.03569843,True
5,fam_n,7.281327,7.302469,-0.794167,0.4271039,False
4,grc,3.531599,3.54305,-0.400196,0.6890151,False
14,grc97,3.531599,3.54305,-0.400196,0.6890151,False
12,hohage,43.648828,44.276918,-4.775962,1.796243e-06,True
8,hohedu,2.663139,2.590348,2.541229,0.01105093,True
11,hohsex,0.924656,0.922947,0.566312,0.5711858,False
9,hohwag,544.339544,573.163558,-3.594588,0.0003253835,True


_Displayed above are all the means differences and p-values with Variable name in ascending order_

__Q1.2.3, Q1.2.4, Q1.2.5__

In [21]:
new_data[new_data['p<0.05']==True].sort_values('Variable name')

progresa,Variable name,Average value (Treatment villages),Average value (Control villages),Difference (Treat - Control),p-value,p<0.05
7,dist_cap,150.829074,153.76973,-3.339081,0.0008415005,True
2,dist_sec,2.453122,2.507662,-2.100433,0.03569843,True
12,hohage,43.648828,44.276918,-4.775962,1.796243e-06,True
8,hohedu,2.663139,2.590348,2.541229,0.01105093,True
9,hohwag,544.339544,573.163558,-3.594588,0.0003253835,True
6,min_dist,107.152915,103.237854,8.206584,2.358312e-16,True
0,sex,0.519317,0.505052,2.506686,0.01219172,True
10,welfare_index,655.428377,659.5791,-3.188594,0.001431016,True


_Q1.2.3 There are 8 variables shown above which are statistically significant between the control and the treatment variables_

_They are namely: dist_cap, dist_sec, hohage, hohwag, min_dist, sex, welfare_index_

_Q1.2.4 It matters that there are baseline differences because if the differences are too large, we can say that the data is biased thus making the causality of the progresa program weak_

_However, the differences are not too large for the statistically significant variables_

_Q1.2.5 The measurement of the impact cannot be acurately measured by the baseline differences alone, we need to explore the linear relationship between the control and treatments variables to correctly identify the impact of the measurement._

__Q2: Measuring Impact__

In [22]:
#Checking the shape
data.shape

(67122, 21)

In [23]:
#Taking only the poor data and creating a variable called after
newdata = data[data['poor'] == 'pobre']
newdata['after'] = np.where(newdata['year']==98, True, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
#Validating the data
newdata.head()

Unnamed: 0,year,sex,indig,dist_sec,sc,grc,fam_n,min_dist,dist_cap,poor,...,hohwag,welfare_index,hohsex,hohage,age,village,folnum,grc97,sc97,after
0,97,0.0,0.0,4.473,1.0,7.0,7,21.168384,21.168384,pobre,...,0.0,583.0,1.0,35.0,13,163,1,7,1.0,False
1,98,0.0,0.0,4.473,1.0,8.0,7,21.168384,21.168384,pobre,...,0.0,583.0,1.0,35.0,14,163,1,7,1.0,True
2,97,1.0,0.0,4.473,1.0,6.0,7,21.168384,21.168384,pobre,...,0.0,583.0,1.0,35.0,12,163,2,6,1.0,False
3,98,1.0,0.0,4.473,1.0,7.0,7,21.168384,21.168384,pobre,...,0.0,583.0,1.0,35.0,13,163,2,6,1.0,True
4,97,0.0,0.0,4.473,1.0,2.0,7,21.168384,21.168384,pobre,...,0.0,583.0,1.0,35.0,8,163,3,2,1.0,False


In [25]:
#Checking the shape
newdata.shape

(56893, 22)

In [26]:
#Dropping NAs
newdata = newdata.dropna()

In [27]:
#Describing the data
newdata.after.describe()

count     56893
unique        2
top       False
freq      30738
Name: after, dtype: object

__Q2.1.1 compute the estimator by just comparing the average schooling rates for these villages.__

In [28]:
#Average schooling rates
newdata[newdata.progresa==1.0].groupby('after').sc.mean()

after
False    0.822697
True     0.849257
Name: sc, dtype: float64

_Displayed the average schooling rates:_

_The average schooling rate before 1998 was 82.2%  and the average schooling rate after 1998 was 84.92%_

__Q2.1.2 now re-compute the estimator using linear regression, and individual schooling rates. Do not include other regressors.__

In [29]:
#Linear model on sc and after
m = smf.ols(formula = 'sc~after', data=newdata[newdata.progresa == 1.0])
m.fit().summary()

0,1,2,3
Dep. Variable:,sc,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,45.02
Date:,"Tue, 24 Mar 2020",Prob (F-statistic):,1.98e-11
Time:,10:49:14,Log-Likelihood:,-15111.0
No. Observations:,35355,AIC:,30230.0
Df Residuals:,35353,BIC:,30240.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8227,0.003,306.203,0.000,0.817,0.828
after[T.True],0.0266,0.004,6.709,0.000,0.019,0.034

0,1,2,3
Omnibus:,10153.494,Durbin-Watson:,1.388
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21422.127
Skew:,-1.801,Prob(JB):,0.0
Kurtosis:,4.252,Cond. No.,2.54


_Above displayed is the linear regression model of the school rates with the after variable_

_We can see that the estimate increases by 0.0266 when the after is true. The value for After=True is statistically significant_

_Therefore, The average schooling rate before 1998 was 82.27%  and the average schooling rate after 1998 was 84.87% which is comparable to the mean model_

__Q2.1.3 finally, estimate a multiple regression model that includes other covariates.__

In [30]:
#Multiple regression model on sc and other covariates
m = smf.ols(formula = 'sc~after + dist_sec + sex + min_dist + dist_cap + hohedu', data=newdata[newdata.progresa == 1.0])
m.fit().summary()
#r.summary()

0,1,2,3
Dep. Variable:,sc,R-squared:,0.022
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,134.5
Date:,"Tue, 24 Mar 2020",Prob (F-statistic):,4.77e-169
Time:,10:49:16,Log-Likelihood:,-14734.0
No. Observations:,35355,AIC:,29480.0
Df Residuals:,35348,BIC:,29540.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7147,0.007,101.363,0.000,0.701,0.728
after[T.True],0.0255,0.004,6.499,0.000,0.018,0.033
dist_sec,-0.0068,0.001,-7.042,0.000,-0.009,-0.005
sex,0.0273,0.004,6.981,0.000,0.020,0.035
min_dist,0.0003,6.47e-05,4.537,0.000,0.000,0.000
dist_cap,0.0003,3.53e-05,7.259,0.000,0.000,0.000
hohedu,0.0146,0.001,18.704,0.000,0.013,0.016

0,1,2,3
Omnibus:,9695.85,Durbin-Watson:,1.408
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19839.8
Skew:,-1.739,Prob(JB):,0.0
Kurtosis:,4.174,Cond. No.,755.0


_In the multiple regression model we can see that after=True is	0.0255 which means that the individual schooling rates have increased by 0.0255 after 1998. This variable is slightly smaller as compared to the linear model without covariates._

_All the variables are statistically significant in the above model with p-value = 0.00. The t-value is the highest for the progresa variable. It seems to be much more significant compared to others_

_Therefore, The average schooling rate before 1998 was 71.47%  and the average schooling rate after 1998 was 73.97% which is comparable to the mean model_

__compare all the estimators. Are your estimates statistically significant? What do they suggest
about the efficacy of the progresa__

_The average enrollment rate before and after 1998 compares as follows:_

                    Before               After
                  
     Simple mean     82.2%               84.92%

    Linear model     82.27%              84.87%

    Multiple model   71.47%              73.97%

_In all the cases, we see that the enrollment rate has increased with time having the progresa effect. The simple mean model and the linear model are almost comparable. The multiple model is quite lesser because of the effect of all the variables._ 

_In all three cases, the values for After are statistically significant._

__2.2: Cross-sectional estimator__

In [31]:
#Validating the data
data.shape

(67122, 21)

In [32]:
#Selecting only the poor households
newdata = data[data['poor'] == 'pobre']
#Dropping NAs
newdata = newdata.dropna()
#Making an after variable
newdata['after'] = np.where(newdata['year']==98, True, False)
#Describing the after variable
newdata.after.describe()

count     56893
unique        2
top       False
freq      30738
Name: after, dtype: object

__Begin by estimating the impact of Progresa by compring the average enrollment rate among
poor households in the treatment villages and the average enrollment rate among poor households
in the control villages. What do you find?__

In [33]:
#Getting the average enrollment rate for treatment and control villages
newdata[newdata['after']==True].groupby('progresa').sc.mean()

progresa
0.0    0.810923
1.0    0.849257
Name: sc, dtype: float64

_We find that the average enrollment rate for the treatment villages(Progresa=1) is 84% whereas the average enrollment rate for control villages(Progresa=0) is 81%_

__Now repeat the estimator using simple regression.__

In [34]:
#Making a linear model
m = smf.ols(formula = 'sc ~ progresa', data=newdata)
r = m.fit()
r.summary()

0,1,2,3
Dep. Variable:,sc,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,44.28
Date:,"Tue, 24 Mar 2020",Prob (F-statistic):,2.87e-11
Time:,10:49:20,Log-Likelihood:,-25434.0
No. Observations:,56893,AIC:,50870.0
Df Residuals:,56891,BIC:,50890.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8132,0.003,315.401,0.000,0.808,0.818
progresa,0.0218,0.003,6.655,0.000,0.015,0.028

0,1,2,3
Omnibus:,15154.174,Durbin-Watson:,1.388
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30457.736
Skew:,-1.724,Prob(JB):,0.0
Kurtosis:,3.978,Cond. No.,3.01


_From the above results, we can see that with the effect of progresa the average enrollment rate increased by 0.0218 that is 2.1%_

_Without the effect of progresa, the enrollment rate is 81.32%. Therefore with the effect of progresa, the enrollment rate is 83.42%_

__Third, use multiple regression to get the same estimate.__

In [35]:
#Using multiple regression model
m = smf.ols(formula = 'sc ~ progresa + dist_sec + sex + min_dist + dist_cap + hohedu', data=newdata)
r = m.fit()
r.summary()

0,1,2,3
Dep. Variable:,sc,R-squared:,0.022
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,210.1
Date:,"Tue, 24 Mar 2020",Prob (F-statistic):,3.43e-266
Time:,10:49:21,Log-Likelihood:,-24832.0
No. Observations:,56893,AIC:,49680.0
Df Residuals:,56886,BIC:,49740.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7041,0.006,126.469,0.000,0.693,0.715
progresa,0.0193,0.003,5.950,0.000,0.013,0.026
dist_sec,-0.0066,0.001,-8.970,0.000,-0.008,-0.005
sex,0.0283,0.003,9.018,0.000,0.022,0.034
min_dist,0.0004,5.17e-05,7.263,0.000,0.000,0.000
dist_cap,0.0002,2.78e-05,7.342,0.000,0.000,0.000
hohedu,0.0150,0.001,23.815,0.000,0.014,0.016

0,1,2,3
Omnibus:,14470.751,Durbin-Watson:,1.408
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28273.425
Skew:,-1.665,Prob(JB):,0.0
Kurtosis:,3.915,Cond. No.,753.0


_In the multiple regression model, we can see that the impact of progresa is slightly lesser as compared to the linear model. The enrollment rate is increasing by 1.9% here compared to the 2.1% in the linear model._

_Without the effect of progresa or any other factors, the average enrollment according to the model above is 70.41%. 
Therefore, the enrollment rate for treatment villages is 72.34% with progresa_

_Many factors are afftecting this model there by diminishing the effect of progresa_


__Finally, as above, compare your three estimators. What do you find? Are the effects statisti-
cally significant?__

_The average enrollment rate for the treatment and control villages compares as follows:_

                  Treatment               Control
                  
     Simple mean     84%                   81%

    Linear model    83.5%                 81.32%

    Multiple model  72.34%                70.41%

_In all the cases, we see that the enrollment rate has increased with progresa. The simple mean model and the linear model are almost comparable. The multiple model is slightly lesser because of the effect of all the variables._ 

_In all three cases, the values for progresa are statistically significant_

__2.3: Differences-in-differences estimator__

In [36]:
#Validating the data
data.shape

(67122, 21)

In [37]:
#Selecting only the poor households
newdata = data[data['poor'] == 'pobre']
#Dropping NAs
newdata = newdata.dropna()
#Making an after variable
newdata['after'] = np.where(newdata['year']==98, True, False)
#Describing the after variable
newdata.after.describe()

count     56893
unique        2
top       False
freq      30738
Name: after, dtype: object

__Start with the simple table. However, DiD requires 4-way comparison. So compare the average
enrollment rate among poor households in the treatment villages and the average enrollment rate
among poor households in the control villages, both 1997 and 1998. What do you find?__

In [38]:
#Displaying the diff table
newdata.groupby(['progresa', 'after'], as_index = False).sc.mean()

Unnamed: 0,progresa,after,sc
0,0.0,False,0.815066
1,0.0,True,0.810923
2,1.0,False,0.822697
3,1.0,True,0.849257


_We found that:_

_The difference for the Control Sample(Progresa = 0) is:_

0.815066 - 0.810923 = 0.004143

_The difference for the Treatment Sameple(Progresa = 1) is:_

0.822697 - 0.849257 = -0.02656

_The estimate of the impact(Diff-in-Diff) is:_

0.004143 - (-0.02656) = 0.030703

_The difference for 97 is:_

0.815066 - 0.822697 = -0.007631

_The difference for 98 is:_

0.810923 - 0.849257 = -0.038334

_The estimate of the impact(Diff-in-Diff) is:_

-0.007631 - (-0.038334) = 0.030703


_This indicates that the increase in the enrollment rate can be credited to the progresa treatment_

_Without the progresa and time in place, the enrollment rate is 81.5% and with both of them it is, 84.57%

_In both cases, the estimate of the impact is 0.030703_

__2.3.2:Now repeat the estimator using simple regression.__

In [39]:
#Cross linear model of progresa and after
m = smf.ols(formula = 'sc ~ progresa * after', data=newdata)
r = m.fit()
r.summary()

0,1,2,3
Dep. Variable:,sc,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,29.42
Date:,"Tue, 24 Mar 2020",Prob (F-statistic):,5.32e-19
Time:,10:49:26,Log-Likelihood:,-25412.0
No. Observations:,56893,AIC:,50830.0
Df Residuals:,56889,BIC:,50870.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8151,0.004,232.782,0.000,0.808,0.822
after[T.True],-0.0041,0.005,-0.801,0.423,-0.014,0.006
progresa,0.0076,0.004,1.717,0.086,-0.001,0.016
progresa:after[T.True],0.0307,0.007,4.680,0.000,0.018,0.044

0,1,2,3
Omnibus:,15130.097,Durbin-Watson:,1.392
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30379.093
Skew:,-1.722,Prob(JB):,0.0
Kurtosis:,3.976,Cond. No.,7.63


_The area of interest in the above regression model is progresa:after[T:True] whose coefficient is 0.0307 which is ths same as compared to the diff-in-diff calculated in the tabular format._

_From the above table:_

_The enrollment rate without the effect of progresa or time: 81.51%_

_The enrollment rate with progresa, without the effect of time: 82.27%_

_The enrollment rate in 1998, without the effect of progresa: 80.75_

_The enrollment rate with the effect of progressa and in 1998: 84.58%_

_Therefore, the enrollment is the highest for effect of progresa and 1998. These values are comparable to the tabular diff-in-diff values_

_We can say that estimate through difference in difference approach are more accurate than simple difference because control and treatment variables are considered together with the introduction of interaction term where as they are considered in silos for simple difference method._

__And as above, use multiple regression to get the same estimate.__

In [40]:
#Using multiple regression model
m = smf.ols(formula = 'sc ~ progresa * after + dist_sec + sex + min_dist + dist_cap + hohedu', data=newdata)
r = m.fit()
r.summary()

0,1,2,3
Dep. Variable:,sc,R-squared:,0.022
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,162.9
Date:,"Tue, 24 Mar 2020",Prob (F-statistic):,6.910000000000001e-273
Time:,10:49:27,Log-Likelihood:,-24812.0
No. Observations:,56893,AIC:,49640.0
Df Residuals:,56884,BIC:,49720.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7067,0.006,117.044,0.000,0.695,0.719
after[T.True],-0.0053,0.005,-1.029,0.303,-0.015,0.005
progresa,0.0052,0.004,1.176,0.240,-0.003,0.014
progresa:after[T.True],0.0307,0.006,4.735,0.000,0.018,0.043
dist_sec,-0.0066,0.001,-8.975,0.000,-0.008,-0.005
sex,0.0283,0.003,9.022,0.000,0.022,0.034
min_dist,0.0004,5.17e-05,7.269,0.000,0.000,0.000
dist_cap,0.0002,2.77e-05,7.320,0.000,0.000,0.000
hohedu,0.0150,0.001,23.786,0.000,0.014,0.016

0,1,2,3
Omnibus:,14446.583,Durbin-Watson:,1.412
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28197.959
Skew:,-1.663,Prob(JB):,0.0
Kurtosis:,3.913,Cond. No.,1170.0


_From the above table:_

_The enrollment rate without the effect of progresa or time: 70.67%_

_The enrollment rate with progresa, without the effect of time: 71.19%_

_The enrollment rate in 1998, without the effect of progresa: 70.14_

_The enrollment rate with the effect of progressa and in 1998: 73.74%_

_Therefore, the enrollment is the highest for effect of progresa and 1998. These values are quite lesser compared to the linear model and the tabular model._



__Finally, as above, compare your three estimators. What do you find? Are the effects statistically significant?__

_The average enrollment rate with progresa and time in diff-in-diff models compares as follows:_

                          Treatment               Control
                  
     Simple diff-in-diff     81.5%                   84.57%

     Linear model            81.51%                  84.58%

     Multiple model          70.68%                  73.74%

_In all the cases, we see that the enrollment rate has increased with progresa and time. The simple diff-in-diff model and the linear model are almost comparable. The multiple model is slightly lesser because of the effect of all the variables._ 

_In the linear and the multiple model, the after and progresa variable without affecting each other are not statistically significant_

_Whereas, after&1998 are statistically significant in both the cases_

__Q 2.4 Compare the estimators__

__List the identifying assumptions (counterfactual assumptions) behind all three models. Which ones do you find more/less plausible?__

_Counterfactual assumptions for all three models are as follows:_

_1. Before-and-after: The counterfactual argument would be that the avg. enrollment would be the same in 1997 and 1998 with or without the effect of progresa. Time will not be taken into consideration._

_2. Cross sectional estimator: The counterfactual argument would be that avg. enrollment would be the same with or without the effect of progresa. Therefore, The average difference in outcomes between Treated and Control group is solely due to the treatment and no other factor._

_3. Diff-in-Diff: The counterfactual argument would be that the avg. enrollment would be the same with or without progresa. It would also remain the same for 1997 and 1998._

_The counter factual assumption of Cross sectional Estimator and Before-After Estimator is the least plausible because the avg enrollment as seen from the above model has always slightly increased with the effect of progresa. But, It is also possible that there can be other trends or confounding variables which can increase the average enrollment in schools over a period of time and the average enrollement rate of the treatment villages could actually change even if progresa program wouldn't have occurred._ 

_However, this could be verified if we had a ontrol group which is not affected by progresa to compare the effect of progresa. This comparison helps us to eliminate the confounding effect after the treatment and arrive at a real casual impact because of progresa._

__Compare the estimates of all three models. Do your analysis suggest that progresa program had a positive impact on schooling rates?__

_From the analysis I observed the following -_

	
_There was a statistical difference between treatment and control groups. The division is not completely random. Hence, there is a flaw in our baseline and it makes our further analysis less reliable._
	
_According to Before-after estimator,Cross-sectional and Differences-in-differences estimator, we observed a positive impact of progresa on the average enrollment in school. We saw the highest impact of progresa in Difference in Difference estimator and Hence, it is the most reliable method to estimate the effect of progresa. This is because Difference in Difference relaxes the underlying assumption of Before-after and cross-sectional estimator and accurately calculates the average enrollment rate among poor households in the treatment villages and the average enrollment rate among poor households in the control villages, both 1997 and 1998._
	


 