In [1]:
import pandas as pd

In [2]:
df2 = pd.read_stata("../00_source_data/1415_sec_c.dta", convert_categoricals=False)

In [3]:
df2["hhcode"] = df2["hhcode"].astype("str")

# cleaning admitted & enrolled
df2["ever_admitted"] = "no"
df2.loc[(df2["scq03"] == 1), "ever_admitted"] = "yes"

df2["currently_enrolled"] = "no"
df2.loc[(df2["scq05"] == 1), "currently_enrolled"] = "yes"

df2["max_level_achieved"] = df2["scq04"]

In [4]:
# Reduce to the questions of interest (4 questions):
df2_rel = df2[
    [
        "hhcode",
        "idc",
        "ever_admitted",
        "currently_enrolled",
        "region",
        "province"
    ]
].copy()

In [5]:
df20 = pd.read_stata("../00_source_data/1415_sec_b.dta")

In [6]:
df20.sample(10)

Unnamed: 0,hhcode,psu,province,region,district,sec,idc,sbq02,sbq03,sbq04,sbq05,sbq61,sbq62,sbq63,age,sbq07,sbq08,sbq09,sbq10,sbq11
413718,3411000000.0,34110002,sindh,rural,sanghar,00B,1,head,,male,present,1986,0,0,28,currently married,2.0,99,99,yes
435523,3522100000.0,35221001,sindh,urban,karachi,00B,2,spouse,main economic provider,female,present,1971,0,0,43,currently married,1.0,98,99,yes
399829,3371001000.0,33710005,sindh,rural,badin,00B,3,son/daughter,main economic provider,female,present,2014,12,7,0,unmarried/never married,,1,2,yes
131592,2241003000.0,22410026,punjab,rural,mianwali,00B,4,son/daughter-in-law,family elder,female,present,1993,4,7,21,currently married,3.0,99,99,yes
29635,1172000000.0,11720001,kp,urban,malakand,00B,3,son/daughter,main economic provider,female,present,2006,0,0,8,unmarried/never married,,1,2,yes
474944,4332000000.0,43320001,balochistan,urban,ziarat,00B,3,son/daughter,main economic provider,female,present,2002,0,0,12,unmarried/never married,,1,2,yes
120460,2221005000.0,22210051,punjab,rural,bhakkar,00B,4,son/daughter,main economic provider,male,present,2011,11,14,3,unmarried/never married,,1,2,yes
442201,4121002000.0,41210019,balochistan,rural,pishin,00B,2,spouse,main economic provider,female,present,1962,0,0,52,currently married,1.0,98,98,yes
245094,2741007000.0,27410069,punjab,rural,khanewal,00B,5,son/daughter,family elder,male,temporarily absent at the time of enumeration,1995,0,0,19,unmarried/never married,,1,2,yes
400008,3371001000.0,33710006,sindh,rural,badin,00B,5,son/daughter,main economic provider,male,present,2007,0,0,7,unmarried/never married,,1,2,yes


In [7]:
df20["hhcode"] = df20["hhcode"].astype("str")

df20['sex'] = df20['sbq04']
df20['marital_status'] = df20['sbq07']
df20['subprovince'] = df20['district']

In [8]:
df20_rel = df20[['hhcode', 'age','idc', 'sex', 'marital_status', 'subprovince']].copy()

In [9]:
years14_15 = pd.merge(df2_rel, df20_rel, on=["hhcode", "idc"], how='inner', indicator=False)

In [10]:
years14_15 = years14_15.loc[(years14_15["age"] > 4)&(years14_15["age"] <= 10)]

years14_15.loc[(years14_15["region"] == 1), 'region'] = 'urban'
years14_15.loc[(years14_15["region"] == 2), 'region'] = 'rural'

years14_15["subprovince code"] = years14_15["hhcode"].apply(lambda x: x[0:4])
years14_15["subprovince code"] = years14_15["subprovince code"].astype("int")

years14_15["province"] = years14_15["province"].astype("int")

years14_15["province"] = years14_15["province"].map(
    {1: "K.P.K", 2: "Punjab", 3: "Sindh", 4: "Balochistan"}
)

# modifying marital status to align with desired output
years14_15["marital_status"].replace(
    [
        "unmarried/never married",
        "currently married",
        "widow/widower",
        "divorced",
        "nikah has been solemnised but the rukhsati has  not taken place",
    ],
    [1, 2, 3, 4, 5],
    inplace=True,
)

years14_15.subprovince = years14_15.subprovince.str.capitalize()

years14_15['year'] = 2014

In [11]:
years14_15.sample(10)

Unnamed: 0,hhcode,idc,ever_admitted,currently_enrolled,region,province,age,sex,marital_status,subprovince,subprovince code,year
295098,3121004208.0,3,no,no,urban,Sindh,7,male,1,Kashmore,3121,2014
243507,2841003511.0,4,yes,yes,urban,Punjab,10,female,1,Muzaffargarh,2841,2014
391333,4111000409.0,3,yes,yes,urban,Balochistan,10,male,1,Quetta,4111,2014
86996,2111002505.0,4,yes,yes,urban,Punjab,10,male,1,Attock,2111,2014
456140,4622000311.0,4,yes,yes,rural,Balochistan,8,male,1,Gwadar,4622,2014
302219,3132000903.0,3,yes,yes,rural,Sindh,8,female,1,Shikarpur,3132,2014
287789,3111003615.0,9,no,no,urban,Sindh,7,male,1,Jacobabad,3111,2014
322828,3231000111.0,9,no,no,urban,Sindh,6,female,1,Khairpur,3231,2014
287199,3111003102.0,10,no,no,urban,Sindh,6,female,1,Jacobabad,3111,2014
220767,2742000507.0,5,yes,yes,rural,Punjab,10,female,1,Khanewal,2742,2014


In [12]:
# PRE-PROCESSING FOR DIFF-IN-DIFF DATA
# convert currently_enrolled from string to integer
years14_15["currently_enrolled"].replace("yes", 1, inplace=True)
years14_15["currently_enrolled"].replace("no", 0, inplace=True)
# check region for anomalies
#years07_08_f["region"].replace(3, "urban", inplace=True)
# aggregate dataset for sample population
df_grp_1 = years14_15.groupby(["sex","subprovince","region"])["currently_enrolled"].count().reset_index()
# aggregate dataset for enrollment total
df_grp_2 = years14_15.groupby(["sex","subprovince","region"])["currently_enrolled"].sum().reset_index()
# merge data set
df_grp_merge = pd.merge(df_grp_1, df_grp_2, on=["sex","subprovince","region"], indicator=True)
# check merge
df_grp_merge._merge.value_counts()

both          456
right_only      0
left_only       0
Name: _merge, dtype: int64

In [13]:
df_grp_merge.columns

Index(['sex', 'subprovince', 'region', 'currently_enrolled_x',
       'currently_enrolled_y', '_merge'],
      dtype='object')

In [14]:
# compute enrollment_rate
df_grp_merge["rate_enrollment"] = df_grp_merge["currently_enrolled_y"]/df_grp_merge["currently_enrolled_x"]
# rename columns
df_grp_merge = df_grp_merge.rename(columns={"currently_enrolled_x":"sample_population", "currently_enrolled_y":"enrolled_total"})
# drop "_merge" column
df_grp_merge.drop(["_merge"], axis=1, inplace=True)

In [15]:
df_grp_merge.sample(5)

Unnamed: 0,sex,subprovince,region,sample_population,enrolled_total,rate_enrollment
208,male,Tank,rural,57,42.0,0.736842
346,female,Loralai,rural,37,20.0,0.540541
138,male,Multan,rural,223,200.0,0.896861
94,male,Khuzdar,rural,21,21.0,1.0
349,female,Lower dir,urban,419,306.0,0.73031


In [16]:
# save clean difference-in-difference data set
df_grp_merge.to_csv("../20_analysis/enrollment_clean/years14_15_merge_diff.csv")

In [17]:
#df_grp_merge.isna().any() # DO THIS BEFORE SUBMITTING THE CSV

In [18]:
years14_15.to_csv("clean_rashaad/years14_15.csv")