In [1]:
import pandas as pd

In [2]:
# Load the 1213_sec_c Stata file. convert_categoricals=False keeps the stored
# codes rather than turning value-labelled columns into categoricals.
df0 = pd.read_stata(
    "../00_source_data/1213_sec_c.dta",
    convert_categoricals=False,
)

In [3]:
# Store the household code as text so it can be sliced and used as a merge key.
df0["hhcode"] = df0["hhcode"].astype("str")

# Derive yes/no flags from scq01 and scq05: a row is "yes" only when the
# answer code equals 1; every other value (missing included) becomes "no".
yes_no_labels = {True: "yes", False: "no"}
df0["ever_admitted"] = df0["scq01"].eq(1).map(yes_no_labels)
df0["currently_enrolled"] = df0["scq05"].eq(1).map(yes_no_labels)

In [4]:
# Keep only the identifiers, the two derived flags and the location columns.
cols_of_interest = [
    "hhcode",
    "idc",
    "ever_admitted",
    "currently_enrolled",
    "region",
    "province",
]
df0_rel = df0.loc[:, cols_of_interest].copy()

In [5]:
# Load the 1213_roster Stata file with pandas' default reader settings
# (unlike the section C load, convert_categoricals is not disabled here).
df00 = pd.read_stata(
    "../00_source_data/1213_roster.dta",
)

In [6]:
# Store the household code as text, the same way it is stored on df0, so the
# two frames can be merged on it.
df00["hhcode"] = df00["hhcode"].astype("str")

# Readable aliases for the roster columns used downstream.
df00["sex"] = df00["sbq03"]

# Recode marital-status labels to numeric codes to align with the desired
# output. The result is assigned to the column directly: calling
# replace(..., inplace=True) on df00["marital_status"] acts on a temporary
# selection, which pandas warns about and which leaves df00 unchanged once
# Copy-on-Write is active.
df00["marital_status"] = df00["sbq06"].replace(
    ["unmarried", "married", "widow", "divorced", "nikkah"],
    [1, 2, 3, 4, 5],
)

df00["subprovince"] = df00["district"]

In [7]:
# Roster columns needed for the merge and the later analysis.
roster_cols = ["hhcode", "age", "idc", "sex", "marital_status", "subprovince"]
df00_rel = df00.loc[:, roster_cols].copy()

In [8]:
# Inner join on household code + idc: only rows present in both frames are kept.
years12_13 = df0_rel.merge(
    df00_rel,
    how="inner",
    on=["hhcode", "idc"],
)

In [9]:
# Keep individuals older than 4 and no older than 10. The .copy() gives an
# independent frame, so the column assignments below do not write into a
# slice of the merged frame (avoids SettingWithCopyWarning).
years12_13 = years12_13.loc[
    (years12_13["age"] > 4) & (years12_13["age"] <= 10)
].copy()

# Label region codes (1 -> "urban", 2 -> "rural"); any other value is left
# as-is. Assigning the replaced Series back avoids writing strings into a
# numeric column element-wise through .loc.
years12_13["region"] = years12_13["region"].replace({1: "urban", 2: "rural"})

# The first four characters of the household code are used as the
# subprovince code.
years12_13["subprovince_code"] = years12_13["hhcode"].str[:4].astype("int")

# Province codes -> names; codes missing from the mapping become NaN.
years12_13["province"] = (
    years12_13["province"]
    .astype("int")
    .map({1: "K.P.K", 2: "Punjab", 3: "Sindh", 4: "Balochistan", 6: "Islamabad"})
)

years12_13["subprovince"] = years12_13["subprovince"].str.capitalize()

# creating year column with lowest year as value
years12_13["year"] = 2012

In [10]:
# Rows with no subprovince are labelled "Islamabad" at the subprovince level.
years12_13["subprovince"] = years12_13["subprovince"].fillna(value="Islamabad")

# Islamabad now lives in subprovince, so its province label is folded into Punjab.
is_islamabad_province = years12_13["province"].eq("Islamabad")
years12_13.loc[is_islamabad_province, "province"] = "Punjab"

In [11]:
# Spot-check ten randomly drawn rows (no seed, so the rows differ between runs).
years12_13.sample(n=10)

Unnamed: 0,hhcode,idc,ever_admitted,currently_enrolled,region,province,age,sex,marital_status,subprovince,subprovince_code,year
365456,3552001111.0,8,yes,yes,rural,Sindh,10,male,1,Sukkur,3552,2012
404184,4421000810.0,9,no,no,urban,Balochistan,9,male,1,Qilla abdullah,4421,2012
317914,3342002512.0,8,no,no,rural,Sindh,9,male,1,Shahdadkot,3342,2012
17480,1322002213.0,7,no,yes,rural,K.P.K,8,male,1,Batagram,1322,2012
10662,1222000411.0,8,no,no,rural,K.P.K,5,male,1,Tank,1222,2012
21104,1342000814.0,5,no,no,rural,K.P.K,6,female,1,Kohistan,1342,2012
283002,3211100506.0,7,no,yes,urban,Sindh,6,female,1,Karachi,3211,2012
333415,3422004803.0,6,yes,yes,rural,Sindh,10,female,1,Sanghar,3422,2012
395151,4322001410.0,5,no,no,rural,Balochistan,10,male,1,Jhal magsi,4322,2012
393236,4312001512.0,7,no,no,rural,Balochistan,8,male,1,Jaffarabad,4312,2012


In [12]:
# PRE-PROCESSING FOR DIFF-IN-DIFF DATA
# Convert currently_enrolled from "yes"/"no" to 1/0. The result is assigned
# back to the column: the chained `...replace(..., inplace=True)` form acts
# on a temporary selection, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
years12_13["currently_enrolled"] = years12_13["currently_enrolled"].replace(
    {"yes": 1, "no": 0}
)

group_keys = ["sex", "subprovince", "region"]
enrolled_by_group = years12_13.groupby(group_keys)["currently_enrolled"]

# aggregate dataset for sample population (rows per group)
df_grp_1 = enrolled_by_group.count().reset_index()
# aggregate dataset for enrollment total (sum of the 1/0 flag per group)
df_grp_2 = enrolled_by_group.sum().reset_index()

# merge the two aggregates; indicator=True adds a `_merge` column so the
# match can be checked (the count lands in currently_enrolled_x, the sum in
# currently_enrolled_y)
df_grp_merge = pd.merge(df_grp_1, df_grp_2, on=group_keys, indicator=True)
# check merge: every row is expected to be "both"
df_grp_merge._merge.value_counts()

both          456
right_only      0
left_only       0
Name: _merge, dtype: int64

In [13]:
# Enrollment rate = enrolled total / sample population per group, followed by
# friendlier column names and removal of the merge-indicator column.
df_grp_merge = (
    df_grp_merge
    .assign(
        rate_enrollment=lambda grp: grp["currently_enrolled_y"] / grp["currently_enrolled_x"]
    )
    .rename(
        columns={
            "currently_enrolled_x": "sample_population",
            "currently_enrolled_y": "enrolled_total",
        }
    )
    .drop(columns="_merge")
)
df_grp_merge.sample(n=5)

Unnamed: 0,sex,subprovince,region,sample_population,enrolled_total,rate_enrollment
59,male,Hyderabad,urban,247,206.0,0.834008
352,female,Manshera,rural,201,159.0,0.791045
129,male,Mastung,urban,68,61.0,0.897059
315,female,Khairpur,urban,125,82.0,0.656
243,female,Bhakar,urban,79,57.0,0.721519


In [14]:
# Write the grouped difference-in-difference table to the analysis folder
# (the frame's index is written as the first CSV column).
diff_csv_path = "../20_analysis/enrollment_clean/years12_13_merge_diff.csv"
df_grp_merge.to_csv(diff_csv_path)

In [15]:
# TODO: run df_grp_merge.isna().any() to check for missing values before submitting the CSV

In [16]:
# Write the cleaned individual-level frame to CSV (index included).
individual_csv_path = "clean_rashaad/years12_13.csv"
years12_13.to_csv(individual_csv_path)