In [1]:
import pandas as pd

In [2]:
df2 = pd.read_stata("1415_sec_c.dta", convert_categoricals=False)

In [3]:
df2["hhcode"] = df2["hhcode"].astype("str")

# cleaning admitted & enrolled
df2["ever_admitted"] = "no"
df2.loc[(df2["scq03"] == 1), "ever_admitted"] = "yes"

df2["currently_enrolled"] = "no"
df2.loc[(df2["scq05"] == 1), "currently_enrolled"] = "yes"

df2["max_level_achieved"] = df2["scq04"]

In [4]:
# Reduce to the questions of interest (4 questions):
df2_rel = df2[
    [
        "hhcode",
        "idc",
        "ever_admitted",
        "currently_enrolled",
        "region",
        "province"
    ]
].copy()

In [5]:
df20 = pd.read_stata("1415_sec_b.dta")

In [6]:
df20.sample(10)

Unnamed: 0,hhcode,psu,province,region,district,sec,idc,sbq02,sbq03,sbq04,sbq05,sbq61,sbq62,sbq63,age,sbq07,sbq08,sbq09,sbq10,sbq11
365941,3232001000.0,32320005,sindh,urban,khairpur,00B,7,son/daughter,family elder,female,present,2011,0,0,3,unmarried/never married,,1,2,yes
246799,2742001000.0,27420006,punjab,urban,khanewal,00B,3,nephew\niece,main economic provider,female,present,2010,6,0,4,unmarried/never married,,2,99,yes
38932,1231001000.0,12310014,kp,rural,batagram,00B,2,spouse,main economic provider,female,present,1963,0,0,51,currently married,1.0,98,99,yes
17853,1151003000.0,11510025,kp,rural,shangla,00B,6,son/daughter,main economic provider,male,present,2008,0,0,6,unmarried/never married,,1,2,yes
108957,2141003000.0,21410032,punjab,rural,chakwal,00B,6,son/daughter,main economic provider,female,present,2000,0,0,14,unmarried/never married,,1,2,yes
122666,2231000000.0,22310003,punjab,rural,khushab,00B,6,son/daughter,main economic provider,male,present,2003,0,0,11,unmarried/never married,,1,2,yes
489362,4432000000.0,44320004,balochistan,urban,nasirabad/ tamboo,00B,11,son/daughter,family elder,male,present,2007,0,0,7,unmarried/never married,,1,2,yes
336177,3131003000.0,31310028,sindh,rural,shikarpur,00B,3,son/daughter,family elder,female,present,1989,0,0,25,unmarried/never married,,1,2,yes
421337,3422001000.0,34220005,sindh,urban,mirpur khas,00B,3,son/daughter,main economic provider,female,present,1991,0,0,23,unmarried/never married,,1,2,yes
57472,1321002000.0,13210016,kp,rural,swabi,00B,4,son/daughter,family elder,male,present,2003,0,0,11,unmarried/never married,,1,2,yes


In [7]:
df20["hhcode"] = df20["hhcode"].astype("str")

df20['sex'] = df20['sbq04']
df20['marital_status'] = df20['sbq07']
df20['subprovince'] = df20['district']

In [8]:
df20_rel = df20[['hhcode', 'age','idc', 'sex', 'marital_status', 'subprovince']].copy()

In [9]:
years14_15 = pd.merge(df2_rel, df20_rel, on=["hhcode", "idc"], how='inner', indicator=False)

In [10]:
years14_15 = years14_15.loc[(years14_15["age"] > 4)&(years14_15["age"] <= 15)]

years14_15.loc[(years14_15["region"] == 1), 'region'] = 'urban'
years14_15.loc[(years14_15["region"] == 2), 'region'] = 'rural'

years14_15["subprovince code"] = years14_15["hhcode"].apply(lambda x: x[0:4])
years14_15["subprovince code"] = years14_15["subprovince code"].astype("int")

years14_15["province"] = years14_15["province"].astype("int")

years14_15["province"] = years14_15["province"].map(
    {1: "K.P.K", 2: "Punjab", 3: "Sindh", 4: "Balochistan"}
)

# modifying marital status to align with desired output
years14_15["marital_status"].replace(
    [
        "unmarried/never married",
        "currently married",
        "widow/widower",
        "divorced",
        "nikah has been solemnised but the rukhsati has  not taken place",
    ],
    [1, 2, 3, 4, 5],
    inplace=True,
)

years14_15.subprovince = years14_15.subprovince.str.capitalize()

years14_15['year'] = 2014

In [11]:
years14_15.sample(10)

Unnamed: 0,hhcode,idc,ever_admitted,currently_enrolled,region,province,age,sex,marital_status,subprovince,subprovince code,year
364927,3381001803.0,7,no,no,urban,Sindh,10,male,1,Thatta,3381,2014
344858,3341001010.0,5,no,no,urban,Sindh,12,female,1,Tando allah yar,3341,2014
340312,3321003210.0,6,yes,yes,urban,Sindh,11,male,1,Jamshoro,3321,2014
309673,3151002810.0,15,yes,yes,urban,Sindh,7,male,1,Shahdadkot,3151,2014
289346,3112000109.0,3,yes,yes,rural,Sindh,8,male,1,Jacobabad,3112,2014
135525,2331004416.0,4,yes,yes,urban,Punjab,11,female,1,Jhang,2331,2014
311602,3152000412.0,3,yes,yes,rural,Sindh,12,male,1,Shahdadkot,3152,2014
334610,3252001008.0,4,yes,yes,rural,Sindh,12,male,1,Shaheed benazir abad,3252,2014
138113,2331007314.0,5,yes,no,urban,Punjab,13,female,1,Jhang,2331,2014
87570,2111003302.0,3,yes,yes,urban,Punjab,10,female,1,Attock,2111,2014


In [12]:
# PRE-PROCESSING FOR DIFF-IN-DIFF DATA
# convert currently_enrolled from string to integer
years14_15["currently_enrolled"].replace("yes", 1, inplace=True)
years14_15["currently_enrolled"].replace("no", 0, inplace=True)
# check region for anomalies
#years07_08_f["region"].replace(3, "urban", inplace=True)
# aggregate dataset for sample population
df_grp_1 = years14_15.groupby(["sex","subprovince","region"])["currently_enrolled"].count().reset_index()
# aggregate dataset for enrollment total
df_grp_2 = years14_15.groupby(["sex","subprovince","region"])["currently_enrolled"].sum().reset_index()
# merge data set
df_grp_merge = pd.merge(df_grp_1, df_grp_2, on=["sex","subprovince","region"], indicator=True)
# check merge
df_grp_merge._merge.value_counts()

both          456
left_only       0
right_only      0
Name: _merge, dtype: int64

In [13]:
df_grp_merge.columns

Index(['sex', 'subprovince', 'region', 'currently_enrolled_x',
       'currently_enrolled_y', '_merge'],
      dtype='object')

In [14]:
# compute enrollment_rate
df_grp_merge["rate_enrollment"] = df_grp_merge["currently_enrolled_y"]/df_grp_merge["currently_enrolled_x"]
# rename columns
df_grp_merge = df_grp_merge.rename(columns={"currently_enrolled_x":"sample_population", "currently_enrolled_y":"enrolled_total"})
# drop "_merge" column
df_grp_merge.drop(["_merge"], axis=1, inplace=True)

In [15]:
df_grp_merge.sample(5)

Unnamed: 0,sex,subprovince,region,sample_population,enrolled_total,rate_enrollment
136,male,Mirpur khas,rural,172,141,0.819767
220,male,Vehari,rural,64,56,0.875
264,female,D. i. khan,rural,54,42,0.777778
240,female,Bannu,rural,24,23,0.958333
415,female,Sheikhupura,urban,716,563,0.786313


In [16]:
# save clean difference-in-difference data set
#df_grp_merge.to_csv("years14_15_merge_diff.csv")

In [17]:
#df_grp_merge.isna().any() # DO THIS BEFORE SUBMITTING THE CSV

In [18]:
years14_15.to_csv("years14_15.csv")