In [1]:
import pandas as pd

In [2]:
df2 = pd.read_stata("1415_sec_c.dta", convert_categoricals=False)

In [3]:
df2["hhcode"] = df2["hhcode"].astype("str")

# cleaning admitted & enrolled
df2["ever_admitted"] = "no"
df2.loc[(df2["scq03"] == 1), "ever_admitted"] = "yes"

df2["currently_enrolled"] = "no"
df2.loc[(df2["scq05"] == 1), "currently_enrolled"] = "yes"

df2["max_level_achieved"] = df2["scq04"]

In [4]:
# Reduce to the questions of interest (4 questions):
df2_rel = df2[
    [
        "hhcode",
        "idc",
        "ever_admitted",
        "currently_enrolled",
        "region",
        "province"
    ]
].copy()

In [5]:
df20 = pd.read_stata("1415_sec_b.dta")

In [6]:
df20.sample(10)

Unnamed: 0,hhcode,psu,province,region,district,sec,idc,sbq02,sbq03,sbq04,sbq05,sbq61,sbq62,sbq63,age,sbq07,sbq08,sbq09,sbq10,sbq11
340646,3141001000.0,31410011,sindh,rural,larkana,00B,4,brother\sister,main provider away for work,female,present,1989,0,0,25,unmarried/never married,,3,98,yes
64699,1422200000.0,14222004,kp,urban,peshawar,00B,2,spouse,others,female,present,1976,0,0,38,currently married,1.0,99,99,yes
54169,1311002000.0,13110015,kp,rural,mardan,00B,4,grand child,family elder,male,present,2011,9,0,3,unmarried/never married,,99,2,yes
167204,2412200000.0,24122001,punjab,urban,gujranwala,00B,3,son/daughter,main economic provider,female,present,2005,0,0,9,unmarried/never married,,1,2,yes
15425,1151000000.0,11510003,kp,rural,shangla,00B,10,son/daughter,main economic provider,male,present,2007,3,15,7,unmarried/never married,,1,2,yes
119793,2221004000.0,22210044,punjab,rural,bhakkar,00B,3,son/daughter,main economic provider,female,present,1992,0,0,22,unmarried/never married,,1,2,yes
484300,4421001000.0,44210010,balochistan,rural,jaffarabad,00B,7,son/daughter,family elder,female,present,2008,0,0,6,unmarried/never married,,1,2,yes
493426,4511001000.0,45110007,balochistan,rural,kalat,00B,2,spouse,family elder,female,present,1989,0,0,25,currently married,1.0,99,99,yes
133297,2241004000.0,22410043,punjab,rural,mianwali,00B,3,son/daughter,main economic provider,male,present,2001,0,0,13,unmarried/never married,,1,2,yes
510817,6111000000.0,61110004,punjab,rural,islamabad,00B,2,son/daughter,main provider away for work,female,present,2002,0,0,12,unmarried/never married,,99,1,yes


In [7]:
df20["hhcode"] = df20["hhcode"].astype("str")

df20['sex'] = df20['sbq04']
df20['marital_status'] = df20['sbq07']
df20['subprovince'] = df20['district']

In [8]:
df20_rel = df20[['hhcode', 'age','idc', 'sex', 'marital_status', 'subprovince']].copy()

In [9]:
years14_15 = pd.merge(df2_rel, df20_rel, on=["hhcode", "idc"], how='inner', indicator=False)

In [10]:
years14_15 = years14_15.loc[(years14_15["age"] > 4)&(years14_15["age"] <= 15)]

years14_15.loc[(years14_15["region"] == 1), 'region'] = 'urban'
years14_15.loc[(years14_15["region"] == 2), 'region'] = 'rural'

years14_15["subprovince code"] = years14_15["hhcode"].apply(lambda x: x[0:4])
years14_15["subprovince code"] = years14_15["subprovince code"].astype("int")

years14_15["province"] = years14_15["province"].astype("int")

years14_15["province"] = years14_15["province"].map(
    {1: "K.P.K", 2: "Punjab", 3: "Sindh", 4: "Balochistan"}
)

# modifying marital status to align with desired output
years14_15["marital_status"].replace(
    [
        "unmarried/never married",
        "currently married",
        "widow/widower",
        "divorced",
        "nikah has been solemnised but the rukhsati has  not taken place",
    ],
    [1, 2, 3, 4, 5],
    inplace=True,
)

years14_15.subprovince = years14_15.subprovince.str.capitalize()

years14_15['year'] = 2014

In [11]:
years14_15.province.unique()

array(['K.P.K', 'Punjab', 'Sindh', 'Balochistan'], dtype=object)

In [12]:
years14_15.sample(10)

Unnamed: 0,hhcode,idc,ever_admitted,currently_enrolled,region,province,age,sex,marital_status,subprovince,subprovince code,year
245119,2841005413.0,3,no,no,urban,Punjab,14,male,1,Muzaffargarh,2841,2014
82560,1721000907.0,5,yes,yes,urban,K.P.K,8,male,1,Tank,1721,2014
14035,1151000503.0,4,yes,yes,urban,K.P.K,14,male,1,Shangla,1151,2014
384750,3441002901.0,3,no,no,urban,Sindh,10,female,1,Tharparkar,3441,2014
75075,1621001605.0,4,yes,yes,urban,K.P.K,11,female,1,Lakki marwat,1621,2014
159228,2441000413.0,8,yes,yes,urban,Punjab,12,female,1,Mandi bahauddin,2441,2014
407882,4231000116.0,5,yes,yes,urban,Balochistan,10,male,1,Musakhel,4231,2014
63557,1511002204.0,3,yes,yes,urban,K.P.K,12,male,1,Kohat,1511,2014
49480,1312000104.0,6,yes,yes,rural,K.P.K,14,male,1,Mardan,1312,2014
95832,2141000907.0,5,yes,yes,urban,Punjab,12,female,1,Chakwal,2141,2014


In [13]:
years14_15.to_csv("years14_15_f.csv")