In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.simplefilter(action="ignore")

from statsmodels.stats.weightstats import ttest_ind
from scipy import stats
from sklearn import metrics

%matplotlib inline

## Load and Check Primary Dataset 
---

In [2]:
df = pd.read_stata("maindataset_thesis.dta")
df = df.query('tk01 != 5 and tk01 != 7') # exclude the retired
df.sample(5)

Unnamed: 0,sex,respondent,age,marstat,pidlink,ar02b,hhid07,pid07,hhid00,pid00,...,E_hr1type_t1,F_hr1type_t1,IHK_t1,IHK,nmkab2014,hhid14,pid14,_2014,married,single
5725,0,3,19.0,single,74130013,3,741300,13.0,,,...,2388851.88,657021.24,76.21,118.06,,741300.0,13.0,1,0,1
7452,1,3,15.0,single,194192104,13,1941921,4.0,,,...,43906501.6,4173251.08,84.13,115.36,SRAGEN,1941921.0,4.0,1,0,1
2550,1,3,20.0,single,185080005,3,1850800,5.0,1850800.0,5.0,...,1.0,2000001.0,41.94,77.88,BREBES,,,0,0,1
3556,1,3,23.0,single,266110003,3,2661100,3.0,2661100.0,3.0,...,1.0,8000001.0,44.42,77.8,DENPASAR,,,0,0,1
3838,0,3,22.0,single,283070004,3,2830700,4.0,2830700.0,4.0,...,3.0,2500001.0,40.81,73.87,LOMBOK TIMUR,,,0,0,1


In [3]:
df.shape

(9348, 48)

In [4]:
df.columns

Index(['sex', 'respondent', 'age', 'marstat', 'pidlink', 'ar02b', 'hhid07',
       'pid07', 'hhid00', 'pid00', 'tk02', 'tk03', 'tk04', 'tk01a', 'tk01b',
       'tk01c', 'tk01d', 'tk16d', 'tk16i', 'tk01', 'tk25a9x', 'tk16h', 'main',
       'NEET', 'NEET__just_other_', 'year', 'nonfood_t1', 'food_t1', 'pce_t1',
       'kabid_t1', 'nonfood_t', 'food_t', 'pce_t', 'kabid', 'A_hr1type_t',
       'E_hr1type_t', 'F_hr1type_t', 'A_hr1type_t1', 'E_hr1type_t1',
       'F_hr1type_t1', 'IHK_t1', 'IHK', 'nmkab2014', 'hhid14', 'pid14',
       '_2014', 'married', 'single'],
      dtype='object')

In [5]:
df["tk01"].value_counts()

3     3963
1     3197
4     1217
95     544
2      427
Name: tk01, dtype: int64

In [6]:
a = df.query('tk01 == 4 & NEET == 1')
b = df.query('tk01 == 95 & NEET == 1')
c = df.query('tk01 == 2 & NEET == 1')
d = df.query('tk01 == 1 & NEET == 0')
e = df.query('tk01 == 3 & NEET == 0')

In [7]:
df1 = pd.concat([a,b,c,d,e])
z = df1.index.tolist()

In [8]:
#make sure that main activity in line with our NEET definition
w = []
for i in df.index.tolist():
    if i not in z: # check wheter the main activity should be NEET but they have more than one activity which non-NEET
        w.append(i)

In [9]:
df2 = df.loc[w]
df2["tk01"] = df2[["tk01a", "tk02", "tk03", "tk04"]].sum(axis=1).apply(lambda x: 3 if x == 0 else 1) # "tk01b", "tk01c", "tk01d", "tk25a9x"

# I can't control the mutually exclusiveness within this group
print ("There are", len(df2), "samples based on main activity ALONE should classify to NEET yet actually not--based on series of activity") 

There are 561 samples based on main activity ALONE should classify to NEET yet actually not--based on series of activity


In [10]:
data = pd.concat([df1, df2])
data.sample(5)

Unnamed: 0,sex,respondent,age,marstat,pidlink,ar02b,hhid07,pid07,hhid00,pid00,...,E_hr1type_t1,F_hr1type_t1,IHK_t1,IHK,nmkab2014,hhid14,pid14,_2014,married,single
975,0,3,18.0,single,64200004,3,642031,3.0,642031.0,3.0,...,3.0,20001.0,37.94,72.13,BANDAR LAMPUNG,,,0,0,1
9256,1,3,20.0,single,311170005,3,3111700,5.0,,,...,100001.0,400001.0,77.45,113.71,BARRU,3111700.0,5.0,1,0,1
7568,1,3,17.0,single,202120004,3,2021200,4.0,,,...,13000001.0,1000001.0,78.26,116.88,BANTUL,2021200.0,4.0,1,0,1
973,1,3,15.0,single,64140005,3,641400,5.0,641400.0,5.0,...,3.0,3.0,37.94,72.13,BANDAR LAMPUNG,,,0,0,1
5707,0,3,18.0,single,73103103,10,731043,6.0,,,...,552201.9,1659001.04,79.08,117.73,BANGKALAN,731043.0,6.0,1,0,1


In [11]:
data.shape

(9348, 48)

## Gathering other control variables IFLS
---

### 2007

In [12]:
def tk01(data):
    if data == 1:
        return "Working"
    elif data == 2:
        return "Unemployed"
    elif data == 3:
        return "Student"
    elif data == 4:
        return "Housekeep"
    else:
        return "Other"

In [13]:
path07 = "C:/Users/Redata/Downloads/read/kemiskinan/hh2007/"
a = pd.read_stata(path07 + "pce07nom.dta")[["hhid07", "hhsize"]]
b = pd.read_stata(path07 + "bk_sc.dta")[["hhid07", "sc05", "sc21x"]]

#parent char
parent1 = pd.read_stata(path07 + "b3a_cov.dta")[["sex", "respndnt", "age", "hhid07", "pidlink"]]
parent1["respndnt"] = parent1["respndnt"].astype("str").str.extract(r'(\d+)')[0]
parent1 = parent1[parent1["respndnt"] == "1"].groupby(["hhid07"], as_index = False)[["age", "pidlink", "sex"]].max()

parent2 = pd.read_stata(path07 + "b3a_tk1.dta")[["tk01", "pidlink"]]
parent2.tk01 = parent2.tk01.astype("str").str.extract(r'(\d+)')[0].astype("float").apply(lambda x: tk01(x))
parent = pd.merge(parent1, parent2, left_on="pidlink", right_on="pidlink", how="left").drop_duplicates()
parent = parent.rename(columns={"sex":"hh_sex", "age":"hh_age", "tk01":"parent_tk01", "pidlink":"hh_pidlink"})

a_p = a.copy()
for i in [b,parent]:
    a_p = pd.merge(a_p,i, left_on="hhid07", right_on="hhid07", how="outer").drop_duplicates()

f = data[data["year"] == 2007]
e = pd.merge(f, a_p, left_on="hhid07", right_on="hhid07", how="left").drop_duplicates()

In [14]:
f.shape, e.shape

((4485, 48), (4485, 55))

In [15]:
sw = pd.read_stata(path07 +"b3a_sw.dta")[["pidlink", "sw01", "sw03b"]]
kk = pd.read_stata(path07 +"b3b_kk1.dta")[["pidlink", "kk01", "kk02a", "kk02c"]]

#education vairables
educ1 = pd.read_stata(path07 +"b3a_dl2.dta")[["dl2type", "pidlink", "dl16j", "dl16ja"]]
educ1_t = pd.merge(educ1.groupby(["pidlink"], as_index=False)["dl2type"].max(), 
         educ1, left_on=["pidlink", "dl2type"], right_on=["pidlink", "dl2type"])

educ2 = pd.read_stata(path07 +"b3a_dl1.dta")[["pidlink", "dl06", "dl07", "dl05b"]]

educ = pd.merge(educ1_t, educ2, left_on="pidlink", right_on="pidlink")

#the distance (in minutes) to school
#convert to minutes
educ.dl16ja = educ.dl16ja.apply(lambda x:60 if x == 2 else x) #change the a hour measurment
educ["dl16j"] = educ["dl16j"] * educ["dl16ja"]

#general health
health1 = pd.read_stata(path07 +"bus1_1.dta")[[ "pidlink", "us06"]]
health2 = pd.read_stata(path07 +"bus1_2.dta")[["pidlink", "us04"]]
health = pd.merge(health1, health2, left_on="pidlink", right_on="pidlink").groupby("pidlink", as_index=False)[["us06", "us04"]].mean()

#logical test
logic = pd.read_stata(path07 +"bek_ek2.dta", index_col="pidlink")[["ek1x", "ek2x", "ek3x", "ek4x", "ek5x", "ek6x", "ek7x", "ek8x",
                                             "ek9x", "ek10x", "ek11x", "ek12x", "ek13x", "ek14x", "ek15x", "ek16x", 
                                              "ek17x", "ek18x","ek19x", "ek20x"]]
for i in logic.columns:
    logic[i] = logic[i].apply(lambda x: 0 if x != 1 else 1)

logic["test"] = logic.sum(axis=1)/20
logic.reset_index(inplace=True)

### Additional variable

In [16]:
read = pd.read_stata( path07+ "b3a_dl1.dta")[["pidlink", "dl02", "dl03"]] 
#migra = pd.read_stata( path07+ "b3a_mg2.dta")[["pidlink", "mg27x", "mg36"]]
preference = pd.read_stata( path07+ "b3a_si.dta")[["pidlink", "si21b", "si22b", "si21a"]]
village = pd.read_stata( path07+ "b3a_tr.dta")[["pidlink", "tr06", "tr11"]]
tobaco = pd.read_stata( path07 + "b3b_km.dta")[["pidlink", "km01a"]]
election = pd.read_stata( path07+ "b3b_pm1.dta")[["pidlink", "pm24a", "pm24b", "pm24c","pm24d","pm24e","pm24f","pm24g","pm24h"]]

In [17]:
df = sw.copy()
for i in [kk, educ, health, logic[["test", "pidlink"]], read, preference, village, tobaco, election]:
    df = pd.merge(df, i, left_on="pidlink", right_on="pidlink", how="inner").drop_duplicates()

In [18]:
def educagg(data):
    one = [2, 11, 90, 72] #primary school
    two = [3, 4, 12, 73] #junior high school
    three = [5, 6, 15, 74] #high school
    four = [60, 61, 62, 63, 13] #college
    if data in one:
        return 1
    elif data in two:
        return 2
    elif data in three:
        return 3
    elif data in four:
        return 4
    else:
        return np.nan
    
df["educ"] = df["dl06"].apply(lambda x: educagg(x)) #aggregation
df["educ"] = df["educ"].fillna(df["dl2type"]) #fillna with another variable

In [19]:
data2007 = pd.merge(e, df, left_on="pidlink", right_on="pidlink", how="left").drop_duplicates()
data2007.shape,  data2007.pidlink.nunique()

((4485, 86), 4485)

### 2014

In [20]:
path14 = "C:/Users/Redata/Downloads/read/kemiskinan/hh2014/"
r = pd.read_stata(path14 + "pce14nom.dta")[["hhid14", "hhsize"]]
s = pd.read_stata(path14 + "bk_sc1.dta")[["hhid14", "sc05", "sc21x"]]

#parent char
parent1 = pd.read_stata(path14 + "b3a_cov.dta")[["sex", "rspndnt", "age", "hhid14", "pidlink"]]
parent1["rspndnt"] = parent1["rspndnt"].astype("str").str.extract(r'(\d+)')[0]
parent1 = parent1[parent1["rspndnt"] == "1"].groupby(["hhid14"], as_index = False)[["age", "pidlink", "sex"]].max()

parent2 = pd.read_stata(path14 + "b3a_tk1.dta")[["tk01", "pidlink"]]
parent2.tk01 = parent2.tk01.astype("str").str.extract(r'(\d+)')[0].astype("float").apply(lambda x: tk01(x))
parent = pd.merge(parent1, parent2, left_on="pidlink", right_on="pidlink", how="left").drop_duplicates()
parent = parent.rename(columns={"sex":"hh_sex", "age":"hh_age", "tk01":"parent_tk01", "pidlink":"hh_pidlink"})

a_p = r.copy()
for i in [s,parent]:
    a_p = pd.merge(a_p,i, left_on="hhid14", right_on="hhid14", how="outer").drop_duplicates()
    
v = data[data["year"] == 2014]
u = pd.merge(v, a_p, left_on="hhid14", right_on="hhid14", how="left").drop_duplicates()

In [21]:
u.shape, v.pidlink.nunique()

((4863, 55), 4863)

In [22]:
sw = pd.read_stata(path14 +"b3a_sw.dta")[["pidlink", "sw01", "sw03b"]]
kk = pd.read_stata(path14 +"b3b_kk1.dta")[["pidlink", "kk01", "kk02a", "kk02c"]]


#education vairables
educ1 = pd.read_stata(path14 +"b3a_dl2.dta")[["dl2type", "pidlink", "dl16j", "dl16ja"]]
educ1_t = pd.merge(educ1.groupby(["pidlink"], as_index=False)["dl2type"].max(), 
         educ1, left_on=["pidlink", "dl2type"], right_on=["pidlink", "dl2type"])

educ2 = pd.read_stata(path14 +"b3a_dl1.dta")[["pidlink", "dl06", "dl07", "dl05b"]]

educ = pd.merge(educ1_t, educ2, left_on="pidlink", right_on="pidlink").drop_duplicates()

#the distance (in minutes) to school
#convert to minutes
educ.dl16ja = educ.dl16ja.astype("str").str.extract(r'(\d+)')[0].apply(lambda x: 0 if x != "1" else 1).astype("float")
educ.dl16ja = educ.dl16ja.apply(lambda x:60 if x == 2 else x) #change the a hour measurment
educ["dl16j"] = educ["dl16j"].astype("float") * educ["dl16ja"]

#general health
health = pd.read_stata(path14 +"bus_us.dta")[[ "pidlink", "us06", "us04"]]

#logical test
logic = pd.read_stata(path14 +"ek_ek2.dta", index_col="pidlink")[["ek1_ans", "ek2_ans", "ek3_ans", "ek4_ans", "ek5_ans", "ek6_ans",
                                             "ek7_ans", "ek8_ans", "ek9_ans", "ek10_ans", "ek11_ans", "ek12_ans", 
                                             "ek13_ans", "ek14_ans", "ek15_ans", "ek16_ans", "ek17_ans", "ek18_ans",
                                             "ek19_ans", "ek20_ans", "ek21_ans", "ek22_ans"]]
for i in logic.columns:
    logic[i] = logic[i].astype("str").str.extract(r'(\d+?)')[0].astype("float").apply(lambda x: 0 if x != 1 else 1)

logic["test"] = logic.sum(axis=1)/22
logic.reset_index(inplace=True)

### Additional variable

In [23]:
read = pd.read_stata( path14+ "b3a_dl1.dta")[["pidlink", "dl02", "dl03"]] 
#migra = pd.read_stata( path14+ "b3a_mg2.dta")[["pidlink", "mg27x", "mg36"]]
preference = pd.read_stata( path14+ "b3a_si.dta")[["pidlink", "si21b", "si22b", "si21a"]]
village = pd.read_stata( path14+ "b3a_tr.dta")[["pidlink", "tr06", "tr11"]]
tobaco = pd.read_stata( path14 + "b3b_km.dta")[["pidlink", "km01a"]]
election = pd.read_stata( path14+ "b3b_pm1.dta")[["pidlink", "pm24a", "pm24b", "pm24c","pm24d","pm24e","pm24f","pm24g","pm24h"]]

In [24]:
df = sw.copy()
for i in [kk, educ, health, logic[["test", "pidlink"]], read, preference, village, tobaco, election]:
    df = pd.merge(df, i, left_on="pidlink", right_on="pidlink", how="outer").drop_duplicates()

In [25]:
df["educ"] = df["dl06"].apply(lambda x: educagg(x)) #aggregation
df["educ"] = df["educ"].fillna(df["dl2type"]) #fillna with another variable

In [26]:
data2014 = pd.merge(u, df, left_on="pidlink", right_on="pidlink", how="left").drop_duplicates()
data2014.shape,  data2014.pidlink.nunique()

((4863, 86), 4863)

## Concat both year

In [27]:
data = pd.concat([data2007, data2014])
data

Unnamed: 0,sex,respondent,age,marstat,pidlink,ar02b,hhid07,pid07,hhid00,pid00,...,km01a,pm24a,pm24b,pm24c,pm24d,pm24e,pm24f,pm24g,pm24h,educ
0,1,3,20.0,married,002184101,3,0021841,1.0,0021841,,...,1,3,3,3,3,3,3,3,3,2
1,0,3,22.0,single,003180005,3,0031800,5.0,0031800,5.0,...,3,1,1,1,1,1,6,1,1,3
2,0,3,24.0,single,004030005,3,0040321,9.0,0040321,8.0,...,3,3,3,3,3,3,3,3,3,1
3,0,3,19.0,single,004030007,3,0040321,4.0,0040321,4.0,...,3,6,6,6,6,6,6,6,6,3
4,0,3,20.0,single,004040006,3,0040400,6.0,0040400,6.0,...,3,1,1,1,1,1,6,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4858,0,3,20.0,single,321100006,3,3211000,6.0,,,...,3:No,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,3:No,3:Senior high
4859,0,3,23.0,single,321110003,3,3211100,3.0,,,...,3:No,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,6:Not applicable,3:Senior high
4860,1,3,20.0,single,321110005,3,3211100,5.0,,,...,1:Yes,3:No,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,6:Not applicable,3:Senior high
4861,0,3,24.0,married,321150006,3,3211500,6.0,,,...,3:No,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,1:Yes,6:Not applicable,3:Senior high


In [28]:
data.parent_tk01.value_counts()

Working       6933
Housekeep      838
Other          638
Unemployed      65
Student          4
Name: parent_tk01, dtype: int64

## Gathering Unemployement Rate --Sakernas

### Sakernas 2007
Based on work activity, job searching, temporary not working, build business, in order to classify to workforce and unemployed.

In [29]:
sakernas07 = pd.read_stata("Sakernas 2007 FF.dta")[["b1p01", "b1p02", "b4p2a1", "b4p3", "b4p4", "b4p5"]]
sakernas07["b1p02"] = sakernas07["b1p02"].apply(lambda x : str(x) if len(str((x))) > 1 else "0" + str(x))
sakernas07["kabid"] = sakernas07["b1p01"].astype("str") + sakernas07["b1p02"]
sakernas07.fillna(2, inplace=True)

In [30]:
sakernas07["sample"] = 1
populasi07 = sakernas07.groupby("kabid", as_index=False)["sample"].sum()

In [31]:
for i in ["b4p2a1", "b4p3", "b4p4", "b4p5"]:
    sakernas07[i] = sakernas07[i].apply(lambda x: x if x == 1 else 0)

In [32]:
sakernas07["pt"] = sakernas07[["b4p3", "b4p4", "b4p5"]].sum(axis=1).apply(lambda x: 1 if x > 0 else 0)
sakernas07["ak"] = sakernas07[["b4p2a1", "b4p3", "b4p4", "b4p5"]].sum(axis=1).apply(lambda x: 1 if x > 0 else 0)

In [33]:
tpt07 = sakernas07.groupby("kabid", as_index=False)[["pt", "ak"]].sum()
tpt07["unemployment_rate"] = tpt07["pt"] / tpt07["ak"]
tpt07["year"] = 2007

In [34]:
tpt07 = pd.merge(tpt07, populasi07, left_on="kabid", right_on="kabid", how="left")

### Sakernas 2014
Based on work force and unemployment variables -- more direct.

In [35]:
sakernas14 = pd.read_stata("sak_0814.dta")[["kode_pro", "kode_kab", "ak", "pt"]]
sakernas14["kode_kab"] = sakernas14["kode_kab"].apply(lambda x : str(x) if len(str((x))) > 1 else "0" + str(x))
sakernas14["kabid"] = sakernas14["kode_pro"].astype("str") + sakernas14["kode_kab"]

In [36]:
sakernas14["sample"] = 1
populasi14 = sakernas14.groupby("kabid", as_index=False)["sample"].sum()

In [37]:
sakernas14.ak = sakernas14.ak.apply(lambda x: 0 if x == np.nan else 1)
sakernas14.pt = sakernas14.pt.apply(lambda x: 0 if x == np.nan else 1)

In [38]:
tpt14 = sakernas14.groupby("kabid", as_index=False)[["pt", "ak"]].sum()
tpt14["unemployment_rate"] = tpt14["pt"] / tpt14["ak"]
tpt14["year"] = 2014

In [39]:
tpt14 = pd.merge(tpt14, populasi14, left_on="kabid", right_on="kabid", how="left")

### Combined them

In [40]:
tpt = pd.concat([tpt14, tpt07])
tpt["kabid"] = tpt["kabid"].astype("float")
tpt.sample(5)

Unnamed: 0,kabid,pt,ak,unemployment_rate,year,sample
474,9412.0,16.0,410.0,0.039024,2014,729
226,3504.0,21.0,921.0,0.022801,2014,1284
163,3207.0,39.0,942.0,0.041401,2014,1426
247,3525.0,40.0,915.0,0.043716,2014,1404
295,6203.0,125.0,1230.0,0.101626,2007,1774


## Merge Sakernas to IFLS

In [41]:
data = pd.merge(data, tpt, left_on=["year", "kabid"], right_on=["year", "kabid"], how="left")
data

Unnamed: 0,sex,respondent,age,marstat,pidlink,ar02b,hhid07,pid07,hhid00,pid00,...,pm24d,pm24e,pm24f,pm24g,pm24h,educ,pt,ak,unemployment_rate,sample
0,1,3,20.0,married,002184101,3,0021841,1.0,0021841,,...,3,3,3,3,3,2,69.0,1397.0,0.049392,2298.0
1,0,3,22.0,single,003180005,3,0031800,5.0,0031800,5.0,...,1,1,6,1,1,3,301.0,1454.0,0.207015,2627.0
2,0,3,24.0,single,004030005,3,0040321,9.0,0040321,8.0,...,3,3,3,3,3,1,118.0,1069.0,0.110384,1916.0
3,0,3,19.0,single,004030007,3,0040321,4.0,0040321,4.0,...,6,6,6,6,6,3,118.0,1069.0,0.110384,1916.0
4,0,3,20.0,single,004040006,3,0040400,6.0,0040400,6.0,...,1,1,6,1,1,2,198.0,1315.0,0.150570,1896.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9343,0,3,20.0,single,321100006,3,3211000,6.0,,,...,1:Yes,1:Yes,1:Yes,1:Yes,3:No,3:Senior high,66.0,749.0,0.088117,1326.0
9344,0,3,23.0,single,321110003,3,3211100,3.0,,,...,1:Yes,1:Yes,1:Yes,1:Yes,6:Not applicable,3:Senior high,66.0,749.0,0.088117,1326.0
9345,1,3,20.0,single,321110005,3,3211100,5.0,,,...,1:Yes,1:Yes,1:Yes,1:Yes,6:Not applicable,3:Senior high,66.0,749.0,0.088117,1326.0
9346,0,3,24.0,married,321150006,3,3211500,6.0,,,...,1:Yes,1:Yes,1:Yes,1:Yes,6:Not applicable,3:Senior high,66.0,749.0,0.088117,1326.0


## Cleaning Data
---

In [42]:
col_to_clean = ["educ", "dl05b", "dl07", "dl06", "dl2type", "kk02c", "kk01", "sw03b", "sw01", "sc21x", 
               "dl02", "dl03", "si21b", "si22b", "si21a", "sc05",
                "tr06", "tr11", "km01a", "pm24a", "pm24b", "pm24c","pm24d","pm24e","pm24f","pm24g","pm24h"]

for i in col_to_clean:
    data[i] = data[i].astype("str").str.extract(r'(\d+)')[0].astype("float")

In [43]:
binar = ["sc05", "dl02", "dl03", "si21b", "si22b", "si21a", 
         "km01a", "pm24a", "pm24b", "pm24c","pm24d","pm24e","pm24f","pm24g","pm24h"]
for i in binar:
    data[i] = data[i].astype("float").apply(lambda x: np.nan if x in [8, 9] else x).astype("float")
    
for i in binar:
    data[i] = data[i].astype("float").apply(lambda x: 0 if x not in [1, np.nan] else x).astype("float")

In [44]:
#Move out of the village, same province 
#Move out of the village, different province
#Move out of the village, same district
#Move out of the village, same sub-district
# >> as migran
data["sc21x"] = data["sc21x"].astype("float").apply(lambda x: np.nan if x in [99, 98] else x).astype("float")
data["sc21x"] = data["sc21x"].astype("float").apply(lambda x: 1 if x in [11, 12, 13, 14] else ( x if x == np.nan else 0)).astype("float")

In [45]:
data

Unnamed: 0,sex,respondent,age,marstat,pidlink,ar02b,hhid07,pid07,hhid00,pid00,...,pm24d,pm24e,pm24f,pm24g,pm24h,educ,pt,ak,unemployment_rate,sample
0,1,3,20.0,married,002184101,3,0021841,1.0,0021841,,...,0.0,0.0,0.0,0.0,0.0,2.0,69.0,1397.0,0.049392,2298.0
1,0,3,22.0,single,003180005,3,0031800,5.0,0031800,5.0,...,1.0,1.0,0.0,1.0,1.0,3.0,301.0,1454.0,0.207015,2627.0
2,0,3,24.0,single,004030005,3,0040321,9.0,0040321,8.0,...,0.0,0.0,0.0,0.0,0.0,1.0,118.0,1069.0,0.110384,1916.0
3,0,3,19.0,single,004030007,3,0040321,4.0,0040321,4.0,...,0.0,0.0,0.0,0.0,0.0,3.0,118.0,1069.0,0.110384,1916.0
4,0,3,20.0,single,004040006,3,0040400,6.0,0040400,6.0,...,1.0,1.0,0.0,1.0,1.0,2.0,198.0,1315.0,0.150570,1896.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9343,0,3,20.0,single,321100006,3,3211000,6.0,,,...,1.0,1.0,1.0,1.0,0.0,3.0,66.0,749.0,0.088117,1326.0
9344,0,3,23.0,single,321110003,3,3211100,3.0,,,...,1.0,1.0,1.0,1.0,0.0,3.0,66.0,749.0,0.088117,1326.0
9345,1,3,20.0,single,321110005,3,3211100,5.0,,,...,1.0,1.0,1.0,1.0,0.0,3.0,66.0,749.0,0.088117,1326.0
9346,0,3,24.0,married,321150006,3,3211500,6.0,,,...,1.0,1.0,1.0,1.0,0.0,3.0,66.0,749.0,0.088117,1326.0


In [46]:
data.hh_age = data.hh_age.astype("float")
data.hh_sex = data.hh_sex.astype("str").str.extract(r'(\d+)')[0].fillna("1").apply(lambda x: 1 if x == "1" else 0).astype("float")
data["parent_Working"] = pd.get_dummies(data.parent_tk01)["Working"]

In [47]:
data.to_stata("dataraw_thesis_redi.dta")