# Simulate employment and wages for Sakernas
Using IFLS data, I simulate "employ" and "wage" for Sakernas observations

In [1]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (11, 5)  #set default figure size
import numpy as np
from numpy import exp
from scipy.special import factorial
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
from statsmodels.api import Poisson
from scipy.stats import norm
from statsmodels.iolib.summary2 import summary_col

In [57]:
# Load Datasets
ifls = pd.read_csv('/Users/shafiranw/Documents/PhD/Dissertation/code_data/data/ifls_match_032325.csv')
sakernas = pd.read_csv('/Users/shafiranw/Documents/PhD/Dissertation/code_data/data/sakernas_match_032325.csv')

# 1. Simulate "ln_wage"

In [58]:
# Only use observations with positive wages
ifls_filtered = ifls[ifls['wage'] > 0].copy()

# Define independent variables for IFLS
x_ifls = ifls_filtered[['age_before', 'age2_before', 'male', 'married', 'educ', 'hhsize', 'village', 'disability']]
x_ifls = sm.add_constant(x_ifls)
y_ifls = ifls_filtered['ln_wage']

# Fit the IFLS regression model
ols_wage = sm.OLS(y_ifls, x_ifls, missing="drop").fit()
betas_wage = ols_wage.params

# Results
print(ols_wage.summary())


                            OLS Regression Results                            
Dep. Variable:                ln_wage   R-squared:                       0.153
Model:                            OLS   Adj. R-squared:                  0.152
Method:                 Least Squares   F-statistic:                     195.8
Date:                Mon, 24 Mar 2025   Prob (F-statistic):          8.49e-306
Time:                        13:25:09   Log-Likelihood:                -12215.
No. Observations:                8702   AIC:                         2.445e+04
Df Residuals:                    8693   BIC:                         2.451e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          12.2122      0.111    109.919      

In [59]:
# Only apply to people who were in prakerja
sakernas_filtered = sakernas.dropna(subset=['age_before'])

# Define same independent variables for Sakernas
x_sakernas = sakernas_filtered[['age_before', 'age2_before', 'male', 'married', 'educ', 'hhsize', 'village', 'disability']]
x_sakernas = sm.add_constant(x_sakernas)[betas_wage.index]

# Simulate log wage for Sakernas using the betas from the IFLS regression
sakernas_filtered['ln_wage_before'] = x_sakernas.dot(betas_wage)

# View the simulated wages
sakernas_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sakernas_filtered['ln_wage_before'] = x_sakernas.dot(betas_wage)


Unnamed: 0,id,age,age2,male,married,educ,hhsize,village,disability,employ,...,pk_inc_daily,pk_inc_capital,pk_inc_debt,pk_inc_transport,pk_inc_internet,pk_inc_train,pk_inc_other,age_before,age2_before,ln_wage_before
40,41,55,3025,1,1,12,5,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,2809.0,14.121347
161,162,52,2704,0,1,12,3,0,0,0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,49.0,2401.0,13.780685
168,169,22,484,0,1,12,4,0,0,1,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,20.0,400.0,13.572028
344,345,44,1936,1,1,12,5,1,0,1,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,41.0,1681.0,14.122311
464,465,34,1156,0,1,16,4,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,900.0,13.966224


# 2. Adjust for inflation

In [60]:
# Define CPIs
cpi_values = {
    2014: 124.4,
    2019: 151.2,
    2020: 154.1,
    2021: 156.5,
    2022: 163.1
}

# Map pk_accept_year to the corresponding year before program acceptance
year_before = {
    1: 2019,  # Accepted in 2020 → Adjust to 2019
    2: 2020,  # Accepted in 2021 → Adjust to 2020
    3: 2021,  # Accepted in 2022 → Adjust to 2021
    4: 2022,  # Accepted in 2023 → Adjust to 2022
}

# Apply inflation adjustment dynamically
def adjust_for_inflation(row):
    target_year = year_before.get(row['pk_accept_year'])  # Default to 2019 if missing
    inflation_factor = cpi_values[target_year] / cpi_values[2014]  # Adjust from 2014 to target year
    return row['ln_wage_before'] + np.log(inflation_factor)

# Apply function to simulated wages
sakernas_filtered['ln_wage_before'] = sakernas_filtered.apply(adjust_for_inflation, axis=1)

# View
sakernas_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sakernas_filtered['ln_wage_before'] = sakernas_filtered.apply(adjust_for_inflation, axis=1)


Unnamed: 0,id,age,age2,male,married,educ,hhsize,village,disability,employ,...,pk_inc_daily,pk_inc_capital,pk_inc_debt,pk_inc_transport,pk_inc_internet,pk_inc_train,pk_inc_other,age_before,age2_before,ln_wage_before
40,41,55,3025,1,1,12,5,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,2809.0,14.392208
161,162,52,2704,0,1,12,3,0,0,0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,49.0,2401.0,14.010239
168,169,22,484,0,1,12,4,0,0,1,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,20.0,400.0,13.84289
344,345,44,1936,1,1,12,5,1,0,1,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,41.0,1681.0,14.351865
464,465,34,1156,0,1,16,4,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,900.0,14.180324


# 3. Revert back to wage and merge with Sakernas

In [61]:
# Revert back to wage
sakernas_filtered['wage_before'] = np.exp(sakernas_filtered['ln_wage_before'])

# View
sakernas_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sakernas_filtered['wage_before'] = np.exp(sakernas_filtered['ln_wage_before'])


Unnamed: 0,id,age,age2,male,married,educ,hhsize,village,disability,employ,...,pk_inc_capital,pk_inc_debt,pk_inc_transport,pk_inc_internet,pk_inc_train,pk_inc_other,age_before,age2_before,ln_wage_before,wage_before
40,41,55,3025,1,1,12,5,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,53.0,2809.0,14.392208,1780149.0
161,162,52,2704,0,1,12,3,0,0,0,...,1.0,0.0,1.0,1.0,1.0,0.0,49.0,2401.0,14.010239,1214981.0
168,169,22,484,0,1,12,4,0,0,1,...,1.0,0.0,1.0,1.0,1.0,0.0,20.0,400.0,13.84289,1027757.0
344,345,44,1936,1,1,12,5,1,0,1,...,1.0,0.0,1.0,1.0,1.0,0.0,41.0,1681.0,14.351865,1709763.0
464,465,34,1156,0,1,16,4,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,30.0,900.0,14.180324,1440245.0


In [62]:
# Merge back with original Sakernas dataset
sakernas = sakernas.merge(
    sakernas_filtered[['id', 'ln_wage_before', 'wage_before']],  # Keep only necessary columns
    on='id',  # Merge on unique identifier
    how='left'  # Keep all original sakernas rows
)

# Check if merge was successful
print(sakernas[['id', 'ln_wage_before', 'wage_before']].head())

   id  ln_wage_before  wage_before
0   1             NaN          NaN
1   2             NaN          NaN
2   3             NaN          NaN
3   4             NaN          NaN
4   5             NaN          NaN


In [63]:
# Save dataset
sakernas.to_csv('sakernas_wagesim_032425.csv', index=False)