In [3]:
import pandas as pd
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
from statsmodels.formula.api import ols
from linearmodels.panel import PanelOLS


file_path = '/Users/rebeccluo/Downloads/US_Paid_leave_analysis.dta'

# Load the file into a DataFrame
df = pd.read_stata(file_path)
print(df.columns.tolist())

['ssuid', 'spanel', 'swave', 'srefmon', 'rhcalmn', 'rhcalyr', 'tfipsst', 'epppnum', 'esex', 'wpfinwgt', 'tage', 'eeducate', 'rmesr', 'birth_month', 'sippid', 'months', 'date', 'birth', 'birth_seen_f', 'birth_seen', 'ref_month_ns', 'ref_month', 'state', 'end_date', 'end_weight_f', 'end_weight', 'CA_date', 'NJ_date', 'post_policy', 'rm_lfp', 'working', 'looking', 'lt_college_f', 'lt_college', 'Birth', '_IBirth_2', '_IBirth_3', '_IBirth_4', '_IBirth_5', '_IBirth_6', '_IBirth_7', '_IBirth_8', '_IBirth_9', '_IBirth_10', '_IBirth_11', '_IBirth_12', '_IBirth_13', '_IBirth_14', '_IBirth_15', '_IBirth_16', '_IBirth_17', '_IBirth_18', '_IBirth_19', '_IBirth_20', '_IBirth_21', '_IBirth_22', '_IBirth_23', '_IBirth_24', '_IBirth_25', '_IBirth_26', '_IBirth_27', '_IBirth_28', '_IBirth_29', '_IBirth_30', '_IBirth_31', '_IBirth_32', '_IBirth_33', '_IBirth_34', '_IBirth_35', '_IBirth_36', '_IBirth_37', '_IBirth_38', '_IBirth_39', '_IBirth_40', '_IBirth_41', '_IBirth_42', '_IBirth_43', '_IBirth_44', '_I

In [4]:
unique_data = df.drop_duplicates(subset='sippid').dropna(subset=['rm_lfp'])
print(unique_data.shape)

(2816, 231)


In [5]:
birth_vars = [f'_IBirth_{i}' for i in range(2, 52)]  # Birth dummies from _IBirth_2 to _IBirth_51
birxpos_vars = [f'_IBirXpos_{i}_1' for i in range(2, 52)]  # Event-study dummies _IBirXpos_2_1 to _IBirXpos_50_1
llbirth_vars = [f'_LlBirth_{2}_1'] + [f'_LlBirth_{i}_1' for i in range(8, 51)]   # Reference period dummies from _LlBirth_8 to _LlBirth_50
llbipos_vars = [f'_LlBiXpos_{2}_1'] + [f'_LlBiXpos_{i}_1' for i in range(8, 51)]  # Event-study reference period _LlBiXpos_8_1 to _LlBiXpos_50_1, what we are interested in


In [68]:
import warnings

# Suppress all warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    year_dummies = pd.get_dummies(unique_data['rhcalyr'], drop_first=True)  # Time dummy for year
    birth_dummies = pd.get_dummies(unique_data[birth_vars], drop_first=False)  # Dummies for birth_vars
    state_dummies = pd.get_dummies(unique_data['state'], drop_first=True)  # Dummies for states (assuming 'state' column)
    sippid_dummies = pd.get_dummies(unique_data['sippid'], prefix='sippid', drop_first=True)
    month_dummies = pd.get_dummies(unique_data['ref_month'], prefix='month', drop_first=True)
    edu_dummies = pd.get_dummies(unique_data['lt_college'], prefix='college', drop_first=True)
    age_group_dummies = pd.get_dummies(unique_data['age_group'],prefix='age', drop_first=True)  # Dummies for age groups
    recession_dummies = pd.get_dummies(unique_data['recession_birth'],prefix='recession', drop_first=True)  # Dummies for recession years

    
    age_recession_interactions = pd.DataFrame(index=unique_data.index)
    for age_col in age_group_dummies.columns:
        for recession_col in recession_dummies.columns:
            interaction_name = f'{age_col}_recession_{recession_col}'
            age_recession_interactions[interaction_name] = age_group_dummies[age_col] * recession_dummies[recession_col]



    # Step 2: Create interaction terms between birth_vars and time (rhcalyr)
    birth_time_interactions = pd.DataFrame(index=unique_data.index)
    for birth_col in birth_dummies.columns:
        for year_col in year_dummies.columns:
            interaction_name = f'{birth_col}_time_{year_col}'
            birth_time_interactions[interaction_name] = birth_dummies[birth_col] * year_dummies[year_col]

    # Step 3: Create interaction terms between birth_vars and state
    birth_state_interactions = pd.DataFrame(index=unique_data.index)
    for birth_col in birth_dummies.columns:
        for state_col in state_dummies.columns:
            interaction_name = f'{birth_col}_state_{state_col}'
            birth_state_interactions[interaction_name] = birth_dummies[birth_col] * state_dummies[state_col]
            
    state_edu_interactions = pd.DataFrame(index=unique_data.index)
    for state_col in state_dummies.columns:
        for edu_col in edu_dummies.columns:
            interaction_name = f'{state_col}_edu_{edu_col}'  # Name the interaction term
            state_edu_interactions[interaction_name] = state_dummies[state_col] * edu_dummies[edu_col]

    state_year_interactions = pd.DataFrame(index=unique_data.index)
    for state_col in state_dummies.columns:
        for year_col in year_dummies.columns:
            interaction_name = f'{state_col}_time_{year_col}'
            state_year_interactions[interaction_name] = state_dummies[state_col] * year_dummies[year_col]

X = pd.concat([year_dummies,state_dummies,age_group_dummies,
    recession_dummies,age_recession_interactions,month_dummies,state_year_interactions,edu_dummies], axis=1)

#no missing values
X = X.fillna(0)
X.columns = X.columns.astype(str)

#complete feature matrix ready for analysis or modeling
print(X.shape)

(2816, 95)


In [69]:

X = X.reset_index(drop=True)

#Create a DataFrame for the outcome and treatment variables
outcome_treatment_df = pd.DataFrame({
    'rm_lfp': unique_data['rm_lfp'].values,           # Outcome variable
    'post_policy': unique_data['post_policy'].values   # Treatment variable
})

#Concatenate the feature matrix 
if isinstance(X, list):
    X = pd.DataFrame(X)

propensity_data = pd.concat([X, outcome_treatment_df], axis=1)
outcome_treatment_df = outcome_treatment_df.reset_index(drop=True)


print("Shape of the propensity data DataFrame:", propensity_data.shape)
print(propensity_data.head())  


Shape of the propensity data DataFrame: (2816, 97)
    1996   1997   1998   1999   2000   2001   2002   2003   2004   2005  ...  \
0  False  False   True  False  False  False  False  False  False  False  ...   
1  False  False   True  False  False  False  False  False  False  False  ...   
2  False  False  False   True  False  False  False  False  False  False  ...   
3   True  False  False  False  False  False  False  False  False  False  ...   
4  False   True  False  False  False  False  False  False  False  False  ...   

   Texas_time_2006  Texas_time_2007  Texas_time_2008  Texas_time_2009  \
0            False            False            False            False   
1            False            False            False            False   
2            False            False            False            False   
3            False            False            False            False   
4            False            False            False            False   

   Texas_time_2010  Texas_tim

In [72]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def inverse_probability_weighting(outcome_treatment_df, X, outcome_variable='rm_lfp', treatment_variable='post_policy'):
   

    treated = outcome_treatment_df[treatment_variable].values
    

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Estimate propensity scores using logistic regression
    log_reg = LogisticRegression(
        solver='liblinear',  # Good for small datasets
        class_weight='balanced',  # Handles imbalanced treatment groups
        random_state=42  # For reproducibility
    )
    log_reg.fit(X_scaled, treated)
    
    # Compute propensity scores 
    propensity_scores = log_reg.predict_proba(X_scaled)[:, 1]
    
    # Calculate Stabilized Weights
    # Compute inverse probability weights
    treated_weight = treated / propensity_scores
    control_weight = (1 - treated) / (1 - propensity_scores)
    
    # Stabilize weights
    p_treatment = np.mean(treated)
    stabilized_weights = np.where(
        treated == 1, 
        treated_weight / np.mean(treated_weight),  # Stabilized weights for treated
        control_weight / np.mean(control_weight)   # Stabilized weights for control
    )
    
    # Compute Weighted Outcomes
    outcome = outcome_treatment_df[outcome_variable].values
    
    # Compute weighted mean outcomes
    weighted_treated_mean = np.average(
        outcome[treated == 1], 
        weights=stabilized_weights[treated == 1]
    )
    
    weighted_control_mean = np.average(
        outcome[treated == 0], 
        weights=stabilized_weights[treated == 0]
    )
    
    # Compute ATE
    ate = weighted_treated_mean - weighted_control_mean
    
    # Additional diagnostics
    return {
        'ATE': ate,
        'Weighted Treated Mean': weighted_treated_mean,
        'Weighted Control Mean': weighted_control_mean,
        'Propensity Scores': propensity_scores,
        'Stabilized Weights': stabilized_weights,
        'Treatment Probability': p_treatment
    }


def main():
    
    results = inverse_probability_weighting(
        outcome_treatment_df, 
        X, 
        outcome_variable='rm_lfp', 
        treatment_variable='post_policy'
    )
    
    # Print results
    for key, value in results.items():
        print(f"{key}: {value}")

    # Diagnostics 
    print("\nDiagnostics:")
    print(f"Mean of stabilized weights: {np.mean(results['Stabilized Weights']):.4f}")
    print(f"Std of stabilized weights: {np.std(results['Stabilized Weights']):.4f}")

if __name__ == "__main__":
    main()

ATE: 0.061738022002769344
Weighted Treated Mean: 0.6550119237328784
Weighted Control Mean: 0.5932739017301091
Propensity Scores: [0.00127469 0.00228141 0.0013607  ... 0.99355231 0.99023292 0.00106997]
Stabilized Weights: [0.85435166 0.85521371 0.85442524 ... 6.1904165  6.21116762 0.85417657]
Treatment Probability: 0.15518465638160706

Diagnostics:
Mean of stabilized weights: 2.0000
Std of stabilized weights: 2.9694
