### TWFE

In [16]:
import pandas as pd
from linearmodels.panel import PanelOLS
file_path = 'data/medicaid.csv'
data = pd.read_csv(file_path, encoding='latin1')
display(data.head())
data['PostEvent'] = (data['year'] >= data['AdoptedYear']).astype(int)
data = data.set_index(["County", "year"])
data = data.dropna(subset=['uninsured', 'median_income', 'unemployment', 'PostEvent'])
formula = "uninsured ~ PostEvent + median_income + unemployment + EntityEffects + TimeEffects"
mod = PanelOLS.from_formula(formula, data=data)
results = mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)
print(results.summary)

Unnamed: 0,State,id,unemployment,median_income,uninsured,year,County,MedicaidStatus,AdoptedYear,SID
0,Alabama,0500000US01003,5.6,50900,12.0,2011,Baldwin County,Not Adopted,0,1
1,Alabama,0500000US01015,7.5,39037,15.6,2011,Calhoun County,Not Adopted,0,2
2,Alabama,0500000US01043,5.5,40054,12.6,2011,Cullman County,Not Adopted,0,3
3,Alabama,0500000US01049,7.1,36541,19.6,2011,DeKalb County,Not Adopted,0,4
4,Alabama,0500000US01051,6.5,57405,10.6,2011,Elmore County,Not Adopted,0,5


                          PanelOLS Estimation Summary                           
Dep. Variable:              uninsured   R-squared:                        0.1090
Estimator:                   PanelOLS   R-squared (Between):             -0.7782
No. Observations:                7341   R-squared (Within):               0.2476
Date:                Thu, Nov 21 2024   R-squared (Overall):             -0.7234
Time:                        20:09:17   Log-likelihood                -1.622e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      271.92
Entities:                         660   P-value                           0.0000
Avg Obs:                       11.123   Distribution:                  F(3,6670)
Min Obs:                       1.0000                                           
Max Obs:                       81.000   F-statistic (robust):             23.104
                            

In [32]:
import pandas as pd

# Load the CSV file
file_path = 'data/medicaid.csv'
data = pd.read_csv(file_path, encoding='latin1')

# Handle missing AdoptedYear
data['AdoptedYear'] = data['AdoptedYear'].fillna(0)  # Treat missing AdoptedYear as no adoption (can change if you have specific logic)

# Calculate time_to_event (only for counties that adopted the treatment)
data['time_to_event'] = data.apply(lambda row: row['year'] - row['AdoptedYear'] if row['AdoptedYear'] != 0 else None, axis=1)

# Define max lags and leads
max_lag = 4
max_lead = 4

# Create Lag and Lead variables, only for rows where time_to_event is not null
for i in range(-max_lag, max_lead + 1):
    if i < 0:
        data[f'Lag {-i}'] = data.apply(lambda row: 1 if row['time_to_event'] == i else 0 if row['time_to_event'] is not None else None, axis=1)
    elif i > 0:
        data[f'Lead {i}'] = data.apply(lambda row: 1 if row['time_to_event'] == i else 0 if row['time_to_event'] is not None else None, axis=1)

# Create PostEvent variable: 1 if time_to_event >= 0, else 0
data['PostEvent'] = data.apply(lambda row: 1 if row['time_to_event'] >= 0 else 0 if row['time_to_event'] is not None else None, axis=1)

# Select relevant columns for output
columns_to_include = ['uninsured', 'County', 'year', 'AdoptedYear', 'PostEvent', 'time_to_event'] + \
                     [f'Lag {i}' for i in range(1, max_lag + 1)] + [f'Lead {i}' for i in range(1, max_lead + 1)]

# Filter and organize the data
data_final = data[columns_to_include]

# Display the final dataset (or save it if needed)

data_final.head(100)


Unnamed: 0,uninsured,County,year,AdoptedYear,PostEvent,time_to_event,Lag 1,Lag 2,Lag 3,Lag 4,Lead 1,Lead 2,Lead 3,Lead 4
0,12.0,Baldwin County,2011,0,0,,0,0,0,0,0,0,0,0
1,15.6,Calhoun County,2011,0,0,,0,0,0,0,0,0,0,0
2,12.6,Cullman County,2011,0,0,,0,0,0,0,0,0,0,0
3,19.6,DeKalb County,2011,0,0,,0,0,0,0,0,0,0,0
4,10.6,Elmore County,2011,0,0,,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,10.1,Madison County,2015,0,0,,0,0,0,0,0,0,0,0
96,12.0,Marshall County,2015,0,0,,0,0,0,0,0,0,0,0
97,11.5,Mobile County,2015,0,0,,0,0,0,0,0,0,0,0
98,11.9,Montgomery County,2015,0,0,,0,0,0,0,0,0,0,0


In [33]:
import pandas as pd
from linearmodels.panel import PanelOLS

# Load the processed data
file_path = '/mnt/data/event_study_formatted_data.csv'  # Replace with the actual formatted dataset path
data = data_final

# Set the panel data indices (County and year)
data = data.set_index(["County", "year"])

# Define the formula for the event study model using TWFE with lag/lead dummies
# Replace 'uninsured' with the outcome variable of interest
lags = " + ".join([f"`Lag {i}`" for i in range(1, 5)])  # Include Lag 1 to Lag 4
leads = " + ".join([f"`Lead {i}`" for i in range(1, 5)])  # Include Lead 1 to Lead 4
formula = f"uninsured ~ {lags} + {leads} + EntityEffects + TimeEffects"

# Fit the model using PanelOLS
mod = PanelOLS.from_formula(formula, data)

# Estimate the model with clustered standard errors by entity (County)
results = mod.fit(cov_type='clustered', cluster_entity=True)

# Display the regression results
print(results.summary)


                          PanelOLS Estimation Summary                           
Dep. Variable:              uninsured   R-squared:                        0.0514
Estimator:                   PanelOLS   R-squared (Between):             -0.0900
No. Observations:                7380   R-squared (Within):               0.1439
Date:                Thu, Nov 21 2024   R-squared (Overall):             -0.0749
Time:                        20:18:17   Log-likelihood                -1.656e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      45.388
Entities:                         660   P-value                           0.0000
Avg Obs:                       11.182   Distribution:                  F(8,6704)
Min Obs:                       1.0000                                           
Max Obs:                       81.000   F-statistic (robust):             19.441
                            

In [28]:
### 

(7380, 13)

In [38]:
import rdata
import pandas as pd

# Read the RDA file
data = rdata.read_rda('data/divorce.rda')

# Convert the R dataframe to a pandas DataFrame
df = pd.DataFrame(data['divorce'])

# Display the DataFrame
display(df.head())




Unnamed: 0,stfips,year,post,asmrs,pcinc,asmrh,cases
1,1.0,1964.0,0.0,35.639885,12406.178537,5.007341,0.012312
2,1.0,1965.0,0.0,41.543755,13070.206738,4.425367,0.010419
3,1.0,1966.0,0.0,34.252335,13526.663217,4.874819,0.0099
4,1.0,1967.0,0.0,34.465023,13918.189823,5.362014,0.009975
5,1.0,1968.0,0.0,40.440105,14684.808682,4.643759,0.012401
