In [83]:
import pandas as pd
import numpy as np
import ast
import re

In [90]:
reprisk_incidents = pd.read_csv(
    'data/wrds_reprisk_incidents_ids_material_short_all.csv')  # Calculation in "CreateDataset" of "ESGmateriality" project
reprisk_incidents = reprisk_incidents[
    ['gvkey', 'isin', 'cusip', 'reprisk_id', 'story_id', 'adopter', 'sic', 'SICS Codified Industry ',
     'Codified SICS Sector ', 'severity', 'reach', 'novelty', 'incident_list', 'incident_date', 'car_5', 'car_1',
     'material_flag']]

In [91]:
# Create Year, YearMonth, YearQuarter columns
reprisk_incidents['incident_date'] = pd.to_datetime(reprisk_incidents['incident_date'])
reprisk_incidents['Year'] = reprisk_incidents['incident_date'].dt.year
reprisk_incidents['YearMonth'] = reprisk_incidents['incident_date'].dt.to_period('M')
reprisk_incidents['YearQuarter'] = reprisk_incidents['incident_date'].dt.to_period('Q')

# 1. Calculate materiality based on rolling window

In [92]:
reprisk_incidents['incident_list'] = reprisk_incidents['incident_list'].apply(ast.literal_eval)
exploded = reprisk_incidents[
    ['SICS Codified Industry ', 'Year', 'YearQuarter', 'YearMonth', 'reach', 'car_1', 'car_5', 'severity',
     'incident_list']] \
    .explode('incident_list') \
    .rename(columns={'incident_list': 'type'})

In [94]:
def pick_types_by_rolling_12m_period_dropna_balanced(
        df,
        industry_col='SICS Codified Industry ',
        ym_col='YearMonth',  # must be Period[M]
        type_col='type',
        metrics=('reach', 'severity', 'car_1', 'car_5'),
        window=12
):
    d = df.copy()
    d[ym_col] = d[ym_col].astype('period[M]')
    d = d[[industry_col, ym_col, type_col, *metrics]].copy()

    # numeric coercion, keep NaNs
    for m in metrics:
        d[m] = pd.to_numeric(d[m], errors='coerce')

    # aggregate to unique (industry, type, month)
    d = (
        d.groupby([industry_col, type_col, ym_col], as_index=False)[list(metrics)]
        .sum(min_count=1)
    )

    # ---- Balanced panel over global YearMonth (for existing (industry,type) pairs only)
    global_idx = pd.period_range(d[ym_col].min(), d[ym_col].max(), freq='M')
    pairs = d[[industry_col, type_col]].drop_duplicates()
    full = (
        pairs.assign(_k=1)
        .merge(pd.DataFrame({ym_col: global_idx, '_k': 1}), on='_k')
        .drop(columns='_k')
        .set_index([industry_col, type_col, ym_col])
    )
    base = d.set_index([industry_col, type_col, ym_col])
    d = full.join(base, how='left').reset_index()

    # rolling 12m sums by (industry, type); NaNs remain NaN
    d = d.sort_values([industry_col, type_col, ym_col])
    for m in metrics:
        d[f'roll12_{m}'] = (
            d.groupby([industry_col, type_col], group_keys=False)[m]
            .transform(lambda s: s.rolling(window, min_periods=1).sum())
        )

    by = [industry_col, ym_col]

    def pick_one(rollcol, take='max'):
        tmp = d[[industry_col, ym_col, type_col, rollcol]].dropna(subset=[rollcol]).copy()
        # tie-breaker: alphabetical type; for min we sort ascending by value, for max we invert via sort order
        asc_val = (take == 'min')
        tmp = tmp.sort_values(by + [rollcol, type_col],
                              ascending=[True, True, asc_val, True])
        picked = tmp.drop_duplicates(subset=by, keep='first')
        metric = rollcol.replace('roll12_', '')
        picked = picked.rename(columns={
            type_col: f'{metric}_type',
            rollcol: f'{metric}_roll12'
        })
        return picked[[industry_col, ym_col, f'{metric}_type', f'{metric}_roll12']]

    reach_out = pick_one('roll12_reach', 'max')
    severity_out = pick_one('roll12_severity', 'max')
    car1_out = pick_one('roll12_car_1', 'min')
    car5_out = pick_one('roll12_car_5', 'min')

    res = (reach_out.merge(severity_out, on=by, how='outer')
           .merge(car1_out, on=by, how='outer')
           .merge(car5_out, on=by, how='outer')
           .sort_values(by)
           .reset_index(drop=True))
    return res

In [95]:
# ---------------- Usage ----------------
result = pick_types_by_rolling_12m_period_dropna_balanced(exploded)
result

Unnamed: 0,SICS Codified Industry,YearMonth,reach_type,reach_roll12,severity_type,severity_roll12,car_1_type,car_1_roll12,car_5_type,car_5_roll12
0,Advertising & Marketing,2007-12,discrimination_in_employment,1.0,discrimination_in_employment,1.0,discrimination_in_employment,-0.006904,discrimination_in_employment,-0.000448
1,Advertising & Marketing,2008-01,discrimination_in_employment,1.0,discrimination_in_employment,1.0,discrimination_in_employment,-0.006904,discrimination_in_employment,-0.000448
2,Advertising & Marketing,2008-02,discrimination_in_employment,1.0,discrimination_in_employment,1.0,discrimination_in_employment,-0.006904,discrimination_in_employment,-0.000448
3,Advertising & Marketing,2008-03,discrimination_in_employment,1.0,discrimination_in_employment,1.0,discrimination_in_employment,-0.006904,discrimination_in_employment,-0.000448
4,Advertising & Marketing,2008-04,discrimination_in_employment,1.0,discrimination_in_employment,1.0,discrimination_in_employment,-0.006904,discrimination_in_employment,-0.000448
...,...,...,...,...,...,...,...,...,...,...
13044,Water Utilities & Services,2023-08,local_pollution,96.0,impacts_on_communities,67.0,impacts_on_communities,-0.047760,impacts_on_communities,-0.110693
13045,Water Utilities & Services,2023-09,local_pollution,98.0,impacts_on_communities,66.0,discrimination_in_employment,-0.011628,occupational_health,-0.109424
13046,Water Utilities & Services,2023-10,violation_of_natl_legislation,114.0,violation_of_natl_legislation,77.0,discrimination_in_employment,-0.011628,occupational_health,-0.109424
13047,Water Utilities & Services,2023-11,violation_of_natl_legislation,117.0,violation_of_natl_legislation,78.0,discrimination_in_employment,-0.011628,occupational_health,-0.109424


In [96]:
# Merge back to original dataframe
final = reprisk_incidents.merge(
    result,
    on=['SICS Codified Industry ', 'YearMonth'],
    how='left'
)
final

Unnamed: 0,gvkey,isin,cusip,reprisk_id,story_id,adopter,sic,SICS Codified Industry,Codified SICS Sector,severity,...,YearMonth,YearQuarter,reach_type,reach_roll12,severity_type,severity_roll12,car_1_type,car_1_roll12,car_5_type,car_5_roll12
0,210418.0,US0003752047,000375204,2,1251.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,1.0,...,2007-06,2007Q2,violation_of_natl_legislation,28.0,violation_of_natl_legislation,18.0,impacts_on_communities,0.009035,impacts_on_communities,0.016264
1,210418.0,US0003752047,000375204,2,1305.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,2007-07,2007Q3,violation_of_natl_legislation,28.0,violation_of_natl_legislation,18.0,impacts_on_communities,0.009035,impacts_on_communities,0.016264
2,210418.0,US0003752047,000375204,2,4029.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,2008-04,2008Q2,impacts_on_landscapes,43.0,human_rights_abuses,41.0,climate_ghg_pollution,-0.129756,impacts_on_communities,-0.282756
3,210418.0,US0003752047,000375204,2,4488.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,2008-05,2008Q2,human_rights_abuses,47.0,human_rights_abuses,54.0,climate_ghg_pollution,-0.129756,supply_chain_issues,-0.250865
4,210418.0,US0003752047,000375204,2,4756.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,1.0,...,2007-01,2007Q1,violation_of_natl_legislation,21.0,violation_of_natl_legislation,11.0,impacts_on_communities,-0.004342,impacts_on_communities,-0.007518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331365,355240.0,THB131010001,,2683218,,1,5900.0,Coal Operations,Extractives & Minerals Processing,0.0,...,NaT,NaT,,,,,,,,
331366,349631.0,PK0126301016,,2684126,,1,3411.0,Containers & Packaging,Resource Transformation,0.0,...,NaT,NaT,,,,,,,,
331367,350690.0,GB00BLNNFY18,,2685085,,1,6726.0,Household & Personal Products,Consumer Goods,0.0,...,NaT,NaT,,,,,,,,
331368,350726.0,AU0000180200,,2685460,,1,1400.0,Chemicals,Resource Transformation,0.0,...,NaT,NaT,,,,,,,,


In [97]:
def _in_list(t, lst):
    if pd.isna(t):
        return 0
    return int(str(t) in lst)


# --- 3) create materiality flags
final['materiality_reach'] = [_in_list(t, L) for t, L in zip(final['reach_type'], final['incident_list'])]
final['materiality_severity'] = [_in_list(t, L) for t, L in zip(final['severity_type'], final['incident_list'])]
final['materiality_car_1'] = [_in_list(t, L) for t, L in zip(final['car_1_type'], final['incident_list'])]
final['materiality_car_5'] = [_in_list(t, L) for t, L in zip(final['car_5_type'], final['incident_list'])]

In [98]:
final

Unnamed: 0,gvkey,isin,cusip,reprisk_id,story_id,adopter,sic,SICS Codified Industry,Codified SICS Sector,severity,...,severity_type,severity_roll12,car_1_type,car_1_roll12,car_5_type,car_5_roll12,materiality_reach,materiality_severity,materiality_car_1,materiality_car_5
0,210418.0,US0003752047,000375204,2,1251.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,1.0,...,violation_of_natl_legislation,18.0,impacts_on_communities,0.009035,impacts_on_communities,0.016264,0,0,1,1
1,210418.0,US0003752047,000375204,2,1305.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,violation_of_natl_legislation,18.0,impacts_on_communities,0.009035,impacts_on_communities,0.016264,0,0,0,0
2,210418.0,US0003752047,000375204,2,4029.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,human_rights_abuses,41.0,climate_ghg_pollution,-0.129756,impacts_on_communities,-0.282756,0,1,0,1
3,210418.0,US0003752047,000375204,2,4488.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,human_rights_abuses,54.0,climate_ghg_pollution,-0.129756,supply_chain_issues,-0.250865,1,1,0,0
4,210418.0,US0003752047,000375204,2,4756.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,1.0,...,violation_of_natl_legislation,11.0,impacts_on_communities,-0.004342,impacts_on_communities,-0.007518,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331365,355240.0,THB131010001,,2683218,,1,5900.0,Coal Operations,Extractives & Minerals Processing,0.0,...,,,,,,,0,0,0,0
331366,349631.0,PK0126301016,,2684126,,1,3411.0,Containers & Packaging,Resource Transformation,0.0,...,,,,,,,0,0,0,0
331367,350690.0,GB00BLNNFY18,,2685085,,1,6726.0,Household & Personal Products,Consumer Goods,0.0,...,,,,,,,0,0,0,0
331368,350726.0,AU0000180200,,2685460,,1,1400.0,Chemicals,Resource Transformation,0.0,...,,,,,,,0,0,0,0


In [99]:
final.to_csv('data/incidents_rolling_materiality.csv', index=False)

## 1.2 Add industry incident counts

In [115]:
df = final.copy()

# Coerce flags to numeric and treat missing as 0
flag_cols = [
    'material_flag',
    'materiality_car_1', 'materiality_car_5', 'materiality_reach', 'materiality_severity'
]
df[flag_cols] = df[flag_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

key = ["gvkey", "YearMonth"]

# 1) Monthly counts per firm
agg_fm = (
    df.groupby(key, dropna=False)
    .agg(
        n_material=('material_flag', lambda s: (s == 1).sum()),
        n_nonmaterial=('material_flag', lambda s: (s == 0).sum()),
        n_car_1_material=('materiality_car_1', lambda s: (s == 1).sum()),
        n_car_5_material=('materiality_car_5', lambda s: (s == 1).sum()),
        n_reach_material=('materiality_reach', lambda s: (s == 1).sum()),
        n_severity_material=('materiality_severity', lambda s: (s == 1).sum()),
    )
    .reset_index()
)

# 2) Balanced universe: all firms × all months
firms  = df['gvkey'].dropna().unique()
mmin   = df['YearMonth'].min()
mmax   = df['YearMonth'].max()
months = pd.period_range(mmin, mmax, freq='M')

balanced_idx      = pd.MultiIndex.from_product([firms, months], names=key)
balanced_universe = pd.DataFrame(index=balanced_idx).reset_index()

# 3) Merge and fill gaps with 0, sort
reprisk = (
    balanced_universe
    .merge(agg_fm, on=key, how='left')
    .fillna(0)
    .sort_values(key)
    .reset_index(drop=True)
)

count_cols = [
    'n_material', 'n_nonmaterial', 'n_car_1_material', 'n_car_5_material',
    'n_reach_material', 'n_severity_material'
]

# Ensure monthly counts are int64
reprisk[count_cols] = reprisk[count_cols].astype('int64')

# 4) 24-month rolling sums per firm (includes current month)
roll_24m = (
    reprisk.groupby('gvkey', group_keys=False)[count_cols]
    .rolling(window=24, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
    .astype('int64')
).add_suffix('_24m')

# 5) Keep keys + monthly counts + rolling sums
reprisk_out = pd.concat([reprisk[key + count_cols], roll_24m], axis=1)

reprisk_out

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,n_car_1_material_24m,n_car_5_material_24m,n_reach_material_24m,n_severity_material_24m
0,1004.0,2007-01,0,0,0,0,0,0,0,0,0,0,0,0
1,1004.0,2007-02,0,0,0,0,0,0,0,0,0,0,0,0
2,1004.0,2007-03,0,0,0,0,0,0,0,0,0,0,0,0
3,1004.0,2007-04,0,0,0,0,0,0,0,0,0,0,0,0
4,1004.0,2007-05,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10640431,367496.0,2023-08,0,0,0,0,0,0,0,0,0,0,0,0
10640432,367496.0,2023-09,0,0,0,0,0,0,0,0,0,0,0,0
10640433,367496.0,2023-10,0,0,0,0,0,0,0,0,0,0,0,0
10640434,367496.0,2023-11,0,0,0,0,0,0,0,0,0,0,0,0


In [116]:
reprisk_merged = reprisk_out.merge(
    reprisk_incidents[['gvkey', 'cusip', 'SICS Codified Industry ', 'Codified SICS Sector ']].drop_duplicates(),
    on='gvkey', how='left')
reprisk_merged

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,n_car_1_material_24m,n_car_5_material_24m,n_reach_material_24m,n_severity_material_24m,cusip,SICS Codified Industry,Codified SICS Sector
0,1004.0,2007-01,0,0,0,0,0,0,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,Resource Transformation
1,1004.0,2007-02,0,0,0,0,0,0,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,Resource Transformation
2,1004.0,2007-03,0,0,0,0,0,0,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,Resource Transformation
3,1004.0,2007-04,0,0,0,0,0,0,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,Resource Transformation
4,1004.0,2007-05,0,0,0,0,0,0,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,Resource Transformation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10643287,367496.0,2023-08,0,0,0,0,0,0,0,0,0,0,0,0,,Real Estate,Infrastructure
10643288,367496.0,2023-09,0,0,0,0,0,0,0,0,0,0,0,0,,Real Estate,Infrastructure
10643289,367496.0,2023-10,0,0,0,0,0,0,0,0,0,0,0,0,,Real Estate,Infrastructure
10643290,367496.0,2023-11,0,0,0,0,0,0,0,0,0,0,0,0,,Real Estate,Infrastructure


In [117]:
reprisk_merged['year'] = reprisk_merged['YearMonth'].dt.year
agg_cols = [
    'n_material_24m',
    'n_nonmaterial_24m',
    'n_car_1_material_24m',
    'n_car_5_material_24m',
    'n_reach_material_24m',
    'n_severity_material_24m',
]

industry_col = 'SICS Codified Industry '  # use your exact column name
ym_col = 'YearMonth'  # can be string or Period[M]

# 1) Industry-level monthly counts (sum across firms)
industry_counts = (
    reprisk_merged.groupby([industry_col, ym_col], as_index=False)[agg_cols]
    .sum(min_count=1)  # keeps NaN if an entire group/col is NaN
    .rename(columns={c: f'industry_{c}' for c in agg_cols})
)

# 2) (Optional) merge back so each firm-month has the industry counts attached
df_with_industry = reprisk_merged.merge(industry_counts, on=[industry_col, ym_col], how='left')
df_with_industry.dropna(subset=['cusip'], inplace=True)
df_with_industry

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,cusip,SICS Codified Industry,Codified SICS Sector,year,industry_n_material_24m,industry_n_nonmaterial_24m,industry_n_car_1_material_24m,industry_n_car_5_material_24m,industry_n_reach_material_24m,industry_n_severity_material_24m
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,000361105,Industrial Machinery & Goods,Resource Transformation,2007,1,8,0,0,5,5
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,000361105,Industrial Machinery & Goods,Resource Transformation,2007,1,19,2,2,12,12
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,000361105,Industrial Machinery & Goods,Resource Transformation,2007,2,26,3,3,18,18
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,000361105,Industrial Machinery & Goods,Resource Transformation,2007,3,38,3,3,25,25
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,000361105,Industrial Machinery & Goods,Resource Transformation,2007,3,50,4,4,33,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10543939,356128.0,2023-08,0,0,0,0,0,0,0,0,...,48581R205,Consumer Finance,Financials,2023,230,36,49,49,164,164
10543940,356128.0,2023-09,0,0,0,0,0,0,0,0,...,48581R205,Consumer Finance,Financials,2023,235,36,48,47,168,168
10543941,356128.0,2023-10,0,0,0,0,0,0,0,0,...,48581R205,Consumer Finance,Financials,2023,235,35,47,46,171,171
10543942,356128.0,2023-11,0,0,0,0,0,0,0,0,...,48581R205,Consumer Finance,Financials,2023,234,34,48,46,169,169


In [125]:
df_with_industry.to_csv('data/incidents_rolling_industry_all.csv', index=False)

In [124]:
df_with_industry.groupby('YearMonth')[['n_material', 'n_nonmaterial', 'n_material_24m', 'n_nonmaterial_24m']].sum().sort_values('n_material', ascending=False)

Unnamed: 0_level_0,n_material,n_nonmaterial,n_material_24m,n_nonmaterial_24m
YearMonth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01,983,599,15522,8299
2017-10,857,566,14263,8295
2020-09,801,333,15990,8248
2014-05,799,349,11220,5062
2014-11,799,587,13182,6265
...,...,...,...,...
2007-09,60,65,581,349
2007-03,58,38,133,94
2007-04,47,23,180,117
2007-02,39,35,75,56


In [113]:
df_with_industry.groupby(['SICS Codified Industry ', 'YearMonth'])['n_car_1_material_24m'].agg(
        mean='mean',
        q20=lambda s: s.quantile(0.20),
        q40=lambda s: s.quantile(0.40),
        q60=lambda s: s.quantile(0.60),
        q80=lambda s: s.quantile(0.80),
        q90=lambda s: s.quantile(0.90),
        n='count'
    ).reset_index()

Unnamed: 0,SICS Codified Industry,YearMonth,mean,q20,q40,q60,q80,q90,n
0,Advertising & Marketing,2007-01,0.000000,0.0,0.0,0.0,0.0,0.0,16
1,Advertising & Marketing,2007-02,0.000000,0.0,0.0,0.0,0.0,0.0,16
2,Advertising & Marketing,2007-03,0.000000,0.0,0.0,0.0,0.0,0.0,16
3,Advertising & Marketing,2007-04,0.000000,0.0,0.0,0.0,0.0,0.0,16
4,Advertising & Marketing,2007-05,0.000000,0.0,0.0,0.0,0.0,0.0,16
...,...,...,...,...,...,...,...,...,...
13867,Water Utilities & Services,2023-08,2.181818,0.0,0.0,0.0,2.0,2.9,22
13868,Water Utilities & Services,2023-09,2.136364,0.0,0.0,0.0,1.8,2.9,22
13869,Water Utilities & Services,2023-10,2.136364,0.0,0.0,0.0,1.0,2.9,22
13870,Water Utilities & Services,2023-11,1.954545,0.0,0.0,0.0,1.0,2.0,22


In [114]:
df_with_industry.groupby(['SICS Codified Industry ', 'YearMonth'])['industry_n_car_1_material_24m'].agg(
        mean='mean',
        q20=lambda s: s.quantile(0.20),
        q40=lambda s: s.quantile(0.40),
        q60=lambda s: s.quantile(0.60),
        q80=lambda s: s.quantile(0.80),
        q90=lambda s: s.quantile(0.90),
        n='count'
    ).reset_index()

Unnamed: 0,SICS Codified Industry,YearMonth,mean,q20,q40,q60,q80,q90,n
0,Advertising & Marketing,2007-01,0.0,0.0,0.0,0.0,0.0,0.0,16
1,Advertising & Marketing,2007-02,0.0,0.0,0.0,0.0,0.0,0.0,16
2,Advertising & Marketing,2007-03,0.0,0.0,0.0,0.0,0.0,0.0,16
3,Advertising & Marketing,2007-04,0.0,0.0,0.0,0.0,0.0,0.0,16
4,Advertising & Marketing,2007-05,0.0,0.0,0.0,0.0,0.0,0.0,16
...,...,...,...,...,...,...,...,...,...
13867,Water Utilities & Services,2023-08,107.0,107.0,107.0,107.0,107.0,107.0,22
13868,Water Utilities & Services,2023-09,104.0,104.0,104.0,104.0,104.0,104.0,22
13869,Water Utilities & Services,2023-10,100.0,100.0,100.0,100.0,100.0,100.0,22
13870,Water Utilities & Services,2023-11,93.0,93.0,93.0,93.0,93.0,93.0,22


# 2. Merge financials

In [126]:
reprisk_merged_yearly = pd.read_csv('data/incidents_rolling_industry_all.csv')

In [127]:
reprisk_merged_yearly['YearQuarter'] = pd.to_datetime(reprisk_merged_yearly['YearMonth']).dt.to_period('Q')
reprisk_merged_yearly['YearQuarter_prior'] = reprisk_merged_yearly['YearQuarter'] - 1
reprisk_merged_yearly['YearQuarter_prior'] = reprisk_merged_yearly['YearQuarter_prior'].astype(str)

In [128]:
import wrds
from pathlib import Path
from dotenv import load_dotenv
import os

project_root = Path(r"E:\GermanBusinessPanelTeam\Schrader\Forschung\ESGmateriality")

# ── Load WRDS creds from wrds.env in the project root ─────────────────────────
env_path = project_root / "wrds.env"
load_dotenv(dotenv_path=env_path)
# ── Connect to WRDS ───────────────────────────────────────────────────────────
db = wrds.Connection(
    wrds_username=os.getenv("WRDS_YALE_USERNAME"),
    wrds_password=os.getenv("WRDS_YALE_PASSWORD")
)

Loading library list...
Done


In [129]:
# ---------- tiny helper: keys + single Compustat QUARTERLY pull ----------

def _parse_yearquarter(s):
    """
    Accepts strings like '2022Q3', '2022-Q3', '2022 Q3', Periods, or timestamps.
    Returns a pandas.Period (freq='Q-DEC').
    """
    if pd.isna(s):
        return pd.NaT
    if isinstance(s, pd.Period):
        return s.asfreq('Q-DEC')
    s = str(s).strip().upper().replace('-', '').replace(' ', '')
    if 'Q' in s:
        y, q = s.split('Q', 1)
        return pd.Period(year=int(y), quarter=int(q), freq='Q-DEC')
    try:
        dt = pd.to_datetime(s, errors='raise')
        return dt.to_period('Q-DEC')
    except Exception:
        return pd.NaT

In [130]:
def _keys(df):
    """
    For quarterly pulls:
      - expects columns ['gvkey', 'YearQuarter_prior'] (e.g., '2024Q3')
      - returns distinct gvkey + quarter keys, also provides 'year' and 'quarter'
    """
    out = (df[['gvkey', 'YearQuarter_prior']].dropna()
           .assign(
        gvkey=lambda d: d['gvkey'].astype(str).str.replace(r'\.0$', '', regex=True).str.zfill(6),
        qtr=lambda d: d['YearQuarter_prior'].map(_parse_yearquarter))
           .dropna(subset=['qtr'])
           .drop_duplicates())
    out['year'] = out['qtr'].dt.year.astype(int)
    out['quarter'] = out['qtr'].dt.quarter.astype(int)
    return out[['gvkey', 'qtr', 'year', 'quarter']]

In [131]:
def _pull_compustat_quarterly(db, keys):
    """
    Pull ONLY the variables needed for:
      - Market cap / Size: prccq, cshoq
      - ROA: ibq, atq
      - Leverage: ltq, atq
      - Book-to-Market: seqq, ceqq, pstkq, txditcq

    Returns compq with calendar quarter fields and a normalized join key (gvkey_norm).
    """


    if keys.empty:
        return pd.DataFrame(columns=[
            'gvkey', 'datadate', 'prccq', 'cshoq', 'ibq', 'atq', 'ltq', 'seqq', 'ceqq', 'pstkq', 'txditcq',
            'calyear', 'calqtr', 'calyrqtr', 'gvkey_norm'
        ])

    # window: from (min quarter - 1) to max quarter (safe & small)
    min_q = (keys['qtr'].min() - 1)
    max_q = keys['qtr'].max()
    start_date = min_q.asfreq('Q-DEC').to_timestamp(how='end').normalize()
    end_date = max_q.asfreq('Q-DEC').to_timestamp(how='end').normalize()

    gvkeys = tuple(keys['gvkey'].unique().tolist())  # already normalized in _keys()

    compq = db.raw_sql("""
        SELECT gvkey, datadate,
               prccq, cshoq,           -- market cap
               ibq, atq,               -- ROA
               ltq,                    -- leverage
               seqq, ceqq, pstkq, txditcq  -- book equity pieces
        FROM comp.fundq
        WHERE indfmt='INDL' AND datafmt='STD' AND consol='C'
          AND gvkey IN %(g)s
          AND datadate BETWEEN %(a)s AND %(b)s
    """, params={
        "g": gvkeys,
        "a": pd.Timestamp(start_date),
        "b": pd.Timestamp(end_date)
    }, date_cols=['datadate'])

    if compq.empty:
        return pd.DataFrame(columns=[
            'gvkey', 'datadate', 'prccq', 'cshoq', 'ibq', 'atq', 'ltq', 'seqq', 'ceqq', 'pstkq', 'txditcq',
            'calyear', 'calqtr', 'calyrqtr', 'gvkey_norm'
        ])

    # calendar fields + normalized gvkey (for joining)
    compq['datadate'] = pd.to_datetime(compq['datadate'])
    compq['calyear'] = compq['datadate'].dt.year.astype('Int64')
    compq['calqtr'] = compq['datadate'].dt.quarter.astype('Int64')
    compq['calyrqtr'] = compq['calyear'].astype(str) + 'Q' + compq['calqtr'].astype(str)
    compq['gvkey_norm'] = compq['gvkey'].astype(str).str.replace(r'\.0$', '', regex=True).str.zfill(6)
    return compq

In [132]:
# --- collapse comp.fundq to one row per (gvkey, quarter) and compute ratios ---
def _compute_quarterly_metrics(compq: pd.DataFrame) -> pd.DataFrame:
    if compq.empty:
        return pd.DataFrame(columns=['gvkey_norm', 'calyrqtr', 'size', 'roa_q', 'lev_q', 'bm_q'])

    compq = (compq.sort_values(['gvkey_norm', 'calyrqtr', 'datadate'])
             .drop_duplicates(['gvkey_norm', 'calyrqtr'], keep='last')).copy()

    # to numeric Series
    prccq = pd.to_numeric(compq['prccq'], errors='coerce')
    cshoq = pd.to_numeric(compq['cshoq'], errors='coerce')
    ibq = pd.to_numeric(compq['ibq'], errors='coerce')
    atq = pd.to_numeric(compq['atq'], errors='coerce')
    ltq = pd.to_numeric(compq['ltq'], errors='coerce')
    seqq = pd.to_numeric(compq['seqq'], errors='coerce')
    ceqq = pd.to_numeric(compq['ceqq'], errors='coerce')
    pstkq = pd.to_numeric(compq['pstkq'], errors='coerce')
    txditcq = pd.to_numeric(compq['txditcq'], errors='coerce')

    # numpy views (avoid masked ufunc path)
    prc = prccq.to_numpy(dtype='float64')
    csho = cshoq.to_numpy(dtype='float64')
    ib = ibq.to_numpy(dtype='float64')
    at = atq.to_numpy(dtype='float64')
    lt = ltq.to_numpy(dtype='float64')
    seq = seqq.to_numpy(dtype='float64')
    ceq = ceqq.to_numpy(dtype='float64')
    pstk = pstkq.to_numpy(dtype='float64')
    txd = txditcq.to_numpy(dtype='float64')

    # market cap
    mktcap = prc * csho

    # size = ln(mktcap) only if mktcap > 0
    size = np.where(mktcap > 0, np.log(mktcap), np.nan)

    # ROA and Leverage: safe divisions
    roa_q = np.divide(ib, at, out=np.full_like(at, np.nan), where=(at != 0) & ~np.isnan(at))
    lev_q = np.divide(lt, at, out=np.full_like(at, np.nan), where=(at != 0) & ~np.isnan(at))

    # Book equity: BE = (SEQ if present else CEQ) + TXDITCQ − PSTKQ
    seq_filled = np.where(~np.isnan(seq), seq, ceq)
    be = seq_filled + txd - pstk

    # Book-to-Market: only if mktcap > 0
    bm_q = np.divide(be, mktcap, out=np.full_like(mktcap, np.nan), where=(mktcap > 0) & ~np.isnan(mktcap))

    return pd.DataFrame({
        'gvkey_norm': compq['gvkey_norm'].values,
        'calyrqtr': compq['calyrqtr'].values,
        'size': size,
        'roa_q': roa_q,
        'lev_q': lev_q,
        'bm_q': bm_q
    })

In [133]:
# --- build mapping (gvkey_norm, YearQuarter_prior) -> metrics ---
def _get_quarterly_metrics_map(db, df_quarterly: pd.DataFrame) -> pd.DataFrame:
    # keys with normalized gvkey + Period quarter (used for pull)
    k0 = _keys(df_quarterly)
    if k0.empty:
        return pd.DataFrame(columns=['gvkey_norm', 'YearQuarter_prior', 'size', 'roa_q', 'lev_q', 'bm_q'])

    # pull only once over the window
    compq = _pull_compustat_quarterly(db, k0)

    # compute metrics at (gvkey_norm, calyrqtr)
    metrics = _compute_quarterly_metrics(compq)

    # make left-side merge keys from k0 (string quarter like '2007Q1')
    left_keys = (k0.assign(
        YearQuarter_prior=k0['qtr'].astype(str),
        calyrqtr=k0['qtr'].dt.year.astype(str) + 'Q' + k0['qtr'].dt.quarter.astype(str),
        gvkey_norm=k0['gvkey'])
                 [['gvkey_norm', 'YearQuarter_prior', 'calyrqtr']].drop_duplicates()
                 )

    # map metrics to (gvkey_norm, YearQuarter_prior)
    out = left_keys.merge(metrics, on=['gvkey_norm', 'calyrqtr'], how='left')
    return out[['gvkey_norm', 'YearQuarter_prior', 'size', 'roa_q', 'lev_q', 'bm_q']]


In [134]:
# --- add quarterly metrics back to your original dataframe ---
def add_quarterly_metrics(db, reprisk_merged_quarterly: pd.DataFrame) -> pd.DataFrame:
    df = reprisk_merged_quarterly.copy()

    # temp normalized key for joining (same as _keys)
    df['gvkey_norm'] = df['gvkey'].astype(str).str.replace(r'\.0$', '', regex=True).str.zfill(6)

    metrics_map = _get_quarterly_metrics_map(db, df)

    out = df.merge(metrics_map,
                   on=['gvkey_norm', 'YearQuarter_prior'],
                   how='left')

    return out.drop(columns=['gvkey_norm'])

In [135]:
reprisk_with_metrics = add_quarterly_metrics(db, reprisk_merged_yearly)
reprisk_with_metrics

  size = np.where(mktcap > 0, np.log(mktcap), np.nan)


Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,industry_n_car_1_material_24m,industry_n_car_5_material_24m,industry_n_reach_material_24m,industry_n_severity_material_24m,YearQuarter,YearQuarter_prior,size,roa_q,lev_q,bm_q
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,0,0,5,5,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,2,2,12,12,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,3,3,18,18,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,3,3,25,25,2007Q2,2007Q1,6.987112,0.015352,0.535340,0.468633
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,4,4,33,34,2007Q2,2007Q1,6.987112,0.015352,0.535340,0.468633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390467,356128.0,2023-08,0,0,0,0,0,0,0,0,...,49,49,164,164,2023Q3,2023Q2,,,,
2390468,356128.0,2023-09,0,0,0,0,0,0,0,0,...,48,47,168,168,2023Q3,2023Q2,,,,
2390469,356128.0,2023-10,0,0,0,0,0,0,0,0,...,47,46,171,171,2023Q4,2023Q3,9.846527,0.034406,0.833953,
2390470,356128.0,2023-11,0,0,0,0,0,0,0,0,...,48,46,169,169,2023Q4,2023Q3,9.846527,0.034406,0.833953,


In [136]:
reprisk_with_metrics.to_csv('data/incidents_rolling_industry_all.csv', index=False)

In [33]:
reprisk_with_metrics.isna().sum()

cusip                                   0
incident_date                           0
date                                    0
CAR_0_1                                 0
severity                         17124977
reach                            17124977
novelty                          17124977
material_flag                           0
materiality_car_5                       0
materiality_reach                       0
materiality_car_1                       0
materiality_severity                    0
incident                                0
gvkey                                   0
SICS Codified Industry                  0
Codified SICS Sector                    0
industry_material_flag                  0
industry_materiality_car_5              0
industry_materiality_reach              0
industry_materiality_car_1              0
industry_materiality_severity           0
industry_incident                       0
MKT_2d                               3931
SMB_2d                            

# 3. Calculate short term market reaction

In [4]:
final = pd.read_csv('data/incidents_rolling_firm_all.csv')
final['cusip'] = final['cusip'].astype(str).str.strip().str[:8].str.upper()
final

Unnamed: 0,gvkey,isin,cusip,reprisk_id,story_id,adopter,sic,SICS Codified Industry,Codified SICS Sector,severity,...,car_5_roll12,materiality_reach,materiality_severity,materiality_car_1,materiality_car_5,YearQuarter_prior,size,roa_q,lev_q,bm_q
0,210418.0,US0003752047,00037520,2,1251.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,1.0,...,0.016264,0,0,1,1,2007Q1,,0.020228,0.706106,
1,210418.0,US0003752047,00037520,2,1305.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,0.016264,0,0,0,0,2007Q2,,0.026238,0.700164,
2,210418.0,US0003752047,00037520,2,4029.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,-0.282756,0,1,0,1,2008Q1,11.042192,0.029648,0.623712,0.199570
3,210418.0,US0003752047,00037520,2,4488.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,-0.250865,1,1,0,0,2008Q1,11.042192,0.029648,0.623712,0.199570
4,210418.0,US0003752047,00037520,2,4756.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,1.0,...,-0.007518,1,1,0,0,2006Q4,10.575869,0.019768,0.741906,0.173746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331365,355240.0,THB131010001,NAN,2683218,,1,5900.0,Coal Operations,Extractives & Minerals Processing,0.0,...,,0,0,0,0,NaT,,,,
331366,349631.0,PK0126301016,NAN,2684126,,1,3411.0,Containers & Packaging,Resource Transformation,0.0,...,,0,0,0,0,NaT,,,,
331367,350690.0,GB00BLNNFY18,NAN,2685085,,1,6726.0,Household & Personal Products,Consumer Goods,0.0,...,,0,0,0,0,NaT,,,,
331368,350726.0,AU0000180200,NAN,2685460,,1,1400.0,Chemicals,Resource Transformation,0.0,...,,0,0,0,0,NaT,,,,


In [53]:
import wrds
from pathlib import Path
from dotenv import load_dotenv
import os
from itertools import islice

project_root = Path(r"E:\GermanBusinessPanelTeam\Schrader\Forschung\ESGmateriality")

# ── Load WRDS creds from wrds.env in the project root ─────────────────────────
env_path = project_root / "wrds.env"
load_dotenv(dotenv_path=env_path)
# ── Connect to WRDS ───────────────────────────────────────────────────────────
db = wrds.Connection(
    wrds_username=os.getenv("WRDS_YALE_USERNAME"),
    wrds_password=os.getenv("WRDS_YALE_PASSWORD")
)

Loading library list...
Done


In [60]:
# --- 1) Unique CUSIPs and date bounds (work on copies; your df_incidents stays untouched)
cusips = final['cusip'].astype(str).str.strip().str[:8].str.upper().dropna().unique().tolist()

dates = pd.to_datetime(pd.Series(final['incident_date'].dropna()), errors='coerce').dropna()
start_date = dates.min().strftime('%Y-%m-%d')
end_date = (dates.max() + pd.Timedelta(days=1)).strftime('%Y-%m-%d')  # include +1 day


# --- 2) Query CRSP.dsf in manageable chunks
def chunks(seq, size):
    it = iter(seq)
    while True:
        block = list(islice(it, size))
        if not block:
            break
        yield block


dfs = []
for block in chunks(cusips, 2000):  # adjust chunk size if needed
    cusip_list_sql = "(" + ",".join(f"'{c}'" for c in block) + ")"
    sql = f"""
        SELECT cusip, date, ret
        FROM crsp.dsf
        WHERE date BETWEEN '{start_date}' AND '{end_date}'
          AND cusip IN {cusip_list_sql}
    """
    dfs.append(db.raw_sql(sql, date_cols=['date']))


# --- 3) Concatenate & tidy
all_returns = (
    pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame(columns=['cusip', 'date', 'ret'])
    .astype({'cusip': 'object'})
)
all_returns['ret'] = pd.to_numeric(all_returns['ret'], errors='coerce')
all_returns = all_returns.sort_values(['cusip', 'date']).reset_index(drop=True)
all_returns

Unnamed: 0,cusip,date,ret
0,00030710,2014-10-02,
1,00030710,2014-10-03,0.008108
2,00030710,2014-10-06,0.031635
3,00030710,2014-10-07,0.0
4,00030710,2014-10-08,-0.005717
...,...,...,...
17481590,U7260311,2017-12-13,0.0
17481591,U7260311,2017-12-14,0.0
17481592,U7260311,2017-12-15,0.0
17481593,U7260311,2017-12-18,-0.002222


In [61]:
def fill_and_bfill(g):
    # full daily calendar from first to last observed date
    full_idx = pd.date_range(g['date'].min(), g['date'].max(), freq='D')
    g2 = (g.set_index('date')
          .reindex(full_idx)  # adds missing days as NaNs
          .rename_axis('date'))
    # keep cusip, then backfill ret (use next trading day's value)
    g2['cusip'] = g2['cusip'].ffill().bfill()
    g2['ret'] = g2['ret'].bfill()
    return g2.reset_index()

In [62]:
df_filled = (all_returns.groupby('cusip', group_keys=False)
             .apply(fill_and_bfill)
             .sort_values(['cusip', 'date'])
             .reset_index(drop=True))

  .apply(fill_and_bfill)


In [65]:
# final must have: 'cusip', 'incident_date'
cusips = pd.Series(final['cusip'].dropna().unique()).astype(str)

d = pd.to_datetime(final['incident_date'], errors='coerce').dropna()
dates = pd.Index(d.dt.normalize().unique()).sort_values()  # unique YYYY-MM-DD (midnight)

pairs = pd.MultiIndex.from_product([cusips, dates], names=['cusip', 'incident_date']).to_frame(index=False)

In [68]:
# Date window: [min(incident_date), max(incident_date)+1 day] (add a small buffer if you like)
start_date = dates.min().strftime('%Y-%m-%d')
end_date = (dates.max() + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

# Single query version (simple). If your CUSIP list is huge and SQL IN is too long, split manually.
cusip_list_sql = "(" + ",".join(f"'{c}'" for c in cusips.tolist()) + ")"

sql_dsf = f"""
    SELECT cusip, date, ret
    FROM crsp.dsf
    WHERE date BETWEEN '{start_date}' AND '{end_date}'
      AND cusip IN {cusip_list_sql}
"""
dsf = db.raw_sql(sql_dsf, date_cols=['date'])

# Tidy
dsf['cusip'] = dsf['cusip'].astype(str)
dsf['ret'] = pd.to_numeric(dsf['ret'], errors='coerce')
dsf = dsf.dropna(subset=['cusip', 'date']).sort_values(['cusip', 'date']).reset_index(drop=True)

dsf.head()

Unnamed: 0,cusip,date,ret
0,30710,2014-10-02,
1,30710,2014-10-03,0.008108
2,30710,2014-10-06,0.031635
3,30710,2014-10-07,0.0
4,30710,2014-10-08,-0.005717


In [69]:
sql_mkt = f"""
    SELECT date, vwretd
    FROM crsp.dsi
    WHERE date BETWEEN '{start_date}' AND '{end_date}'
"""
mkt = db.raw_sql(sql_mkt, date_cols=['date'])
mkt['vwretd'] = pd.to_numeric(mkt['vwretd'], errors='coerce')

In [70]:
dsf_ar = dsf.merge(mkt, on='date', how='left')
dsf_ar['ar'] = dsf_ar['ret'] - dsf_ar['vwretd']

In [71]:
# For Day 1 we’ll need the next trading day’s AR; compute it via groupby().shift(-1)
dsf_ar = dsf_ar.sort_values(['cusip', 'date']).reset_index(drop=True)
dsf_ar['next_ar'] = dsf_ar.groupby('cusip')['ar'].shift(-1)
dsf_ar['next_date'] = dsf_ar.groupby('cusip')['date'].shift(-1)

dsf_ar[['cusip', 'date', 'ar', 'next_date', 'next_ar']].head()

Unnamed: 0,cusip,date,ar,next_date,next_ar
0,30710,2014-10-02,,2014-10-03,-0.000988
1,30710,2014-10-03,-0.000988,2014-10-06,0.033217
2,30710,2014-10-06,0.033217,2014-10-07,0.01499
3,30710,2014-10-07,0.01499,2014-10-08,-0.021572
4,30710,2014-10-08,-0.021572,2014-10-09,0.027493


In [72]:
# df has: ['cusip','date','ar','next_date','next_ar']
df = dsf_ar.copy()
df['CAR_0_1'] = df['ar'] + df['next_ar']  # stays NaN if either side is NaN
df

Unnamed: 0,cusip,date,ret,vwretd,ar,next_ar,next_date,CAR_0_1
0,00030710,2014-10-02,,0.000775,,-0.000988,2014-10-03,
1,00030710,2014-10-03,0.008108,0.009096,-0.000988,0.033217,2014-10-06,0.032229
2,00030710,2014-10-06,0.031635,-0.001582,0.033217,0.01499,2014-10-07,0.048207
3,00030710,2014-10-07,0.0,-0.01499,0.01499,-0.021572,2014-10-08,-0.006582
4,00030710,2014-10-08,-0.005717,0.015855,-0.021572,0.027493,2014-10-09,0.005921
...,...,...,...,...,...,...,...,...
17481590,U7260311,2017-12-13,0.0,0.000541,-0.000541,0.004374,2017-12-14,0.003833
17481591,U7260311,2017-12-14,0.0,-0.004374,0.004374,-0.007951,2017-12-15,-0.003577
17481592,U7260311,2017-12-15,0.0,0.007951,-0.007951,-0.008971,2017-12-18,-0.016922
17481593,U7260311,2017-12-18,-0.002222,0.006749,-0.008971,0.003411,2017-12-19,-0.00556


In [73]:
pairs_merged = pairs.merge(df[['cusip', 'date', 'CAR_0_1']],
                           left_on=['cusip', 'incident_date'],
                           right_on=['cusip', 'date'],
                           how='inner')
pairs_merged

Unnamed: 0,cusip,incident_date,date,CAR_0_1
0,00037520,2007-01-03,2007-01-03,-0.040097
1,00037520,2007-01-04,2007-01-04,-0.04379
2,00037520,2007-01-05,2007-01-05,0.002126
3,00037520,2007-01-08,2007-01-08,0.009886
4,00037520,2007-01-09,2007-01-09,-0.00442
...,...,...,...,...
17445915,15096410,2023-12-22,2023-12-22,0.026127
17445916,15096410,2023-12-26,2023-12-26,0.020036
17445917,15096410,2023-12-27,2023-12-27,-0.008414
17445918,15096410,2023-12-28,2023-12-28,-0.048472


In [75]:
final.drop_duplicates(subset=['cusip', 'incident_date'], inplace=True)
final['incident_date'] = pd.to_datetime(final['incident_date'], errors='coerce')
pairs_incidents = pairs_merged.merge(final[['cusip', 'incident_date', 'severity', 'reach', 'novelty', 'material_flag',
                                            'materiality_car_5', 'materiality_reach', 'materiality_car_1',
                                            'materiality_severity']], on=['cusip', 'incident_date'], how='left')
pairs_incidents

Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,materiality_car_1,materiality_severity
0,00037520,2007-01-03,2007-01-03,-0.040097,,,,,,,,
1,00037520,2007-01-04,2007-01-04,-0.04379,,,,,,,,
2,00037520,2007-01-05,2007-01-05,0.002126,,,,,,,,
3,00037520,2007-01-08,2007-01-08,0.009886,,,,,,,,
4,00037520,2007-01-09,2007-01-09,-0.00442,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
17445915,15096410,2023-12-22,2023-12-22,0.026127,,,,,,,,
17445916,15096410,2023-12-26,2023-12-26,0.020036,,,,,,,,
17445917,15096410,2023-12-27,2023-12-27,-0.008414,,,,,,,,
17445918,15096410,2023-12-28,2023-12-28,-0.048472,,,,,,,,


In [76]:
# Indicator column if material_flag is not na
pairs_incidents['incident'] = np.where(pairs_incidents['material_flag'].notna(), 1, 0)

In [83]:
# Merge gvkey
reprisk_incidents['cusip'] = reprisk_incidents['cusip'].astype(str).str.strip().str[:8].str.upper()
pairs_industry = pairs_incidents.merge(reprisk_incidents[['cusip', 'gvkey']].drop_duplicates(), on='cusip', how='left')
# Merge Industry and Sector
pairs_industry = pairs_industry.merge(
    reprisk_incidents[['cusip', 'SICS Codified Industry ', 'Codified SICS Sector ']].drop_duplicates(), on='cusip',
    how='left')
pairs_industry.notna().sum()
pairs_industry.dropna(subset=['SICS Codified Industry ', 'Codified SICS Sector ', 'CAR_0_1'], inplace=True)
pairs_industry

Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,materiality_car_1,materiality_severity,incident,gvkey,SICS Codified Industry,Codified SICS Sector
0,00037520,2007-01-03,2007-01-03,-0.040097,,,,,,,,,0,210418.0,Electrical & Electronic Equipment,Resource Transformation
1,00037520,2007-01-04,2007-01-04,-0.04379,,,,,,,,,0,210418.0,Electrical & Electronic Equipment,Resource Transformation
2,00037520,2007-01-05,2007-01-05,0.002126,,,,,,,,,0,210418.0,Electrical & Electronic Equipment,Resource Transformation
3,00037520,2007-01-08,2007-01-08,0.009886,,,,,,,,,0,210418.0,Electrical & Electronic Equipment,Resource Transformation
4,00037520,2007-01-09,2007-01-09,-0.00442,,,,,,,,,0,210418.0,Electrical & Electronic Equipment,Resource Transformation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17445914,15096410,2023-12-21,2023-12-21,0.028607,,,,,,,,,0,33706.0,Industrial Machinery & Goods,Resource Transformation
17445915,15096410,2023-12-22,2023-12-22,0.026127,,,,,,,,,0,33706.0,Industrial Machinery & Goods,Resource Transformation
17445916,15096410,2023-12-26,2023-12-26,0.020036,,,,,,,,,0,33706.0,Industrial Machinery & Goods,Resource Transformation
17445917,15096410,2023-12-27,2023-12-27,-0.008414,,,,,,,,,0,33706.0,Industrial Machinery & Goods,Resource Transformation


In [84]:
# Create indicator column if material incident in same SICS Industry
df = pairs_industry.copy()

mat_cols = ['material_flag', 'materiality_car_5', 'materiality_reach',
            'materiality_car_1', 'materiality_severity', 'incident']

# 1) Clean to 0/1 at firm-day (NaN -> 0; any nonzero -> 1)
tmp = df[['SICS Codified Industry ', 'date'] + mat_cols].copy()
for c in mat_cols:
    tmp[c] = (pd.to_numeric(tmp[c], errors='coerce').fillna(0) != 0).astype(int)

# 2) Industry-date flags: any firm in industry had an incident today under each definition
ind_flags = (
    tmp.groupby(['SICS Codified Industry ', 'date'])[mat_cols]
    .max()
    .reset_index()
    .rename(columns={c: f'industry_{c}' for c in mat_cols})
)

# 3) Merge back to firm-day
df = df.merge(ind_flags, on=['SICS Codified Industry ', 'date'], how='left')

df

Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,...,incident,gvkey,SICS Codified Industry,Codified SICS Sector,industry_material_flag,industry_materiality_car_5,industry_materiality_reach,industry_materiality_car_1,industry_materiality_severity,industry_incident
0,00037520,2007-01-03,2007-01-03,-0.040097,,,,,,,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
1,00037520,2007-01-04,2007-01-04,-0.04379,,,,,,,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
2,00037520,2007-01-05,2007-01-05,0.002126,,,,,,,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
3,00037520,2007-01-08,2007-01-08,0.009886,,,,,,,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
4,00037520,2007-01-09,2007-01-09,-0.00442,,,,,,,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17211775,15096410,2023-12-21,2023-12-21,0.028607,,,,,,,...,0,33706.0,Industrial Machinery & Goods,Resource Transformation,0,1,1,1,1,1
17211776,15096410,2023-12-22,2023-12-22,0.026127,,,,,,,...,0,33706.0,Industrial Machinery & Goods,Resource Transformation,0,1,1,1,1,1
17211777,15096410,2023-12-26,2023-12-26,0.020036,,,,,,,...,0,33706.0,Industrial Machinery & Goods,Resource Transformation,0,0,0,0,0,0
17211778,15096410,2023-12-27,2023-12-27,-0.008414,,,,,,,...,0,33706.0,Industrial Machinery & Goods,Resource Transformation,0,1,1,1,1,1


In [85]:
# Fill material columns NaNs with 0 (no incident -> no material incident)
# select only columns that already start with 'materiality_'
mat_cols = [c for c in df.columns if c.startswith('materiality_')]
if 'material_flag' in df.columns:
    mat_cols.append('material_flag')

# coerce to numeric, fill NAs with 0, cast to int
df[mat_cols] = (
    df[mat_cols]
    .apply(lambda s: pd.to_numeric(s, errors='coerce'))
    .fillna(0)
    .astype(int)
)
df

Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,...,incident,gvkey,SICS Codified Industry,Codified SICS Sector,industry_material_flag,industry_materiality_car_5,industry_materiality_reach,industry_materiality_car_1,industry_materiality_severity,industry_incident
0,00037520,2007-01-03,2007-01-03,-0.040097,,,,0,0,0,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
1,00037520,2007-01-04,2007-01-04,-0.04379,,,,0,0,0,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
2,00037520,2007-01-05,2007-01-05,0.002126,,,,0,0,0,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
3,00037520,2007-01-08,2007-01-08,0.009886,,,,0,0,0,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
4,00037520,2007-01-09,2007-01-09,-0.00442,,,,0,0,0,...,0,210418.0,Electrical & Electronic Equipment,Resource Transformation,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17211775,15096410,2023-12-21,2023-12-21,0.028607,,,,0,0,0,...,0,33706.0,Industrial Machinery & Goods,Resource Transformation,0,1,1,1,1,1
17211776,15096410,2023-12-22,2023-12-22,0.026127,,,,0,0,0,...,0,33706.0,Industrial Machinery & Goods,Resource Transformation,0,1,1,1,1,1
17211777,15096410,2023-12-26,2023-12-26,0.020036,,,,0,0,0,...,0,33706.0,Industrial Machinery & Goods,Resource Transformation,0,0,0,0,0,0
17211778,15096410,2023-12-27,2023-12-27,-0.008414,,,,0,0,0,...,0,33706.0,Industrial Machinery & Goods,Resource Transformation,0,1,1,1,1,1


In [87]:
def fetch_ff3_mom_daily(db, start_date, end_date):
    """Return daily FF3+RF+UMD (all in decimals) for [start_date, end_date]."""
    sql_ff = f"""
        SELECT date, mktrf, smb, hml, rf, umd
        FROM ff.factors_daily
        WHERE date BETWEEN '{start_date}' AND '{end_date}'
    """
    ff = db.raw_sql(sql_ff, date_cols=['date']).sort_values('date')

    # Convert % to decimals
    for c in ['mktrf', 'smb', 'hml', 'rf', 'umd']:
        ff[c] = pd.to_numeric(ff[c], errors='coerce') / 100.0

    return ff


# ---- Example usage with your event list df_events ----
evt = df.copy()  # your table above
evt['date'] = pd.to_datetime(evt['date'])

start_date = evt['date'].min()
end_date = evt['date'].max()

factors = fetch_ff3_mom_daily(db, start_date, end_date)
factors

Unnamed: 0,date,mktrf,smb,hml,rf,umd
0,2007-01-03,-0.000005,0.000003,0.00002,0.000002,-0.000045
1,2007-01-04,0.000016,0.000023,-0.000051,0.000002,-0.000056
2,2007-01-05,-0.000073,-0.000094,-0.000029,0.000002,0.000005
3,2007-01-08,0.000024,-0.000009,0.000004,0.000002,0.000032
4,2007-01-09,0.0,0.000029,-0.000031,0.000002,0.000025
...,...,...,...,...,...,...
4272,2023-12-21,0.000109,0.000074,-0.000004,0.000002,-0.00002
4273,2023-12-22,0.00002,0.000064,0.00001,0.000002,-0.000048
4274,2023-12-26,0.000048,0.00007,0.000044,0.000002,-0.000021
4275,2023-12-27,0.000016,0.000013,0.000011,0.000002,0.000011


In [88]:
# 2) Build next-day factors and 2-day sums (t + t+1 trading day)
sum_cols = ['mktrf', 'smb', 'hml', 'rf', 'umd']

for c in sum_cols:
    factors[f'next_{c}'] = factors[c].shift(-1)

ff_2d = factors.assign(
    MKT_2d=factors['mktrf'] + factors['next_mktrf'],
    SMB_2d=factors['smb'] + factors['next_smb'],
    HML_2d=factors['hml'] + factors['next_hml'],
    RF_2d=factors['rf'] + factors['next_rf'],
    UMD_2d=factors['umd'] + factors['next_umd']
).dropna(subset=['next_mktrf'])  # drop last day (no t+1)

keep_cols = ['date', 'MKT_2d', 'SMB_2d', 'HML_2d', 'RF_2d', 'UMD_2d']
ff_2d = ff_2d[keep_cols]
ff_2d

Unnamed: 0,date,MKT_2d,SMB_2d,HML_2d,RF_2d,UMD_2d
0,2007-01-03,0.000011,0.000026,-0.000031,0.000004,-0.000101
1,2007-01-04,-0.000057,-0.000071,-0.00008,0.000004,-0.000051
2,2007-01-05,-0.000049,-0.000103,-0.000025,0.000004,0.000037
3,2007-01-08,0.000024,0.00002,-0.000027,0.000004,0.000057
4,2007-01-09,0.000023,0.000019,-0.00005,0.000004,0.000028
...,...,...,...,...,...,...
4271,2023-12-20,-0.000046,0.000037,0.000009,0.000004,0.00004
4272,2023-12-21,0.000129,0.000138,0.000006,0.000004,-0.000068
4273,2023-12-22,0.000068,0.000134,0.000054,0.000004,-0.000069
4274,2023-12-26,0.000064,0.000083,0.000055,0.000004,-0.00001


In [89]:
# Merge by date
evt = evt.merge(ff_2d, on='date', how='left')
evt

Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,...,industry_materiality_car_5,industry_materiality_reach,industry_materiality_car_1,industry_materiality_severity,industry_incident,MKT_2d,SMB_2d,HML_2d,RF_2d,UMD_2d
0,00037520,2007-01-03,2007-01-03,-0.040097,,,,0,0,0,...,0,0,0,0,0,0.000011,0.000026,-0.000031,0.000004,-0.000101
1,00037520,2007-01-04,2007-01-04,-0.04379,,,,0,0,0,...,0,0,0,0,0,-0.000057,-0.000071,-0.00008,0.000004,-0.000051
2,00037520,2007-01-05,2007-01-05,0.002126,,,,0,0,0,...,0,0,0,0,0,-0.000049,-0.000103,-0.000025,0.000004,0.000037
3,00037520,2007-01-08,2007-01-08,0.009886,,,,0,0,0,...,0,0,0,0,0,0.000024,0.00002,-0.000027,0.000004,0.000057
4,00037520,2007-01-09,2007-01-09,-0.00442,,,,0,0,0,...,0,0,0,0,0,0.000023,0.000019,-0.00005,0.000004,0.000028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17211775,15096410,2023-12-21,2023-12-21,0.028607,,,,0,0,0,...,1,1,1,1,1,0.000129,0.000138,0.000006,0.000004,-0.000068
17211776,15096410,2023-12-22,2023-12-22,0.026127,,,,0,0,0,...,1,1,1,1,1,0.000068,0.000134,0.000054,0.000004,-0.000069
17211777,15096410,2023-12-26,2023-12-26,0.020036,,,,0,0,0,...,0,0,0,0,0,0.000064,0.000083,0.000055,0.000004,-0.00001
17211778,15096410,2023-12-27,2023-12-27,-0.008414,,,,0,0,0,...,1,1,1,1,1,0.000015,-0.000023,0.000014,0.000004,-0.000037


In [90]:
#evt.drop(columns=['incident_date', 'severity', 'reach', 'novelty'], inplace=True)
evt.to_csv('Output/events_ff3mom_daily.csv', index=False)

## 3.1 Add event indicators

In [5]:
evt = pd.read_csv('Output/events_ff3mom_daily.csv')
reprisk = pd.read_csv('data/incidents_rolling_firm.csv')
evt

  evt = pd.read_csv('Output/events_ff3mom_daily.csv')


Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,...,industry_materiality_car_5,industry_materiality_reach,industry_materiality_car_1,industry_materiality_severity,industry_incident,MKT_2d,SMB_2d,HML_2d,RF_2d,UMD_2d
0,00037520,2007-01-03,2007-01-03,-0.040097,,,,0,0,0,...,0,0,0,0,0,0.000011,0.000026,-0.000031,0.000004,-0.000101
1,00037520,2007-01-04,2007-01-04,-0.043790,,,,0,0,0,...,0,0,0,0,0,-0.000057,-0.000071,-0.000080,0.000004,-0.000051
2,00037520,2007-01-05,2007-01-05,0.002126,,,,0,0,0,...,0,0,0,0,0,-0.000049,-0.000103,-0.000025,0.000004,0.000037
3,00037520,2007-01-08,2007-01-08,0.009886,,,,0,0,0,...,0,0,0,0,0,0.000024,0.000020,-0.000027,0.000004,0.000057
4,00037520,2007-01-09,2007-01-09,-0.004420,,,,0,0,0,...,0,0,0,0,0,0.000023,0.000019,-0.000050,0.000004,0.000028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17211775,15096410,2023-12-21,2023-12-21,0.028607,,,,0,0,0,...,1,1,1,1,1,0.000129,0.000138,0.000006,0.000004,-0.000068
17211776,15096410,2023-12-22,2023-12-22,0.026127,,,,0,0,0,...,1,1,1,1,1,0.000068,0.000134,0.000054,0.000004,-0.000069
17211777,15096410,2023-12-26,2023-12-26,0.020036,,,,0,0,0,...,0,0,0,0,0,0.000064,0.000083,0.000055,0.000004,-0.000010
17211778,15096410,2023-12-27,2023-12-27,-0.008414,,,,0,0,0,...,1,1,1,1,1,0.000015,-0.000023,0.000014,0.000004,-0.000037


Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,cusip,SICS Codified Industry,...,smb,hml,rf,umd,YearQuarter,YearQuarter_prior,size,roa_q,lev_q,bm_q
0,1004.0,2007-01-01,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,...,0.000009,-0.000068,0.000044,0.000017,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
1,1004.0,2007-02-01,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,...,0.000123,-0.000008,0.000038,-0.000129,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
2,1004.0,2007-03-01,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,...,0.000009,-0.000088,0.000043,0.000258,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
3,1004.0,2007-04-01,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,...,-0.000221,-0.000144,0.000044,-0.000016,2007Q2,2007Q1,6.987112,0.015352,0.535340,0.468633
4,1004.0,2007-05-01,0,0,0,0,0,0,000361105,Industrial Machinery & Goods,...,0.000016,-0.000059,0.000041,-0.000033,2007Q2,2007Q1,6.987112,0.015352,0.535340,0.468633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390467,356128.0,2023-08-01,0,0,0,0,0,0,48581R205,Consumer Finance,...,-0.000314,-0.000115,0.000045,0.000378,2023Q3,2023Q2,,,,
2390468,356128.0,2023-09-01,0,0,0,0,0,0,48581R205,Consumer Finance,...,-0.000251,0.000149,0.000043,0.000033,2023Q3,2023Q2,,,,
2390469,356128.0,2023-10-01,0,0,0,0,0,0,48581R205,Consumer Finance,...,-0.000386,0.000020,0.000047,0.000167,2023Q4,2023Q3,9.846527,0.034406,0.833953,
2390470,356128.0,2023-11-01,0,0,0,0,0,0,48581R205,Consumer Finance,...,-0.000010,0.000167,0.000044,0.000256,2023Q4,2023Q3,9.846527,0.034406,0.833953,


In [4]:
# Merge severity and reach back
evt = evt.merge(reprisk[['cusip', 'incident_date', 'severity', 'reach']], left_on=['cusip', 'date'],
                right_on=['cusip', 'incident_date'], how='left')

In [91]:
def add_peer_incident_indicators(
        df: pd.DataFrame,
        industry_col: str = "SICS Codified Industry ",
        date_col: str = "date",
        firm_col: str = "cusip",
):
    out = df.copy()

    # Ensure types
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    # Coerce severity/reach to numeric (handles strings/NaNs)
    out["severity"] = pd.to_numeric(out.get("severity"), errors="coerce")
    out["reach"] = pd.to_numeric(out.get("reach"), errors="coerce")

    # Row-level flags for "this row is a severity3 / reach3 incident"
    out["_sev3_self"] = (out["severity"] == 3).astype(int)
    out["_reach3_self"] = (out["reach"] == 3).astype(int)

    # Group by industry & date and count how many severity3/reach3 in the group
    grp_keys = [industry_col, date_col]
    sev3_total = out.groupby(grp_keys)["_sev3_self"].transform("sum")
    reach3_total = out.groupby(grp_keys)["_reach3_self"].transform("sum")

    # Peers = group total minus this row's own flag
    out["peer_severity3"] = (sev3_total - out["_sev3_self"] > 0).astype(int)
    out["peer_reach3"] = (reach3_total - out["_reach3_self"] > 0).astype(int)

    # Optional: if you only want to flag peers within the *same industry* but
    # exclude duplicate rows of the same firm on the same date (rare), this already handles it.

    # Clean up
    out.drop(columns=["_sev3_self", "_reach3_self"], inplace=True)
    return out


# Example:
df2 = add_peer_incident_indicators(evt)

In [92]:
def add_peer_lowcar_flags(
    df: pd.DataFrame,
    industry_col: str = "SICS Codified Industry",  # <- ensure exact name (no trailing space)
    date_col: str = "date",
    firm_col: str = "cusip",
    car_col: str = "CAR_0_1",
    incident_col: str = "incident",
):
    """
    Flags whether there exists at least one *peer* (same industry & date, different firm)
    whose incident CAR_0_1 falls in the BOTTOM quintile/decile:
      - bottom_quintile_industry: <= industry 20th pct (computed over all incidents in that industry)
      - bottom_decile_industry:   <= industry 10th pct
      - bottom_quintile:          <= global 20th pct (across all incidents)
      - bottom_decile:            <= global 10th pct
    """

    out = df.copy()

    # Types
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    out[car_col]  = pd.to_numeric(out[car_col], errors="coerce")
    out[incident_col] = pd.to_numeric(out[incident_col], errors="coerce").fillna(0).astype(int)

    # Incident rows only for computing cutoffs
    inc_mask = (out[incident_col] == 1)
    inc = out.loc[inc_mask, [industry_col, date_col, firm_col, car_col]].copy()

    # If no incidents -> zeros
    if inc.empty:
        out["bottom_quintile_industry"] = 0
        out["bottom_decile_industry"]   = 0
        out["bottom_quintile"]          = 0
        out["bottom_decile"]            = 0
        return out

    # ===== Quantile cutoffs among incidents (lower tail) =====
    q_ind = (
        inc.groupby(industry_col)[car_col]
           .quantile([0.2, 0.1])   # 20th, 10th percentiles
           .unstack()
           .rename(columns={0.2: "q20_ind", 0.1: "q10_ind"})
    )
    inc = inc.join(q_ind, on=industry_col)

    # Global (across all incidents)
    q20_global = inc[car_col].quantile(0.2)
    q10_global = inc[car_col].quantile(0.1)

    # Flags for whether THIS incident is in bottom tail
    inc["_is_bot_q_ind"]  = (inc[car_col] <= inc["q20_ind"]).astype(int)
    inc["_is_bot_d_ind"]  = (inc[car_col] <= inc["q10_ind"]).astype(int)
    inc["_is_bot_q_glob"] = (inc[car_col] <= q20_global).astype(int)
    inc["_is_bot_d_glob"] = (inc[car_col] <= q10_global).astype(int)

    # Industries with too few incidents → NaN cutoffs → treat as not bottom
    for col in ["_is_bot_q_ind", "_is_bot_d_ind"]:
        inc[col] = inc[col].fillna(0).astype(int)

    # ===== Daily counts per (industry, date) =====
    grp_keys = [industry_col, date_col]
    daily_bot = (
        inc.groupby(grp_keys)[["_is_bot_q_ind","_is_bot_d_ind","_is_bot_q_glob","_is_bot_d_glob"]]
           .sum()
           .rename(columns={
               "_is_bot_q_ind":  "_bot_q_ind_cnt",
               "_is_bot_d_ind":  "_bot_d_ind_cnt",
               "_is_bot_q_glob": "_bot_q_glob_cnt",
               "_is_bot_d_glob": "_bot_d_glob_cnt",
           })
           .reset_index()
    )

    # Merge daily counts back to all rows
    out = out.merge(daily_bot, on=grp_keys, how="left")
    for col in ["_bot_q_ind_cnt","_bot_d_ind_cnt","_bot_q_glob_cnt","_bot_d_glob_cnt"]:
        out[col] = out[col].fillna(0).astype(int)

    # Merge self bottom flags to exclude the focal firm from "peer" logic
    self_flags = inc[[industry_col, date_col, firm_col,
                      "_is_bot_q_ind","_is_bot_d_ind","_is_bot_q_glob","_is_bot_d_glob"]]
    out = out.merge(self_flags, on=[industry_col, date_col, firm_col], how="left")
    for col in ["_is_bot_q_ind","_is_bot_d_ind","_is_bot_q_glob","_is_bot_d_glob"]:
        out[col] = out[col].fillna(0).astype(int)

    # ===== Peer flags (existence of at least one other firm in bottom tail) =====
    out["bottom_quintile_industry"] = ((out["_bot_q_ind_cnt"]  - out["_is_bot_q_ind"])  > 0).astype(int)
    out["bottom_decile_industry"]   = ((out["_bot_d_ind_cnt"]  - out["_is_bot_d_ind"])  > 0).astype(int)
    out["bottom_quintile"]          = ((out["_bot_q_glob_cnt"] - out["_is_bot_q_glob"]) > 0).astype(int)
    out["bottom_decile"]            = ((out["_bot_d_glob_cnt"] - out["_is_bot_d_glob"]) > 0).astype(int)

    # Cleanup helpers
    out.drop(columns=[
        "_bot_q_ind_cnt","_bot_d_ind_cnt","_bot_q_glob_cnt","_bot_d_glob_cnt",
        "_is_bot_q_ind","_is_bot_d_ind","_is_bot_q_glob","_is_bot_d_glob"
    ], inplace=True)

    return out

# Example usage:
df = add_peer_lowcar_flags(df2, industry_col="SICS Codified Industry ")
df


Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,...,SMB_2d,HML_2d,RF_2d,UMD_2d,peer_severity3,peer_reach3,bottom_quintile_industry,bottom_decile_industry,bottom_quintile,bottom_decile
0,00037520,2007-01-03,2007-01-03,-0.040097,,,,0,0,0,...,0.000026,-0.000031,0.000004,-0.000101,0,0,0,0,0,0
1,00037520,2007-01-04,2007-01-04,-0.04379,,,,0,0,0,...,-0.000071,-0.00008,0.000004,-0.000051,0,0,0,0,0,0
2,00037520,2007-01-05,2007-01-05,0.002126,,,,0,0,0,...,-0.000103,-0.000025,0.000004,0.000037,0,0,0,0,0,0
3,00037520,2007-01-08,2007-01-08,0.009886,,,,0,0,0,...,0.00002,-0.000027,0.000004,0.000057,0,0,0,0,0,0
4,00037520,2007-01-09,2007-01-09,-0.00442,,,,0,0,0,...,0.000019,-0.00005,0.000004,0.000028,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17211775,15096410,2023-12-21,2023-12-21,0.028607,,,,0,0,0,...,0.000138,0.000006,0.000004,-0.000068,0,1,0,0,0,0
17211776,15096410,2023-12-22,2023-12-22,0.026127,,,,0,0,0,...,0.000134,0.000054,0.000004,-0.000069,1,1,0,0,0,0
17211777,15096410,2023-12-26,2023-12-26,0.020036,,,,0,0,0,...,0.000083,0.000055,0.000004,-0.00001,0,0,0,0,0,0
17211778,15096410,2023-12-27,2023-12-27,-0.008414,,,,0,0,0,...,-0.000023,0.000014,0.000004,-0.000037,0,1,0,0,0,0


In [93]:
df[(df['bottom_decile_industry'] == 1) & (df['incident'] == 1)]

Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,...,SMB_2d,HML_2d,RF_2d,UMD_2d,peer_severity3,peer_reach3,bottom_quintile_industry,bottom_decile_industry,bottom_quintile,bottom_decile
1433,00037520,2012-09-20,2012-09-20,-0.012642,1.0,2.0,2.0,0,1,1,...,0.000007,-0.000014,0.0,0.000046,0,0,1,1,1,1
3159,00037520,2019-08-02,2019-08-02,0.013212,2.0,1.0,2.0,0,1,0,...,-0.000033,0.000008,0.000002,0.000072,0,0,1,1,1,1
4429,00130H10,2008-04-11,2008-04-11,0.029179,2.0,1.0,1.0,0,0,0,...,-0.00005,0.000017,0.000002,0.000162,0,0,1,1,1,1
5951,00130H10,2014-04-30,2014-04-30,0.024278,2.0,2.0,1.0,0,0,0,...,0.000008,-0.000022,0.0,0.000059,0,1,1,1,1,1
6329,00130H10,2015-10-28,2015-10-28,0.009761,2.0,1.0,1.0,0,0,0,...,0.000061,0.000062,0.0,-0.000122,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17070763,03769M10,2023-06-26,2023-06-26,0.015028,1.0,3.0,2.0,0,0,0,...,0.000014,0.000078,0.000004,-0.00009,0,1,1,1,1,1
17070781,03769M10,2023-07-21,2023-07-21,-0.012407,2.0,3.0,2.0,1,0,1,...,-0.000061,0.00004,0.000004,-0.000125,0,1,1,1,1,1
17076644,00258W10,2022-02-11,2022-02-11,0.021354,1.0,1.0,1.0,0,1,0,...,0.000083,0.000151,0.0,0.000078,0,1,1,1,1,1
17191582,15912K10,2022-07-28,2022-07-28,-0.03526,1.0,3.0,1.0,1,1,1,...,-0.000083,-0.000046,0.0,-0.000082,0,0,1,1,1,1


In [94]:
df.to_csv('Output/events_daily.csv', index=False)

In [34]:
# Add financials

In [36]:
df['YearQuarter'] = pd.to_datetime(df['YearMonth']).dt.to_period('Q')
df['YearQuarter_prior'] = df['YearQuarter'] - 1
df['YearQuarter_prior'] = df['YearQuarter_prior'].astype(str)

TypeError: Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead

In [None]:
import wrds
from pathlib import Path
from dotenv import load_dotenv
import os

project_root = Path(r"E:\GermanBusinessPanelTeam\Schrader\Forschung\ESGmateriality")

# ── Load WRDS creds from wrds.env in the project root ─────────────────────────
env_path = project_root / "wrds.env"
load_dotenv(dotenv_path=env_path)
# ── Connect to WRDS ───────────────────────────────────────────────────────────
db = wrds.Connection(
    wrds_username=os.getenv("WRDS_YALE_USERNAME"),
    wrds_password=os.getenv("WRDS_YALE_PASSWORD")
)

In [15]:
# Function is defined earlier
reprisk_with_metrics = add_quarterly_metrics(db, df)
reprisk_with_metrics

Unnamed: 0,cusip,incident_date,date,CAR_0_1,severity,reach,novelty,material_flag,materiality_car_5,materiality_reach,...,HML_2d,RF_2d,UMD_2d,peer_severity3,peer_reach3,bottom_quintile_industry,bottom_decile_industry,bottom_quintile,bottom_decile,YearQuarter_prior
0,00037520,2007-01-03,2007-01-03,-0.040097,,,,0,0,0,...,-0.000031,0.000004,-0.000101,0,0,0,0,0,0,2006Q4
1,00037520,2007-01-04,2007-01-04,-0.043790,,,,0,0,0,...,-0.000080,0.000004,-0.000051,0,0,0,0,0,0,2006Q4
2,00037520,2007-01-05,2007-01-05,0.002126,,,,0,0,0,...,-0.000025,0.000004,0.000037,0,0,0,0,0,0,2006Q4
3,00037520,2007-01-08,2007-01-08,0.009886,,,,0,0,0,...,-0.000027,0.000004,0.000057,0,0,0,0,0,0,2006Q4
4,00037520,2007-01-09,2007-01-09,-0.004420,,,,0,0,0,...,-0.000050,0.000004,0.000028,0,0,0,0,0,0,2006Q4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17211775,15096410,2023-12-21,2023-12-21,0.028607,,,,0,0,0,...,0.000006,0.000004,-0.000068,0,1,0,0,0,0,2023Q3
17211776,15096410,2023-12-22,2023-12-22,0.026127,,,,0,0,0,...,0.000054,0.000004,-0.000069,1,1,0,0,0,0,2023Q3
17211777,15096410,2023-12-26,2023-12-26,0.020036,,,,0,0,0,...,0.000055,0.000004,-0.000010,0,0,0,0,0,0,2023Q3
17211778,15096410,2023-12-27,2023-12-27,-0.008414,,,,0,0,0,...,0.000014,0.000004,-0.000037,0,1,0,0,0,0,2023Q3


# 4. Fama-MacBeth regression

## 4.1. Monthy Returns

In [137]:
project_root = Path(r"E:\GermanBusinessPanelTeam\Schrader\Forschung\ESGmateriality")

# ── Load WRDS creds from wrds.env in the project root ─────────────────────────
env_path = project_root / "wrds.env"
load_dotenv(dotenv_path=env_path)
# ── Connect to WRDS ───────────────────────────────────────────────────────────
db = wrds.Connection(
    wrds_username=os.getenv("WRDS_YALE_USERNAME"),
    wrds_password=os.getenv("WRDS_YALE_PASSWORD")
)
# ---------- PARAMETERS ----------
# set your window
START = "2005-01-01"
END = "2024-12-31"

Loading library list...
Done


In [138]:
from pandas.tseries.offsets import MonthEnd

# ---------- 1) CRSP monthly returns (common stocks) ----------
# ret = total return; prc, shrout used for market equity; we’ll add delist return
sql_msf = f"""
select permno, date, ret, retx, prc, shrout
from crsp.msf
where date between '{START}' and '{END}'
"""
msf = db.raw_sql(sql_msf, date_cols=['date'])
msf['me'] = msf['prc'].abs() * msf['shrout'] * 1000.0

In [139]:
# ---- 2) Names table to get shrcd/exchcd as of date ----
# Use msenames (monthly security names history). Filter to primary share classes (optional).
sql_names = """
select permno, namedt, coalesce(nameendt, '9999-12-31') as nameendt, shrcd, exchcd
from crsp.msenames
"""
names = db.raw_sql(sql_names, date_cols=['namedt', 'nameendt'])

# Merge names onto msf by permno where date is within [namedt, nameendt]
msf = msf.merge(names, on='permno', how='left')
msf = msf[(msf['date'] >= msf['namedt']) & (msf['date'] <= msf['nameendt'])].copy()

# Optional: keep common shares on major exchanges
msf = msf[msf['shrcd'].isin([10, 11]) & msf['exchcd'].isin([1, 2, 3])]

In [140]:
# ---- 3) Add delisting returns and compute adjusted return ----
sql_dl = f"""
select permno, dlstdt as date, dlret
from crsp.msedelist
where dlstdt between '{START}' and '{END}'
"""
dl = db.raw_sql(sql_dl, date_cols=['date'])

# Merge dlret onto the monthly file and build adjusted returns:
# ret_adj = (1 + ret) * (1 + dlret) - 1  (with dlret=0 if missing)
crsp = msf.merge(dl, on=['permno', 'date'], how='left')
crsp['ret_adj'] = (1.0 + crsp['ret'].astype(float)) * (1.0 + crsp['dlret'].fillna(0).astype(float)) - 1.0

In [141]:
# ---- 4) Lagged ME for VW weights (use ME at t-1 for month t) ----
crsp = crsp.sort_values(['permno', 'date'])
crsp['me_lag'] = crsp.groupby('permno')['me'].shift(1)

crsp_panel = crsp[['permno', 'date', 'ret_adj', 'me', 'me_lag', 'shrcd', 'exchcd']].copy()
crsp_panel

Unnamed: 0,permno,date,ret_adj,me,me_lag,shrcd,exchcd
0,10001,2005-01-31,-0.040580,17178900.0,,11,3
4808,10001,2005-02-28,-0.045166,16428279.0,17178900.0,11,3
9621,10001,2005-03-31,0.124822,18663750.0,16428279.0,11,3
14424,10001,2005-04-29,-0.074684,17269875.0,18663750.0,11,3
19223,10001,2005-05-31,0.219030,21052500.0,17269875.0,11,3
...,...,...,...,...,...,...,...
946608,93436,2024-08-30,-0.077391,684004370400.000122,741380136746.400024,11,3
950467,93436,2024-09-30,0.221942,839047410000.0,684004370400.000122,11,3
954312,93436,2024-10-31,-0.045025,802033523100.599976,839047410000.0,11,3
958141,93436,2024-11-29,0.381469,1107984309600.000244,802033523100.599976,11,3


In [142]:
# ---- 5) Fama–French monthly factors ----
ff = db.raw_sql(f"""
select *
from ff.factors_monthly
where date between '{START}' and '{END}'
""", date_cols=['date'])

# Convert to decimals if table is in %
for col in ff.columns:
    if col.lower() != 'date' and pd.api.types.is_numeric_dtype(ff[col]):
        med = ff[col].abs().median(skipna=True)
        if pd.notna(med) and med > 0.5:  # crude but effective
            ff[col] = ff[col] / 100.0

ff = ff.rename(columns={
    'mktrf': 'MKT_RF', 'smb': 'SMB', 'hml': 'HML', 'rf': 'RF',
    'rmw': 'RMW' if 'rmw' in ff.columns else 'RMW',
    'cma': 'CMA' if 'cma' in ff.columns else 'CMA',
    'umd': 'MOM' if 'umd' in ff.columns else 'MOM'
})
ff['date'] = ff['date'] + MonthEnd(0)
ff

Unnamed: 0,date,MKT_RF,SMB,HML,RF,year,month,MOM,dateff
0,2005-01-31,-0.0275,-0.0166,0.0206,0.0016,20.05,0.01,0.0296,2005-01-31
1,2005-02-28,0.0188,-0.0057,0.0141,0.0016,20.05,0.02,0.0343,2005-02-28
2,2005-03-31,-0.0194,-0.0141,0.0207,0.0021,20.05,0.03,0.0043,2005-03-31
3,2005-04-30,-0.0261,-0.0393,0.0005,0.0021,20.05,0.04,-0.0068,2005-04-29
4,2005-05-31,0.0365,0.0286,-0.0058,0.0024,20.05,0.05,0.0037,2005-05-31
...,...,...,...,...,...,...,...,...,...
235,2024-08-31,0.016,-0.0349,-0.011,0.0048,20.24,0.08,0.0481,2024-08-30
236,2024-09-30,0.0172,-0.0013,-0.0277,0.004,20.24,0.09,-0.0062,2024-09-30
237,2024-10-31,-0.01,-0.0099,0.0086,0.0039,20.24,0.1,0.03,2024-10-31
238,2024-11-30,0.0649,0.0446,0.0015,0.004,20.24,0.11,0.01,2024-11-29


In [143]:
# --- pull CUSIP/NCUSIP name history ---
sql_names = """
select
    permno,
    namedt,
    coalesce(nameendt, '9999-12-31') as nameendt,
    cusip,
    ncusip
from crsp.msenames
"""
names = db.raw_sql(sql_names, date_cols=['namedt', 'nameendt'])
names.drop_duplicates(['permno'], keep='last', inplace=True)

In [144]:
# --- align to month-end dates in crsp_panel ---
# merge then filter rows where crsp date is within [namedt, nameendt]
crsp_with_cusip = crsp_panel.merge(names, on='permno', how='left')

## 4.2. Merge Returns, FF4, financials

In [145]:
#----------------------------
# 0) Helpers
# ----------------------------

def choose_primary_class(df, by=['cusip', 'date']):
    """
    If multiple PERMNOs share the same cusip8 at a date,
    keep the one with the largest lagged market cap (me_lag).
    """
    df = df.copy()
    df['_rank'] = df.groupby(by)['me_lag'].rank(method='first', ascending=False)
    out = df.loc[df['_rank'] == 1].drop(columns=['_rank'])
    return out

In [146]:
# ----------------------------
# 1) Prepare CRSP monthly panel
# ----------------------------
crsp = crsp_with_cusip.copy()
crsp['date'] = pd.to_datetime(crsp['date'])

# If same cusip8 maps to multiple permnos per month, keep largest me_lag
crsp_1class = choose_primary_class(crsp, by=['cusip', 'date'])

In [147]:
# ----------------------------
# 2) Prepare factors
# ----------------------------
ff['date'] = pd.to_datetime(ff['date']) + MonthEnd(0)

In [148]:
# For industry data
crsp_with_cusip['month'] = pd.to_datetime(crsp_with_cusip['date']).dt.to_period('M')
crsp_with_cusip['month_prior'] = pd.to_datetime(crsp_with_cusip['date']).dt.to_period('M') - 1
reprisk_with_metrics['YearMonth'] = pd.to_datetime(reprisk_with_metrics['YearMonth']).dt.to_period('M')
reprisk_with_metrics["cusip"] = reprisk_with_metrics["cusip"].astype(str).str.strip().str.upper().str.slice(0, 8)

In [149]:
reprisk_with_metrics

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,industry_n_car_1_material_24m,industry_n_car_5_material_24m,industry_n_reach_material_24m,industry_n_severity_material_24m,YearQuarter,YearQuarter_prior,size,roa_q,lev_q,bm_q
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,0,0,5,5,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,2,2,12,12,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,3,3,18,18,2007Q1,2006Q4,6.889099,0.013968,0.549214,0.495846
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,3,3,25,25,2007Q2,2007Q1,6.987112,0.015352,0.535340,0.468633
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,4,4,33,34,2007Q2,2007Q1,6.987112,0.015352,0.535340,0.468633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390467,356128.0,2023-08,0,0,0,0,0,0,0,0,...,49,49,164,164,2023Q3,2023Q2,,,,
2390468,356128.0,2023-09,0,0,0,0,0,0,0,0,...,48,47,168,168,2023Q3,2023Q2,,,,
2390469,356128.0,2023-10,0,0,0,0,0,0,0,0,...,47,46,171,171,2023Q4,2023Q3,9.846527,0.034406,0.833953,
2390470,356128.0,2023-11,0,0,0,0,0,0,0,0,...,48,46,169,169,2023Q4,2023Q3,9.846527,0.034406,0.833953,


In [154]:
panel_firms = reprisk_with_metrics.merge(crsp_with_cusip[['permno', 'date', 'ret_adj', 'me', 'me_lag', 'cusip', 'month', 'month_prior']],
                                        left_on=['cusip', 'YearMonth'], right_on=['cusip', 'month_prior'], how='inner')
panel_firms

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,roa_q,lev_q,bm_q,permno,date,ret_adj,me,me_lag,month,month_prior
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,0.013968,0.549214,0.495846,54594,2007-02-28,-0.023834,1075262080.0,1101515040.0,2007-02,2007-01
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,0.013968,0.549214,0.495846,54594,2007-03-30,-0.052270,1026003680.0,1075262080.0,2007-03,2007-02
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,0.013968,0.549214,0.495846,54594,2007-04-30,0.108128,1138958760.0,1026003680.0,2007-04,2007-03
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,0.015352,0.535340,0.468633,54594,2007-05-31,0.064178,1212055000.0,1138958760.0,2007-05,2007-04
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,0.015352,0.535340,0.468633,54594,2007-06-29,0.015692,1231074940.0,1212055000.0,2007-06,2007-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684173,349530.0,2023-08,0,0,0,0,0,0,0,0,...,,,,17324,2023-09-29,-0.223009,5241660.0,6746100.0,2023-09,2023-08
684174,349530.0,2023-09,0,0,0,0,0,0,0,0,...,,,,17324,2023-10-31,-0.259681,3880500.0,5241660.0,2023-10,2023-09
684175,349530.0,2023-10,0,0,0,0,0,0,0,0,...,,,,17324,2023-11-30,-0.369077,2448297.0,3880500.0,2023-11,2023-10
684176,349530.0,2023-11,0,0,0,0,0,0,0,0,...,,,,17324,2023-12-29,4.584003,13671300.0,2448297.0,2023-12,2023-11


In [155]:
ff['dateff'] = pd.to_datetime(ff['dateff'])
panel_firms = panel_firms.merge(ff[['dateff', 'date', 'MKT_RF', 'SMB', 'HML', 'RF', 'MOM']], left_on='date',
                                right_on='dateff', how='left', suffixes=('', '_ff'))
panel_firms

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,me_lag,month,month_prior,dateff,date_ff,MKT_RF,SMB,HML,RF,MOM
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,1101515040.0,2007-02,2007-01,2007-02-28,2007-02-28,-0.0196,0.0123,-0.0008,0.0038,-0.0129
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,1075262080.0,2007-03,2007-02,2007-03-30,2007-03-31,0.0071,0.0009,-0.0088,0.0043,0.0258
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,1026003680.0,2007-04,2007-03,2007-04-30,2007-04-30,0.0349,-0.0221,-0.0144,0.0044,-0.0016
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,1138958760.0,2007-05,2007-04,2007-05-31,2007-05-31,0.0323,0.0016,-0.0059,0.0041,-0.0033
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,1212055000.0,2007-06,2007-05,2007-06-29,2007-06-30,-0.0196,0.0073,-0.0105,0.004,0.0052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684173,349530.0,2023-08,0,0,0,0,0,0,0,0,...,6746100.0,2023-09,2023-08,2023-09-29,2023-09-30,-0.0523,-0.0251,0.0149,0.0043,0.0033
684174,349530.0,2023-09,0,0,0,0,0,0,0,0,...,5241660.0,2023-10,2023-09,2023-10-31,2023-10-31,-0.0315,-0.0386,0.002,0.0047,0.0167
684175,349530.0,2023-10,0,0,0,0,0,0,0,0,...,3880500.0,2023-11,2023-10,2023-11-30,2023-11-30,0.0888,-0.001,0.0167,0.0044,0.0256
684176,349530.0,2023-11,0,0,0,0,0,0,0,0,...,2448297.0,2023-12,2023-11,2023-12-29,2023-12-31,0.0485,0.063,0.0489,0.0043,-0.0553


In [156]:
panel_firms.isna().sum()

gvkey                                    0
YearMonth                                0
n_material                               0
n_nonmaterial                            0
n_car_1_material                         0
n_car_5_material                         0
n_reach_material                         0
n_severity_material                      0
n_material_24m                           0
n_nonmaterial_24m                        0
n_car_1_material_24m                     0
n_car_5_material_24m                     0
n_reach_material_24m                     0
n_severity_material_24m                  0
cusip                                    0
SICS Codified Industry                   0
Codified SICS Sector                     0
year                                     0
industry_n_material_24m                  0
industry_n_nonmaterial_24m               0
industry_n_car_1_material_24m            0
industry_n_car_5_material_24m            0
industry_n_reach_material_24m            0
industry_n_

In [157]:
panel_firms.to_csv('Output/panel_industries.csv', index=False)

# 5. Portfolios rolling sum

In [224]:
df = pd.read_csv('Output/panel_industries.csv')
df = df[df['YearMonth'] > '2006-12']

In [225]:
def add_top_bottom_portfolio_binaries_firm(df: pd.DataFrame,
                                           suffix: str = "_24m") -> pd.DataFrame:
    """
    For every firm-level column named n_*_24m (and NOT industry_n_*_24m), create:
      - top_portfolio_binary_{col}    = 1 if {col} == 0
      - bottom_portfolio_binary_{col} = 1 if {col} > 0
    Returns a copy with the new columns added.
    """
    out = df.copy()
    target_cols = [c for c in out.columns
                   if c.startswith("n_") and not c.startswith("industry_n_") and c.endswith(suffix)]

    for col in target_cols:
        out[f"top_portfolio_binary_{col}"] = (out[col] == 0).astype(int)
        out[f"bottom_portfolio_binary_{col}"] = (out[col] > 0).astype(int)

    return out
df = add_top_bottom_portfolio_binaries_firm(df)

In [226]:
def add_industry_portfolio_binaries(
    df: pd.DataFrame,
    date_col: str = "YearMonth",
    industry_col: str = "SICS Codified Industry ",
    bucket: str = "quintile",              # "quintile" (20%) or "decile" (10%)
    suffix: str = "_24m"
) -> pd.DataFrame:
    """
    For every industry_*_24m column, create:
      - top_portfolio_binary_{col}_{bucket}    = 1 if industry value is in bottom 20% (or 10%)
      - bottom_portfolio_binary_{col}_{bucket} = 1 if industry value is in top   20% (or 10%)

    Percentiles are computed cross-sectionally within each month (date_col),
    across industries (unique (date_col, industry_col) pairs), then mapped back to firms.
    """
    out = df.copy()
    # which columns to process
    target_cols = [c for c in out.columns if c.startswith("industry_") and c.endswith(suffix)]
    if not target_cols:
        return out

    low_q, high_q = (0.2, 0.8) if bucket == "quintile" else (0.1, 0.9)

    for col in target_cols:
        # industry-level panel: one row per (month, industry)
        sub = out[[date_col, industry_col, col]].drop_duplicates([date_col, industry_col])

        # month-specific thresholds from cross-industry distribution
        qs = (
            sub.groupby(date_col)[col]
               .quantile([low_q, high_q])
               .unstack()            # columns are e.g. 0.2 and 0.8 (or 0.1/0.9)
               .rename(columns={low_q: "q_low", high_q: "q_high"})
               .reset_index()
        )

        # attach thresholds, flag top/bottom at industry level
        sub = sub.merge(qs, on=date_col, how="left")
        sub[f"top_portfolio_{col}_{bucket}"]    = (sub[col] <= sub["q_low"]).astype(int)
        sub[f"bottom_portfolio_{col}_{bucket}"] = (sub[col] >= sub["q_high"]).astype(int)
        sub = sub[[date_col, industry_col,
                   f"top_portfolio_{col}_{bucket}",
                   f"bottom_portfolio_{col}_{bucket}"]]

        # map back to firm-level rows
        out = out.merge(sub, on=[date_col, industry_col], how="left")

    return out


In [227]:
# Quintiles (bottom 20% = top portfolio, top 20% = bottom portfolio)
df = add_industry_portfolio_binaries(df, bucket="quintile")

# Or deciles (bottom 10% / top 10%)
df = add_industry_portfolio_binaries(df, bucket="decile")

In [232]:
df

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,top_portfolio_industry_n_nonmaterial_24m_decile,bottom_portfolio_industry_n_nonmaterial_24m_decile,top_portfolio_industry_n_car_1_material_24m_decile,bottom_portfolio_industry_n_car_1_material_24m_decile,top_portfolio_industry_n_car_5_material_24m_decile,bottom_portfolio_industry_n_car_5_material_24m_decile,top_portfolio_industry_n_reach_material_24m_decile,bottom_portfolio_industry_n_reach_material_24m_decile,top_portfolio_industry_n_severity_material_24m_decile,bottom_portfolio_industry_n_severity_material_24m_decile
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,1,0,1
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684173,349530.0,2023-08,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,1
684174,349530.0,2023-09,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,1
684175,349530.0,2023-10,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,1
684176,349530.0,2023-11,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,1


In [228]:
def summarize_portfolio_counts_per_month(
    df: pd.DataFrame,
    date_col: str = "YearMonth",
    buckets = ("binary", "quintile", "decile")
) -> pd.DataFrame:
    """
    Returns a tidy DataFrame with counts per month of observations in
    top/bottom portfolios for the provided `buckets`.

    Supported naming styles:
      - New:    top_portfolio_{col}_{bucket} / bottom_portfolio_{col}_{bucket}
      - Legacy: top_portfolio_binary_{col}    / bottom_portfolio_binary_{col}

    Output columns:
      [date_col, 'bucket', 'signal', 'side', 'n_obs']
    """
    out_rows = []

    # New-style patterns (bucket at the end)
    patterns = {
        b: {
            "top": re.compile(rf"^top_portfolio_(.+)_{re.escape(b)}$"),
            "bottom": re.compile(rf"^bottom_portfolio_(.+)_{re.escape(b)}$")
        }
        for b in buckets
    }
    # Legacy binary style (bucket at the front)
    pat_legacy_binary_top = re.compile(r"^top_portfolio_binary_(.+)$")
    pat_legacy_binary_bot = re.compile(r"^bottom_portfolio_binary_(.+)$")

    top_cols = [c for c in df.columns if c.startswith("top_portfolio_")]
    bottom_cols = [c for c in df.columns if c.startswith("bottom_portfolio_")]
    port_cols = top_cols + bottom_cols
    if not port_cols:
        return pd.DataFrame(columns=[date_col, "bucket", "signal", "side", "n_obs"])

    for b in buckets:
        col_map = {}

        # 1) New-style columns for this bucket
        for c in port_cols:
            m_top = patterns[b]["top"].match(c)
            m_bot = patterns[b]["bottom"].match(c)
            if m_top:
                signal = m_top.group(1)
                col_map[c] = ("top", signal)
            elif m_bot:
                signal = m_bot.group(1)
                col_map[c] = ("bottom", signal)

        # 2) Legacy binary columns (only when summarizing 'binary')
        if b == "binary":
            for c in port_cols:
                if c in col_map:  # prefer new-style if both exist
                    continue
                m_top = pat_legacy_binary_top.match(c)
                m_bot = pat_legacy_binary_bot.match(c)
                if m_top:
                    signal = m_top.group(1)
                    col_map[c] = ("top", signal)
                elif m_bot:
                    signal = m_bot.group(1)
                    col_map[c] = ("bottom", signal)

        if not col_map:
            continue

        sub = df[[date_col] + list(col_map.keys())].copy()
        long = sub.melt(id_vars=[date_col], var_name="col", value_name="flag")
        long["side"] = long["col"].map(lambda x: col_map[x][0])
        long["signal"] = long["col"].map(lambda x: col_map[x][1])
        long["bucket"] = b

        grp = (
            long[long["flag"] == 1]
            .groupby([date_col, "bucket", "signal", "side"], dropna=False)
            .size()
            .reset_index(name="n_obs")
        )
        out_rows.append(grp)

    if not out_rows:
        return pd.DataFrame(columns=[date_col, "bucket", "signal", "side", "n_obs"])

    res = pd.concat(out_rows, ignore_index=True).sort_values([date_col, "bucket", "signal", "side"])
    return res


In [229]:
counts_long = summarize_portfolio_counts_per_month(df)
counts_long

Unnamed: 0,YearMonth,bucket,signal,side,n_obs
0,2007-01,binary,n_car_1_material_24m,bottom,10
1,2007-01,binary,n_car_1_material_24m,top,3755
2,2007-01,binary,n_car_5_material_24m,bottom,11
3,2007-01,binary,n_car_5_material_24m,top,3754
4,2007-01,binary,n_material_24m,bottom,9
...,...,...,...,...,...
4891,2023-12,quintile,industry_n_nonmaterial_24m,top,147
4892,2023-12,quintile,industry_n_reach_material_24m,bottom,1866
4893,2023-12,quintile,industry_n_reach_material_24m,top,120
4894,2023-12,quintile,industry_n_severity_material_24m,bottom,1866


In [230]:
def ls_excess_from_top_bottom_named(
        df: pd.DataFrame,
        date_col: str = "YearMonth",
        ret_col: str = "ret_adj",
        rf_col: str = "RF",
        weight_col: str = "me_lag",
):
    x = df.copy()

    # Month key -> Period[M]
    if np.issubdtype(x[date_col].dtype, np.datetime64):
        x["_m_"] = x[date_col].dt.to_period("M")
    else:
        x["_m_"] = pd.to_datetime(x[date_col].astype(str), errors="coerce").dt.to_period("M")

    # Excess return and weights
    x["_rex_"] = (
        pd.to_numeric(x.get(ret_col, np.nan), errors="coerce")
        - pd.to_numeric(x.get(rf_col, 0.0), errors="coerce").fillna(0.0)
    )
    x["_w_"] = pd.to_numeric(x.get(weight_col, np.nan), errors="coerce")

    # Support BOTH name orders:
    #   1) <leg>_portfolio_<signal>_<bucket>
    #   2) <leg>_portfolio_<bucket>_<signal>
    pat_end   = re.compile(r"^(?P<leg>top|bottom)_portfolio_(?P<signal>.+)_(?P<bucket>binary|tertile|quintile|decile)$")
    pat_start = re.compile(r"^(?P<leg>top|bottom)_portfolio_(?P<bucket>binary|tertile|quintile|decile)_(?P<signal>.+)$")

    def detect_scope_and_base(sig: str) -> tuple[str, str]:
        """Infer scope from signal prefix, return (scope, base_signal_without_prefix)."""
        if sig.startswith("industry_peers_"):
            return "industry_peers", sig[len("industry_peers_"):]
        if sig.startswith("industry_"):
            return "industry", sig[len("industry_"):]
        return "overall", sig

    # Collect top/bottom pairs by (base_signal, scope, bucket)
    pairs = {}
    for c in x.columns:
        m = pat_end.match(c) or pat_start.match(c)
        if not m:
            continue
        leg = m.group("leg")            # top/bottom
        bucket = m.group("bucket")      # binary/tertile/quintile/decile
        sig_raw = m.group("signal")     # may include industry_/industry_peers_
        scope, sig_base = detect_scope_and_base(sig_raw)
        key = (sig_base, scope, bucket)
        pairs.setdefault(key, {"top": None, "bottom": None})
        pairs[key][leg] = c

    specs = [
        (sig_base, scope, bucket, cols["top"], cols["bottom"])
        for (sig_base, scope, bucket), cols in pairs.items()
        if cols["top"] is not None and cols["bottom"] is not None
    ]
    if not specs:
        raise ValueError("No complete top/bottom pairs found for binary/tertile/quintile/decile.")

    def _clean_signal(sig: str) -> str:
        s = sig
        s = re.sub(r"^n_", "", s)  # drop leading 'n_'
        s = s.replace("_material_24m", "").replace("_material", "")  # your conventions
        s = s.replace("_24m", "")
        return s.strip("_")

    # --- SAFE mask converter ---
    def _safe_bool(mask: pd.Series) -> pd.Series:
        if mask.dtype == bool:
            return mask.astype("boolean").fillna(False)
        m = pd.to_numeric(mask, errors="coerce")
        return m.eq(1).astype("boolean").fillna(False)

    # EW & VW helpers on *excess* returns
    def _ew(mask: pd.Series) -> pd.Series:
        m = _safe_bool(mask)
        if not m.any():
            return pd.Series(dtype=float)
        return x.loc[m].groupby(x.loc[m, "_m_"])["_rex_"].mean()

    def _vw(mask: pd.Series) -> pd.Series:
        m = _safe_bool(mask)
        tmp = x.loc[m, ["_m_", "_rex_", "_w_"]].copy()
        if tmp.empty:
            return pd.Series(dtype=float)
        tmp["_w_"] = tmp["_w_"].clip(lower=0)
        sumw = tmp.groupby("_m_")["_w_"].sum()
        sumw = sumw.where(sumw > 0)
        tmp = tmp.join(sumw.rename("_sumw_"), on="_m_")
        tmp["_contrib_"] = tmp["_rex_"] * (tmp["_w_"] / tmp["_sumw_"])
        return tmp.groupby("_m_")["_contrib_"].sum()

    frames = []
    for sig_base, scope, bucket, top_col, bot_col in sorted(specs):
        top_m = x[top_col]
        bot_m = x[bot_col]

        ew_top, ew_bot = _ew(top_m), _ew(bot_m)
        vw_top, vw_bot = _vw(top_m), _vw(bot_m)

        # align unions of months
        idx = ew_top.index.union(ew_bot.index)
        ew_ls_ex = ew_top.reindex(idx) - ew_bot.reindex(idx)

        idx_vw = vw_top.index.union(vw_bot.index)
        vw_ls_ex = vw_top.reindex(idx_vw) - vw_bot.reindex(idx_vw)
        vw_ls_ex = vw_ls_ex.reindex(idx)

        base = _clean_signal(sig_base)
        if scope == "industry":
            base = f"industry_{base}"
        elif scope == "industry_peers":
            base = f"industry_peers_{base}"

        col_ew = f"{base}_{bucket}_ls_ew"
        col_vw = f"{base}_{bucket}_ls_vw"

        frames.append(pd.DataFrame({
            "YearMonth": idx,
            col_ew: ew_ls_ex.values,
            col_vw: vw_ls_ex.values,
        }))

    out = frames[0]
    for f in frames[1:]:
        out = out.merge(f, on="YearMonth", how="outer")

    return out.sort_values("YearMonth").reset_index(drop=True)


In [231]:
df_ls = ls_excess_from_top_bottom_named(
    df,
    date_col="YearMonth",
    ret_col="ret_adj",
    rf_col="RF",
    weight_col="me_lag",
)
df_ls

Unnamed: 0,YearMonth,industry_car_1_decile_ls_ew,industry_car_1_decile_ls_vw,industry_car_1_quintile_ls_ew,industry_car_1_quintile_ls_vw,car_1_binary_ls_ew,car_1_binary_ls_vw,industry_car_5_decile_ls_ew,industry_car_5_decile_ls_vw,industry_car_5_quintile_ls_ew,...,industry_reach_quintile_ls_ew,industry_reach_quintile_ls_vw,reach_binary_ls_ew,reach_binary_ls_vw,industry_severity_decile_ls_ew,industry_severity_decile_ls_vw,industry_severity_quintile_ls_ew,industry_severity_quintile_ls_vw,severity_binary_ls_ew,severity_binary_ls_vw
0,2007-01,0.007277,0.004943,0.001989,0.003948,0.002154,0.017156,0.007702,0.004118,0.001989,...,0.000677,-0.002288,0.023924,0.011437,0.002231,-0.003835,0.001138,-0.001183,0.022697,0.024004
1,2007-02,0.004171,-0.004006,-0.002864,-0.004930,-0.014613,0.002539,0.003416,0.001363,-0.002864,...,-0.003823,-0.002283,-0.002366,-0.013442,-0.000785,-0.012058,-0.004829,-0.006912,-0.001348,-0.007103
2,2007-03,0.033134,-0.014144,0.004864,-0.022428,-0.008588,-0.005240,0.032126,-0.014725,0.006799,...,0.017414,-0.015108,-0.017186,-0.005925,0.019077,-0.018607,0.016364,-0.017561,-0.017539,-0.010587
3,2007-04,0.027454,0.023282,0.021274,0.018881,-0.010676,0.004426,0.029642,0.029143,0.018580,...,-0.005770,0.003953,-0.014080,-0.002504,-0.001157,0.009323,-0.006397,0.003227,-0.012847,0.000329
4,2007-05,0.005101,-0.013952,0.016103,-0.005041,0.014628,-0.002065,0.021386,0.001226,0.004835,...,-0.015807,-0.025250,-0.000177,-0.007805,-0.008937,-0.018273,-0.011880,-0.019277,-0.002662,-0.011352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,2023-08,0.028310,0.000713,0.008130,0.045222,-0.015994,-0.006345,-0.012137,-0.028126,0.006276,...,0.022846,0.061192,-0.015071,-0.008005,-0.023023,-0.033790,0.022846,0.061192,-0.013676,-0.008275
200,2023-09,0.020949,0.006397,-0.015742,0.032288,-0.022612,-0.030304,0.021729,0.004958,-0.008409,...,-0.007669,0.056403,-0.022960,-0.041631,-0.031377,-0.004162,-0.007669,0.056403,-0.022611,-0.034537
201,2023-10,-0.011122,-0.023761,-0.018125,-0.048684,0.000309,0.013658,-0.011122,-0.023761,0.006140,...,-0.001529,-0.059016,0.007342,0.022403,0.020352,-0.000113,0.038828,-0.046660,0.005460,0.017831
202,2023-11,-0.104784,-0.036467,-0.032976,0.010616,0.024487,0.042752,-0.104784,-0.036467,-0.027332,...,-0.023930,0.003578,0.031365,0.052037,0.013674,0.019215,-0.023930,0.003578,0.033881,0.046087


In [233]:
# Merge back FF4
df['YearMonth'] = pd.to_datetime(df['YearMonth']).dt.to_period('M')
df_ls = df_ls.merge(df[['YearMonth', 'MKT_RF', 'SMB', 'HML', 'RF', 'MOM']].drop_duplicates(['YearMonth']), on='YearMonth', how='left')
df_ls

Unnamed: 0,YearMonth,industry_car_1_decile_ls_ew,industry_car_1_decile_ls_vw,industry_car_1_quintile_ls_ew,industry_car_1_quintile_ls_vw,car_1_binary_ls_ew,car_1_binary_ls_vw,industry_car_5_decile_ls_ew,industry_car_5_decile_ls_vw,industry_car_5_quintile_ls_ew,...,industry_severity_decile_ls_vw,industry_severity_quintile_ls_ew,industry_severity_quintile_ls_vw,severity_binary_ls_ew,severity_binary_ls_vw,MKT_RF,SMB,HML,RF,MOM
0,2007-01,0.007277,0.004943,0.001989,0.003948,0.002154,0.017156,0.007702,0.004118,0.001989,...,-0.003835,0.001138,-0.001183,0.022697,0.024004,-0.0196,0.0123,-0.0008,0.0038,-0.0129
1,2007-02,0.004171,-0.004006,-0.002864,-0.004930,-0.014613,0.002539,0.003416,0.001363,-0.002864,...,-0.012058,-0.004829,-0.006912,-0.001348,-0.007103,0.0071,0.0009,-0.0088,0.0043,0.0258
2,2007-03,0.033134,-0.014144,0.004864,-0.022428,-0.008588,-0.005240,0.032126,-0.014725,0.006799,...,-0.018607,0.016364,-0.017561,-0.017539,-0.010587,0.0349,-0.0221,-0.0144,0.0044,-0.0016
3,2007-04,0.027454,0.023282,0.021274,0.018881,-0.010676,0.004426,0.029642,0.029143,0.018580,...,0.009323,-0.006397,0.003227,-0.012847,0.000329,0.0323,0.0016,-0.0059,0.0041,-0.0033
4,2007-05,0.005101,-0.013952,0.016103,-0.005041,0.014628,-0.002065,0.021386,0.001226,0.004835,...,-0.018273,-0.011880,-0.019277,-0.002662,-0.011352,-0.0196,0.0073,-0.0105,0.0040,0.0052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,2023-08,0.028310,0.000713,0.008130,0.045222,-0.015994,-0.006345,-0.012137,-0.028126,0.006276,...,-0.033790,0.022846,0.061192,-0.013676,-0.008275,-0.0523,-0.0251,0.0149,0.0043,0.0033
200,2023-09,0.020949,0.006397,-0.015742,0.032288,-0.022612,-0.030304,0.021729,0.004958,-0.008409,...,-0.004162,-0.007669,0.056403,-0.022611,-0.034537,-0.0315,-0.0386,0.0020,0.0047,0.0167
201,2023-10,-0.011122,-0.023761,-0.018125,-0.048684,0.000309,0.013658,-0.011122,-0.023761,0.006140,...,-0.000113,0.038828,-0.046660,0.005460,0.017831,0.0888,-0.0010,0.0167,0.0044,0.0256
202,2023-11,-0.104784,-0.036467,-0.032976,0.010616,0.024487,0.042752,-0.104784,-0.036467,-0.027332,...,0.019215,-0.023930,0.003578,0.033881,0.046087,0.0485,0.0630,0.0489,0.0043,-0.0553


In [234]:
df_ls.to_csv('Output/portfolio_sum.csv', index=False)

# 6. Portfolios based on residuals

In [185]:
df = pd.read_csv('Output/panel_industries.csv')
df = df[df['YearMonth'] > '2006-12']
df

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,me_lag,month,month_prior,dateff,date_ff,MKT_RF,SMB,HML,RF,MOM
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,1.101515e+09,2007-02,2007-01,2007-02-28,2007-02-28,-0.0196,0.0123,-0.0008,0.0038,-0.0129
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,1.075262e+09,2007-03,2007-02,2007-03-30,2007-03-31,0.0071,0.0009,-0.0088,0.0043,0.0258
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,1.026004e+09,2007-04,2007-03,2007-04-30,2007-04-30,0.0349,-0.0221,-0.0144,0.0044,-0.0016
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,1.138959e+09,2007-05,2007-04,2007-05-31,2007-05-31,0.0323,0.0016,-0.0059,0.0041,-0.0033
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,1.212055e+09,2007-06,2007-05,2007-06-29,2007-06-30,-0.0196,0.0073,-0.0105,0.0040,0.0052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684173,349530.0,2023-08,0,0,0,0,0,0,0,0,...,6.746100e+06,2023-09,2023-08,2023-09-29,2023-09-30,-0.0523,-0.0251,0.0149,0.0043,0.0033
684174,349530.0,2023-09,0,0,0,0,0,0,0,0,...,5.241660e+06,2023-10,2023-09,2023-10-31,2023-10-31,-0.0315,-0.0386,0.0020,0.0047,0.0167
684175,349530.0,2023-10,0,0,0,0,0,0,0,0,...,3.880500e+06,2023-11,2023-10,2023-11-30,2023-11-30,0.0888,-0.0010,0.0167,0.0044,0.0256
684176,349530.0,2023-11,0,0,0,0,0,0,0,0,...,2.448297e+06,2023-12,2023-11,2023-12-29,2023-12-31,0.0485,0.0630,0.0489,0.0043,-0.0553


In [186]:
import statsmodels.api as sm

def _to_month_period(s: pd.Series) -> pd.Series:
    """Coerce to monthly PeriodIndex safely."""
    if isinstance(s.dtype, pd.PeriodDtype):
        try:
            return s.dt.asfreq("M")
        except Exception:
            return s.astype("period[M]")
    return pd.to_datetime(s, errors="coerce").dt.to_period("M")


def crosssec_residuals_month(
        df: pd.DataFrame,
        dep_vars,
        controls,
        industry_col: str = "SICS Codified Industry",
        month_col: str = "YearMonth",
):
    out = df.copy()

    # --- keys & basic checks
    if month_col not in out.columns:
        raise KeyError(f"{month_col} not found.")
    if industry_col not in out.columns:
        raise KeyError(f"Missing industry column: {industry_col}")

    out["_m_"] = _to_month_period(out[month_col])

    # coerce numerics for controls and y's
    for c in controls:
        if c not in out.columns:
            raise KeyError(f"Missing control: {c}")
        out[c] = pd.to_numeric(out[c], errors="coerce")

    for y in dep_vars:
        if y not in out.columns:
            raise KeyError(f"Missing dependent var: {y}")
        out[y] = pd.to_numeric(out[y], errors="coerce")
        out[f"resid_{y}"] = np.nan  # across industries (per month)
        out[f"resid_{y}_industry"] = np.nan  # within industry (per month×industry)

    def _fit_resid(sub: pd.DataFrame, y: str):
        """Return residuals (Series) of y ~ controls on sub; None if underidentified."""
        # drop any NA rows in variables used
        use_cols = [y] + controls
        dat = sub[use_cols].replace([np.inf, -np.inf], np.nan).dropna(how="any")
        if dat.empty:
            return None
        X = sm.add_constant(dat[controls].astype(float), has_constant="add")
        yy = dat[y].astype(float)
        # need more rows than parameters
        if X.shape[0] <= X.shape[1]:
            return None
        try:
            fit = sm.OLS(yy, X).fit()
        except Exception:
            return None
        return (yy - fit.fittedvalues)

    # --- loop by month (across industries)
    for m, idx_m in out.groupby("_m_").groups.items():
        sub_m = out.loc[idx_m]

        for y in dep_vars:
            # across industries (no suffix)
            r = _fit_resid(sub_m, y)
            if r is not None:
                out.loc[r.index, f"resid_{y}"] = r.values

            # within industry (month × industry)
            for ind, idx_mi in sub_m.groupby(industry_col, dropna=False).groups.items():
                sub_mi = out.loc[idx_mi]
                r_i = _fit_resid(sub_mi, y)
                if r_i is not None:
                    out.loc[r_i.index, f"resid_{y}_industry"] = r_i.values

    return out.drop(columns=["_m_"])

In [188]:
#dep_vars = ['n_car_1_material', 'n_car_5_material', 'n_reach_material', 'n_severity_material', 'n_material']
dep_vars = ['n_car_1_material_24m','n_car_5_material_24m','n_reach_material_24m','n_severity_material_24m','n_material_24m']
controls = ['size', 'bm_q', 'lev_q', 'roa_q']

res_df = crosssec_residuals_month(
    res_df,
    dep_vars=dep_vars,
    controls=controls,
    industry_col='SICS Codified Industry ',
    month_col='YearMonth'  # monthly cross-section
)

# sanity check: how many non-missing residuals per dep var?
res_df[[f"resid_{y}" for y in dep_vars]].notna().sum()

resid_n_car_1_material_24m       521699
resid_n_car_5_material_24m       521699
resid_n_reach_material_24m       521699
resid_n_severity_material_24m    521699
resid_n_material_24m             521699
dtype: int64

In [189]:
res_df.dropna(subset=['resid_n_car_1_material'], inplace=True)
res_df

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,resid_n_car_1_material_24m,resid_n_car_1_material_24m_industry,resid_n_car_5_material_24m,resid_n_car_5_material_24m_industry,resid_n_reach_material_24m,resid_n_reach_material_24m_industry,resid_n_severity_material_24m,resid_n_severity_material_24m_industry,resid_n_material_24m,resid_n_material_24m_industry
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,-0.006476,0.000000,-0.008683,0.000000,-0.008174,-0.021153,-0.007689,-0.021153,-0.007967,0.000000
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,-0.015307,0.000000,-0.017807,0.000000,-0.021362,-0.033665,-0.022219,-0.033665,-0.020247,0.000000
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,-0.024872,-0.013894,-0.025775,-0.013894,-0.039734,-0.061711,-0.040592,-0.061711,-0.036391,0.000000
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,-0.031598,-0.014231,-0.035641,-0.014231,-0.051720,-0.081160,-0.052690,-0.081160,-0.050291,0.000000
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,-0.039318,-0.014331,-0.048655,-0.014331,-0.066565,-0.081617,-0.068500,-0.081617,-0.068474,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684164,349530.0,2022-11,0,0,0,0,0,0,0,0,...,1.462127,1.567953,1.402292,2.142263,2.368094,2.639472,2.273180,2.639472,3.717023,3.487777
684165,349530.0,2022-12,0,0,0,0,0,0,0,0,...,1.462090,1.590250,1.402508,2.161199,2.410598,2.657414,2.315148,2.657414,3.732831,3.511751
684166,349530.0,2023-01,0,0,0,0,0,0,0,0,...,1.685352,2.330553,1.663749,3.164411,2.628800,4.010102,2.520299,4.010102,4.011676,5.128373
684167,349530.0,2023-02,0,0,0,0,0,0,0,0,...,1.694900,2.322271,1.663495,3.134583,2.627651,3.986913,2.531167,3.986913,4.051488,5.179705


In [217]:
def select_residual_tails(
    df: pd.DataFrame,
    resid_cols=None,                       # if None: all columns starting with 'resid_'
    date_col: str = "YearMonth",
    industry_col: str = "SICS Codified Industry",
    buckets=("decile","quintile","tertile"),
) -> pd.DataFrame:
    """
    Build top/bottom flags from residuals.
    - For residuals like 'resid_y' (across-industry): bucket within month.
    - For residuals like 'resid_y_industry' (within-industry): bucket within month×industry.
    IMPORTANT: 'top' = LOWEST residuals, 'bottom' = HIGHEST residuals (flipped).
    Ties go to the BOTTOM portfolio only.
    """
    out = df.copy()

    # month key
    if date_col not in out.columns:
        raise KeyError(f"{date_col} not found.")
    if isinstance(out[date_col].dtype, pd.PeriodDtype):
        out["_m_"] = out[date_col].dt.asfreq("M")
    else:
        out["_m_"] = pd.to_datetime(out[date_col], errors="coerce").dt.to_period("M")

    # which residual columns?
    if resid_cols is None:
        resid_cols = [c for c in out.columns if c.startswith("resid_")]
    resid_cols = [c for c in resid_cols if c in out.columns]
    if not resid_cols:
        return out.drop(columns=["_m_"])

    # bucket shares
    share = {"decile":0.10, "quintile":0.20, "tertile":1/3}
    buckets = [b for b in buckets if b in share]
    if not buckets:
        return out.drop(columns=["_m_"])

    # helpers: ties -> bottom
    def _top_flag_low(s: pd.Series, p: float) -> pd.Series:
        x = pd.to_numeric(s, errors="coerce")
        L = x.quantile(p, interpolation="lower")   # low cutoff
        # STRICT: exclude ties from top
        return x.lt(L) & x.notna()

    def _bottom_flag_high(s: pd.Series, p: float) -> pd.Series:
        x = pd.to_numeric(s, errors="coerce")
        H = x.quantile(1 - p, interpolation="higher")  # high cutoff
        # INCLUSIVE: include ties in bottom
        return x.ge(H) & x.notna()

    # groupers
    g_month = out.groupby(["_m_"], observed=True, sort=False)
    g_m_ind = (out.groupby(["_m_", industry_col], observed=True, sort=False)
               if industry_col in out.columns else None)

    for col in resid_cols:
        out[col] = pd.to_numeric(out[col], errors="coerce")
        is_within_industry = col.endswith("_industry")

        if is_within_industry:
            if g_m_ind is None:
                raise KeyError(f"Industry column '{industry_col}' missing for within-industry residuals.")
            grouper = g_m_ind
            suf = "_industry"     # keep your existing naming convention
        else:
            grouper = g_month
            suf = ""

        for b in buckets:
            p = share[b]
            top = grouper[col].transform(lambda s, p=p: _top_flag_low(s, p)).astype("bool")
            bottom = grouper[col].transform(lambda s, p=p: _bottom_flag_high(s, p)).astype("bool")

            # ensure exclusivity: remove any overlap from bottom
            bottom &= ~top

            out[f"{col}_top_{b}{suf}"]    = top.astype("Int8")
            out[f"{col}_bottom_{b}{suf}"] = bottom.astype("Int8")

    return out.drop(columns=["_m_"])


In [218]:
df_with_flags = select_residual_tails(
    res_df,
    resid_cols=None,
    date_col="YearMonth",
    industry_col="SICS Codified Industry ",
    buckets=("decile", "quintile", "tertile")
)

df_with_flags

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material,n_material_24m,n_nonmaterial_24m,...,resid_n_material_24m_top_quintile,resid_n_material_24m_bottom_quintile,resid_n_material_24m_top_tertile,resid_n_material_24m_bottom_tertile,resid_n_material_24m_industry_top_decile_industry,resid_n_material_24m_industry_bottom_decile_industry,resid_n_material_24m_industry_top_quintile_industry,resid_n_material_24m_industry_bottom_quintile_industry,resid_n_material_24m_industry_top_tertile_industry,resid_n_material_24m_industry_bottom_tertile_industry
0,1004.0,2007-01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,1004.0,2007-02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,1004.0,2007-03,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,1
3,1004.0,2007-04,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,1
4,1004.0,2007-05,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684164,349530.0,2022-11,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,1,0,1
684165,349530.0,2022-12,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,1,0,1
684166,349530.0,2023-01,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,1,0,1
684167,349530.0,2023-02,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,1,0,1


In [195]:
def portfolio_membership_counts(df, date_col="YearMonth", id_col=None):
    """
    Count portfolio membership per month for columns like:
      <signal>_(top|bottom)_(decile|quintile|tertile)[_industry]
      industry_<signal>_(top|bottom)_(decile|quintile|tertile)

    If id_col is None  -> count rows (firm-month obs).
    If id_col provided -> count unique ids (e.g., cusip, permno).
    """
    x = df.copy()
    x["_m_"] = _to_month_period(x[date_col])

    # detect top/bottom indicators (both suffix- and prefix-style), incl. tertile
    pat_suf = re.compile(
        r"^(?P<signal>.+)_(?P<leg>top|bottom)_(?P<bucket>decile|quintile|tertile)(?P<scope>_industry)?$")
    pat_pre = re.compile(r"^industry_(?P<signal>.+)_(?P<leg>top|bottom)_(?P<bucket>decile|quintile|tertile)$")

    cols = []
    for c in x.columns:
        m1, m2 = pat_suf.match(c), pat_pre.match(c)
        if not (m1 or m2):
            continue
        d = (m1 or m2).groupdict()
        scope = "industry" if (m1 and d.get("scope")) or m2 else "overall"
        key = f"{d['signal']}__{scope}__{d['bucket']}__{d['leg']}"
        cols.append((c, key))

    if not cols:
        return pd.DataFrame()

    # ensure numeric 0/1
    for c, _ in cols:
        x[c] = pd.to_numeric(x[c], errors="coerce").fillna(0).astype(int)

    rows = []
    for c, key in cols:
        mask = x[c] == 1
        if id_col:
            # unique ID count per month
            cnt = (x.loc[mask, ["_m_", id_col]]
                   .dropna(subset=[id_col])
                   .groupby("_m_")[id_col].nunique())
        else:
            # row count per month
            cnt = x.loc[mask].groupby("_m_").size()
        rows.append(cnt.rename(key))

    out = (pd.concat(rows, axis=1)
           .fillna(0)
           .astype(int)
           .reset_index()
           .rename(columns={"_m_": "YearMonth"}))
    return out


membership_rows = portfolio_membership_counts(df_with_flags, date_col="YearMonth")
membership_rows

Unnamed: 0,YearMonth,resid_n_car_1_material__overall__decile__top,resid_n_car_1_material__overall__decile__bottom,resid_n_car_1_material__overall__quintile__top,resid_n_car_1_material__overall__quintile__bottom,resid_n_car_1_material__overall__tertile__top,resid_n_car_1_material__overall__tertile__bottom,resid_n_car_1_material_industry__industry__decile__top,resid_n_car_1_material_industry__industry__decile__bottom,resid_n_car_1_material_industry__industry__quintile__top,...,resid_n_material_24m__overall__quintile__top,resid_n_material_24m__overall__quintile__bottom,resid_n_material_24m__overall__tertile__top,resid_n_material_24m__overall__tertile__bottom,resid_n_material_24m_industry__industry__decile__top,resid_n_material_24m_industry__industry__decile__bottom,resid_n_material_24m_industry__industry__quintile__top,resid_n_material_24m_industry__industry__quintile__bottom,resid_n_material_24m_industry__industry__tertile__top,resid_n_material_24m_industry__industry__tertile__bottom
0,2007-01,295,296,591,592,986,987,53,2410,110,...,591,592,986,987,35,2577,74,2616,124,2666
1,2007-02,295,296,591,592,985,986,76,2191,158,...,591,592,985,986,89,2065,185,2161,307,2283
2,2007-03,294,295,588,589,980,981,71,2237,146,...,588,589,980,981,125,1699,261,1835,433,2007
3,2007-04,285,286,570,571,950,951,5,2761,12,...,570,571,950,951,115,1706,242,1833,402,1993
4,2007-05,283,284,566,567,943,944,17,2616,38,...,566,567,943,944,130,1538,271,1679,453,1861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,2023-08,261,262,522,523,870,871,150,1117,312,...,522,523,870,871,221,320,472,571,788,890
200,2023-09,259,260,518,519,863,864,180,758,382,...,518,519,863,864,217,316,467,566,782,885
201,2023-10,258,259,516,517,860,861,165,942,343,...,516,517,860,861,224,285,475,536,793,859
202,2023-11,256,257,512,513,853,854,191,651,402,...,512,513,853,854,222,283,473,534,785,849


In [219]:
def ls_excess_from_indicators(
        df: pd.DataFrame,
        date_col: str = "YearMonth",
        ret_col: str = "ret_adj",
        rf_col: str = "RF",
        weight_col: str = "me_lag",
):
    """
    Long–short *excess* returns (EW & VW) from binary flags with these names:
      overall:          <signal>_(top|bottom)_(tertile|quintile|decile)
      within industry:  <signal>_(top|bottom)_(tertile|quintile|decile)_industry
    (Legacy prefix 'industry_<signal>_...' is also supported → within industry.)

    Outputs per spec:
      <base>_<bucket>_ls_ew, <base>_<bucket>_ls_vw
    where <base> is a cleaned signal; prefixed with 'industry_' only for within-industry.
    """
    x = df.copy()

    # Month key → Period[M]
    if isinstance(x[date_col].dtype, pd.PeriodDtype):
        try:
            x["_m_"] = x[date_col].dt.asfreq("M")
        except Exception:
            x["_m_"] = x[date_col].astype("period[M]")
    else:
        x["_m_"] = pd.to_datetime(x[date_col].astype(str), errors="coerce").dt.to_period("M")

    # Excess returns & weights
    x["_rex_"] = (
            pd.to_numeric(x.get(ret_col, np.nan), errors="coerce")
            - pd.to_numeric(x.get(rf_col, 0.0), errors="coerce").fillna(0.0)
    )
    x["_w_"] = pd.to_numeric(x.get(weight_col, np.nan), errors="coerce")

    # Recognize columns: overall (no suffix) or within-industry (suffix _industry or legacy prefix)
    pat_suffix = re.compile(
        r"^(?P<signal>.+)_(?P<leg>top|bottom)_(?P<bucket>tertile|quintile|decile)"
        r"(?P<scope>_industry)?$"  # only '' or '_industry'
    )
    pat_prefix = re.compile(
        r"^(?P<pfx>industry_)(?P<signal>.+)_(?P<leg>top|bottom)_(?P<bucket>tertile|quintile|decile)$"
    )

    def scope_from(groups):
        s = groups.get("scope") or ""
        return "industry" if s == "_industry" else "overall"

    # Collect complete top/bottom pairs by (signal, scope, bucket)
    pairs = {}
    for c in x.columns:
        m = pat_suffix.match(c)
        if m:
            d = m.groupdict()
            key = (d["signal"], scope_from(d), d["bucket"])
        else:
            m = pat_prefix.match(c)
            if not m:
                continue
            d = m.groupdict()
            key = (d["signal"], "industry", d["bucket"])
        pairs.setdefault(key, {"top": None, "bottom": None})
        pairs[key][d["leg"]] = c

    specs = [
        (sig, scp, bkt, cols["top"], cols["bottom"])
        for (sig, scp, bkt), cols in pairs.items()
        if cols["top"] is not None and cols["bottom"] is not None
    ]
    if not specs:
        raise ValueError("No complete top/bottom pairs found for overall/industry.")

    # Clean signal name for output
    def _clean_signal(sig: str) -> str:
        s = sig
        s = re.sub(r"^n_", "", s)
        s = s.replace("resid_", "")
        s = s.replace("_material_24m", "").replace("_material", "")
        s = s.replace("_24m", "")
        return s.strip("_")

    # Safe mask conversion
    def _safe_mask(col: pd.Series) -> pd.Series:
        if col.dtype == bool:
            return col.astype("boolean").fillna(False)
        return pd.to_numeric(col, errors="coerce").eq(1).astype("boolean").fillna(False)

    # EW/VW on *excess* returns
    def _ew(mask: pd.Series) -> pd.Series:
        m = _safe_mask(mask)
        if not m.any():
            return pd.Series(dtype=float)
        return x.loc[m].groupby(x.loc[m, "_m_"])["_rex_"].mean()

    def _vw(mask: pd.Series) -> pd.Series:
        m = _safe_mask(mask)
        tmp = x.loc[m, ["_m_", "_rex_", "_w_"]].copy()
        if tmp.empty:
            return pd.Series(dtype=float)
        tmp["_w_"] = tmp["_w_"].clip(lower=0)
        sumw = tmp.groupby("_m_")["_w_"].sum()
        sumw = sumw.where(sumw > 0)
        tmp = tmp.join(sumw.rename("_sumw_"), on="_m_")
        tmp["_contrib_"] = tmp["_rex_"] * (tmp["_w_"] / tmp["_sumw_"])
        return tmp.groupby("_m_")["_contrib_"].sum()

    # Build LS series
    frames = []
    for sig, scp, bkt, top_col, bot_col in sorted(specs):
        ew_top, ew_bot = _ew(x[top_col]), _ew(x[bot_col])
        vw_top, vw_bot = _vw(x[top_col]), _vw(x[bot_col])

        idx = ew_top.index.union(ew_bot.index)
        ew_ls = ew_top.reindex(idx) - ew_bot.reindex(idx)

        idx_vw = vw_top.index.union(vw_bot.index)
        vw_ls = vw_top.reindex(idx_vw) - vw_bot.reindex(idx_vw)
        vw_ls = vw_ls.reindex(idx)

        base = _clean_signal(sig)
        if scp == "industry":
            base = f"industry_{base}"

        frames.append(pd.DataFrame({
            "YearMonth": idx,
            f"{base}_{bkt}_ls_ew": ew_ls.values,
            f"{base}_{bkt}_ls_vw": vw_ls.values,
        }))

    out = frames[0]
    for f in frames[1:]:
        out = out.merge(f, on="YearMonth", how="outer")

    return out.sort_values("YearMonth").reset_index(drop=True)

In [238]:
# df_with_indicators: your dataframe that already contains the *_top_* and *_bottom_* flags
ls_ret = ls_excess_from_indicators(df_with_flags)
ls_ret

Unnamed: 0,YearMonth,n_car_1_decile_ls_ew_x,n_car_1_decile_ls_vw_x,n_car_1_quintile_ls_ew_x,n_car_1_quintile_ls_vw_x,n_car_1_tertile_ls_ew_x,n_car_1_tertile_ls_vw_x,n_car_1_decile_ls_ew_y,n_car_1_decile_ls_vw_y,n_car_1_quintile_ls_ew_y,...,industry_n_severity_industry_quintile_ls_ew_x,industry_n_severity_industry_quintile_ls_vw_x,industry_n_severity_industry_tertile_ls_ew_x,industry_n_severity_industry_tertile_ls_vw_x,industry_n_severity_industry_decile_ls_ew_y,industry_n_severity_industry_decile_ls_vw_y,industry_n_severity_industry_quintile_ls_ew_y,industry_n_severity_industry_quintile_ls_vw_y,industry_n_severity_industry_tertile_ls_ew_y,industry_n_severity_industry_tertile_ls_vw_y
0,2007-01,-0.011608,0.027010,-0.014990,0.027870,-0.011199,0.025253,-0.011608,0.027010,-0.014990,...,0.004073,0.020268,0.007607,0.021354,0.005867,0.017249,0.004073,0.020268,0.007607,0.021354
1,2007-02,0.023792,-0.001560,0.021668,-0.000780,0.014970,-0.000406,0.018285,-0.002150,0.021342,...,-0.001991,-0.005625,0.004488,-0.004006,-0.011640,-0.009861,0.001484,-0.005812,0.005998,-0.004191
2,2007-03,-0.000206,-0.003245,0.010853,-0.002470,0.001556,-0.002962,-0.001574,-0.004578,0.010594,...,-0.000147,0.000230,0.001537,-0.000073,0.004748,0.012592,0.004825,0.007921,0.006692,0.007591
3,2007-04,0.029209,0.016530,0.030186,0.016776,0.029605,0.016780,0.025281,-0.001377,0.037019,...,0.024168,0.005923,0.024664,0.006951,0.032032,0.016191,0.036962,0.016951,0.031917,0.016610
4,2007-05,-0.041376,-0.006848,-0.031918,-0.007308,-0.030887,-0.008722,-0.038299,0.010754,-0.026516,...,0.004314,0.021641,-0.002658,0.015363,-0.001695,0.019257,0.003482,0.016398,0.000013,0.015459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,2023-08,-0.008932,-0.005007,-0.002707,-0.006786,0.007327,-0.006915,-0.016707,-0.017475,-0.012885,...,-0.024098,-0.008724,-0.013483,-0.010958,-0.006419,-0.001044,-0.006926,-0.002650,-0.011183,-0.004977
200,2023-09,0.032432,0.008557,0.035469,0.004133,0.028862,0.001953,0.022388,-0.018016,0.016912,...,-0.005198,-0.034062,0.000350,-0.029074,0.030740,-0.014674,0.012007,-0.016603,0.012202,-0.018707
201,2023-10,0.014341,0.021605,0.001246,0.020323,-0.008353,0.020209,0.012509,0.010510,-0.000552,...,-0.009256,-0.003572,0.005963,-0.007328,0.023563,-0.004815,0.012933,0.000447,0.004748,-0.002840
202,2023-11,0.008895,0.034374,-0.027541,0.038095,-0.054642,0.041153,0.022856,0.041950,-0.015539,...,0.026213,0.039872,0.021766,0.035368,0.014455,0.023869,0.022893,0.029421,0.017590,0.028739


In [239]:
# remove _x from column names and replace _y with 24m
df_ls = ls_ret.rename(columns=lambda c: c.replace('_x', '').replace('_y', '_24m'))

In [241]:
# Merge back FF4
df_ls = df_ls.merge(df[['YearMonth', 'MKT_RF', 'SMB', 'HML', 'RF', 'MOM']].drop_duplicates(['YearMonth']), on='YearMonth', how='left')

In [242]:
df_ls

Unnamed: 0,YearMonth,n_car_1_decile_ls_ew,n_car_1_decile_ls_vw,n_car_1_quintile_ls_ew,n_car_1_quintile_ls_vw,n_car_1_tertile_ls_ew,n_car_1_tertile_ls_vw,n_car_1_decile_ls_ew_24m,n_car_1_decile_ls_vw_24m,n_car_1_quintile_ls_ew_24m,...,industry_n_severity_industry_decile_ls_vw_24m,industry_n_severity_industry_quintile_ls_ew_24m,industry_n_severity_industry_quintile_ls_vw_24m,industry_n_severity_industry_tertile_ls_ew_24m,industry_n_severity_industry_tertile_ls_vw_24m,MKT_RF,SMB,HML,RF,MOM
0,2007-01,-0.011608,0.027010,-0.014990,0.027870,-0.011199,0.025253,-0.011608,0.027010,-0.014990,...,0.017249,0.004073,0.020268,0.007607,0.021354,-0.0196,0.0123,-0.0008,0.0038,-0.0129
1,2007-02,0.023792,-0.001560,0.021668,-0.000780,0.014970,-0.000406,0.018285,-0.002150,0.021342,...,-0.009861,0.001484,-0.005812,0.005998,-0.004191,0.0071,0.0009,-0.0088,0.0043,0.0258
2,2007-03,-0.000206,-0.003245,0.010853,-0.002470,0.001556,-0.002962,-0.001574,-0.004578,0.010594,...,0.012592,0.004825,0.007921,0.006692,0.007591,0.0349,-0.0221,-0.0144,0.0044,-0.0016
3,2007-04,0.029209,0.016530,0.030186,0.016776,0.029605,0.016780,0.025281,-0.001377,0.037019,...,0.016191,0.036962,0.016951,0.031917,0.016610,0.0323,0.0016,-0.0059,0.0041,-0.0033
4,2007-05,-0.041376,-0.006848,-0.031918,-0.007308,-0.030887,-0.008722,-0.038299,0.010754,-0.026516,...,0.019257,0.003482,0.016398,0.000013,0.015459,-0.0196,0.0073,-0.0105,0.0040,0.0052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,2023-08,-0.008932,-0.005007,-0.002707,-0.006786,0.007327,-0.006915,-0.016707,-0.017475,-0.012885,...,-0.001044,-0.006926,-0.002650,-0.011183,-0.004977,-0.0523,-0.0251,0.0149,0.0043,0.0033
200,2023-09,0.032432,0.008557,0.035469,0.004133,0.028862,0.001953,0.022388,-0.018016,0.016912,...,-0.014674,0.012007,-0.016603,0.012202,-0.018707,-0.0315,-0.0386,0.0020,0.0047,0.0167
201,2023-10,0.014341,0.021605,0.001246,0.020323,-0.008353,0.020209,0.012509,0.010510,-0.000552,...,-0.004815,0.012933,0.000447,0.004748,-0.002840,0.0888,-0.0010,0.0167,0.0044,0.0256
202,2023-11,0.008895,0.034374,-0.027541,0.038095,-0.054642,0.041153,0.022856,0.041950,-0.015539,...,0.023869,0.022893,0.029421,0.017590,0.028739,0.0485,0.0630,0.0489,0.0043,-0.0553


In [243]:
df_ls.to_csv('Output/portfolio_residual.csv', index=False)

# 7. Sample selection

In [107]:
# Reprisk sample
reprisk_incidents = pd.read_csv(
    'data/wrds_reprisk_incidents_ids_material_short_all.csv')  # Calculation in "CreateDataset" of "ESGmateriality" project
reprisk_incidents = reprisk_incidents[
    ['gvkey', 'isin', 'cusip', 'reprisk_id', 'story_id', 'adopter', 'sic', 'SICS Codified Industry ',
     'Codified SICS Sector ', 'severity', 'reach', 'novelty', 'incident_list', 'incident_date', 'car_5', 'car_1',
     'material_flag']]
# Create Year, YearMonth, YearQuarter columns
reprisk_incidents['incident_date'] = pd.to_datetime(reprisk_incidents['incident_date'])
reprisk_incidents['Year'] = reprisk_incidents['incident_date'].dt.year
reprisk_incidents['YearMonth'] = reprisk_incidents['incident_date'].dt.to_period('M')
reprisk_incidents['YearQuarter'] = reprisk_incidents['incident_date'].dt.to_period('Q')
reprisk_incidents['incident_list'] = reprisk_incidents['incident_list'].apply(ast.literal_eval)
exploded = reprisk_incidents[
    ['SICS Codified Industry ', 'Year', 'YearQuarter', 'YearMonth', 'reach', 'car_1', 'car_5', 'severity',
     'incident_list']] \
    .explode('incident_list') \
    .rename(columns={'incident_list': 'type'})

In [108]:
def pick_types_by_rolling_12m_period_dropna_balanced(
        df,
        industry_col='SICS Codified Industry ',
        ym_col='YearMonth',  # must be Period[M]
        type_col='type',
        metrics=('reach', 'severity', 'car_1', 'car_5'),
        window=12
):
    d = df.copy()
    d[ym_col] = d[ym_col].astype('period[M]')
    d = d[[industry_col, ym_col, type_col, *metrics]].copy()

    # numeric coercion, keep NaNs
    for m in metrics:
        d[m] = pd.to_numeric(d[m], errors='coerce')

    # aggregate to unique (industry, type, month)
    d = (
        d.groupby([industry_col, type_col, ym_col], as_index=False)[list(metrics)]
        .sum(min_count=1)
    )

    # ---- Balanced panel over global YearMonth (for existing (industry,type) pairs only)
    global_idx = pd.period_range(d[ym_col].min(), d[ym_col].max(), freq='M')
    pairs = d[[industry_col, type_col]].drop_duplicates()
    full = (
        pairs.assign(_k=1)
        .merge(pd.DataFrame({ym_col: global_idx, '_k': 1}), on='_k')
        .drop(columns='_k')
        .set_index([industry_col, type_col, ym_col])
    )
    base = d.set_index([industry_col, type_col, ym_col])
    d = full.join(base, how='left').reset_index()

    # rolling 12m sums by (industry, type); NaNs remain NaN
    d = d.sort_values([industry_col, type_col, ym_col])
    for m in metrics:
        d[f'roll12_{m}'] = (
            d.groupby([industry_col, type_col], group_keys=False)[m]
            .transform(lambda s: s.rolling(window, min_periods=1).sum())
        )

    by = [industry_col, ym_col]

    def pick_one(rollcol, take='max'):
        tmp = d[[industry_col, ym_col, type_col, rollcol]].dropna(subset=[rollcol]).copy()
        # tie-breaker: alphabetical type; for min we sort ascending by value, for max we invert via sort order
        asc_val = (take == 'min')
        tmp = tmp.sort_values(by + [rollcol, type_col],
                              ascending=[True, True, asc_val, True])
        picked = tmp.drop_duplicates(subset=by, keep='first')
        metric = rollcol.replace('roll12_', '')
        picked = picked.rename(columns={
            type_col: f'{metric}_type',
            rollcol: f'{metric}_roll12'
        })
        return picked[[industry_col, ym_col, f'{metric}_type', f'{metric}_roll12']]

    reach_out = pick_one('roll12_reach', 'max')
    severity_out = pick_one('roll12_severity', 'max')
    car1_out = pick_one('roll12_car_1', 'min')
    car5_out = pick_one('roll12_car_5', 'min')

    res = (reach_out.merge(severity_out, on=by, how='outer')
           .merge(car1_out, on=by, how='outer')
           .merge(car5_out, on=by, how='outer')
           .sort_values(by)
           .reset_index(drop=True))
    return res


# ---------------- Usage ----------------
result = pick_types_by_rolling_12m_period_dropna_balanced(exploded)
# Merge back to original dataframe
final = reprisk_incidents.merge(
    result,
    on=['SICS Codified Industry ', 'YearMonth'],
    how='left'
)


def _in_list(t, lst):
    if pd.isna(t):
        return 0
    return int(str(t) in lst)


# --- 3) create materiality flags
final['materiality_reach'] = [_in_list(t, L) for t, L in zip(final['reach_type'], final['incident_list'])]
final['materiality_severity'] = [_in_list(t, L) for t, L in zip(final['severity_type'], final['incident_list'])]
final['materiality_car_1'] = [_in_list(t, L) for t, L in zip(final['car_1_type'], final['incident_list'])]
final['materiality_car_5'] = [_in_list(t, L) for t, L in zip(final['car_5_type'], final['incident_list'])]
final

Unnamed: 0,gvkey,isin,cusip,reprisk_id,story_id,adopter,sic,SICS Codified Industry,Codified SICS Sector,severity,...,severity_type,severity_roll12,car_1_type,car_1_roll12,car_5_type,car_5_roll12,materiality_reach,materiality_severity,materiality_car_1,materiality_car_5
0,210418.0,US0003752047,000375204,2,1251.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,1.0,...,violation_of_natl_legislation,18.0,impacts_on_communities,0.009035,impacts_on_communities,0.016264,0,0,1,1
1,210418.0,US0003752047,000375204,2,1305.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,violation_of_natl_legislation,18.0,impacts_on_communities,0.009035,impacts_on_communities,0.016264,0,0,0,0
2,210418.0,US0003752047,000375204,2,4029.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,human_rights_abuses,41.0,climate_ghg_pollution,-0.129756,impacts_on_communities,-0.282756,0,1,0,1
3,210418.0,US0003752047,000375204,2,4488.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,2.0,...,human_rights_abuses,54.0,climate_ghg_pollution,-0.129756,supply_chain_issues,-0.250865,1,1,0,0
4,210418.0,US0003752047,000375204,2,4756.0,1,3613.0,Electrical & Electronic Equipment,Resource Transformation,1.0,...,violation_of_natl_legislation,11.0,impacts_on_communities,-0.004342,impacts_on_communities,-0.007518,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331365,355240.0,THB131010001,,2683218,,1,5900.0,Coal Operations,Extractives & Minerals Processing,0.0,...,,,,,,,0,0,0,0
331366,349631.0,PK0126301016,,2684126,,1,3411.0,Containers & Packaging,Resource Transformation,0.0,...,,,,,,,0,0,0,0
331367,350690.0,GB00BLNNFY18,,2685085,,1,6726.0,Household & Personal Products,Consumer Goods,0.0,...,,,,,,,0,0,0,0
331368,350726.0,AU0000180200,,2685460,,1,1400.0,Chemicals,Resource Transformation,0.0,...,,,,,,,0,0,0,0


In [112]:
df = final.copy()

# Coerce flags to numeric and treat missing as 0
flag_cols = [
    'material_flag',
    'materiality_car_1', 'materiality_car_5', 'materiality_reach', 'materiality_severity'
]
df[flag_cols] = df[flag_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

key = ["gvkey", "YearMonth"]

# 1) Aggregate incidents per firm-quarter
agg_fq = (
    df.groupby(key, dropna=False)
    .agg(
        n_material=('material_flag', lambda s: (s == 1).sum()),
        n_nonmaterial=('material_flag', lambda s: (s == 0).sum()),
        n_car_1_material=('materiality_car_1', lambda s: (s == 1).sum()),
        n_car_5_material=('materiality_car_5', lambda s: (s == 1).sum()),
        n_reach_material=('materiality_reach', lambda s: (s == 1).sum()),
        n_severity_material=('materiality_severity', lambda s: (s == 1).sum()),
    )
    .reset_index()
)

# 2) Build the BALANCED universe = all firms × all quarters (global min..max)
firms = (
    df['gvkey']
    .dropna()
    .unique()
)
qmin = df['YearMonth'].min()
qmax = df['YearMonth'].max()
quarters = pd.period_range(qmin, qmax, freq='M')

balanced_idx = pd.MultiIndex.from_product([firms, quarters], names=key)
balanced_universe = pd.DataFrame(index=balanced_idx).reset_index()

# 3) Merge counts onto balanced universe and fill missing with zeros
reprisk = (
    balanced_universe
    .merge(agg_fq, on=key, how='left')
    .fillna(0)
    .sort_values(key)
    .reset_index(drop=True)
)

# 4) Cast counts to ints
count_cols = [
    'n_material', 'n_nonmaterial', 'n_car_1_material', 'n_car_5_material', 'n_reach_material', 'n_severity_material'
]
reprisk[count_cols] = reprisk[count_cols].astype('int16')

reprisk

Unnamed: 0,gvkey,YearMonth,n_material,n_nonmaterial,n_car_1_material,n_car_5_material,n_reach_material,n_severity_material
0,1004.0,2007-01,0,0,0,0,0,0
1,1004.0,2007-02,0,0,0,0,0,0
2,1004.0,2007-03,0,0,0,0,0,0
3,1004.0,2007-04,0,0,0,0,0,0
4,1004.0,2007-05,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
10640431,367496.0,2023-08,0,0,0,0,0,0
10640432,367496.0,2023-09,0,0,0,0,0,0
10640433,367496.0,2023-10,0,0,0,0,0,0
10640434,367496.0,2023-11,0,0,0,0,0,0


In [116]:
reprisk_merged = reprisk.merge(
    reprisk_incidents[['gvkey', 'cusip', 'SICS Codified Industry ', 'Codified SICS Sector ']].drop_duplicates(),
    on='gvkey', how='left')
reprisk_merged['year'] = reprisk_merged['YearMonth'].dt.year
agg_cols = [
    'n_material',
    'n_car_1_material',
    'n_car_5_material',
    'n_reach_material',
    'n_severity_material',
]

industry_col = 'SICS Codified Industry '  # use your exact column name
ym_col = 'YearMonth'  # can be string or Period[M]

# 1) Industry-level monthly counts (sum across firms)
industry_counts = (
    reprisk_merged.groupby([industry_col, ym_col], as_index=False)[agg_cols]
    .sum(min_count=1)  # keeps NaN if an entire group/col is NaN
    .rename(columns={c: f'industry_{c}' for c in agg_cols})
)

# 2) (Optional) merge back so each firm-month has the industry counts attached
df_with_industry = reprisk_merged.merge(industry_counts, on=[industry_col, ym_col], how='left')


In [126]:
df_with_industry['YearQuarter'] = df_with_industry['YearMonth'].dt.to_timestamp().dt.to_period('Q')

In [127]:
df_with_industry.isna().sum()

gvkey                                 0
YearMonth                             0
n_material                            0
n_nonmaterial                         0
n_car_1_material                      0
n_car_5_material                      0
n_reach_material                      0
n_severity_material                   0
cusip                           8252820
SICS Codified Industry                0
Codified SICS Sector                  0
year                                  0
industry_n_material                   0
industry_n_car_1_material             0
industry_n_car_5_material             0
industry_n_reach_material             0
industry_n_severity_material          0
YearQuarter                           0
dtype: int64

In [128]:
df_with_industry['gvkey'].nunique()

52159

In [129]:
df_with_industry.groupby('YearQuarter')['gvkey'].nunique().sum()

np.int64(3546812)

In [133]:
reprisk_incidents = pd.read_csv(
    'data/incidents_rolling_industry_all.csv')
reprisk_incidents.isna().sum()

gvkey                                 0
YearMonth                             0
n_material                            0
n_nonmaterial                         0
n_car_1_material                      0
n_car_5_material                      0
n_reach_material                      0
n_severity_material                   0
cusip                                 0
SICS Codified Industry                0
Codified SICS Sector                  0
year                                  0
industry_n_material                   0
industry_n_car_1_material             0
industry_n_car_5_material             0
industry_n_reach_material             0
industry_n_severity_material          0
date                                  0
mktrf                                 0
smb                                   0
hml                                   0
rf                                    0
umd                                   0
YearQuarter                           0
YearQuarter_prior                     0


In [102]:
reprisk_incidents['gvkey'].nunique()

11718

In [103]:
reprisk_incidents.groupby('YearQuarter')['gvkey'].nunique().sum()

np.int64(796824)

In [134]:
non_na = reprisk_incidents.dropna(subset=['size', 'bm_q', 'lev_q', 'roa_q'])

In [135]:
non_na['gvkey'].nunique()

8705

In [136]:
non_na.groupby('YearQuarter')['gvkey'].nunique().sum()

np.int64(260595)

In [137]:
non_na.groupby('YearQuarter')['gvkey'].nunique()

YearQuarter
2007Q1    4556
2007Q2    4374
2007Q3    4340
2007Q4    4308
2008Q1    4476
          ... 
2022Q4    3578
2023Q1    3773
2023Q2    3487
2023Q3    3489
2023Q4    3388
Name: gvkey, Length: 68, dtype: int64