In [1]:
import pandas as pd

df = pd.read_csv('imputed.csv')
df['strain_period']

0       Wildtype (Jan 2020 - Nov 2020)
1          Alpha (Dec 2020 - Jan 2021)
2       Wildtype (Jan 2020 - Nov 2020)
3       Wildtype (Jan 2020 - Nov 2020)
4       Wildtype (Jan 2020 - Nov 2020)
                     ...              
2723    Wildtype (Jan 2020 - Nov 2020)
2724    Wildtype (Jan 2020 - Nov 2020)
2725    Wildtype (Jan 2020 - Nov 2020)
2726       Alpha (Dec 2020 - Jan 2021)
2727      Delta (July 2021 - Nov 2021)
Name: strain_period, Length: 2728, dtype: object

In [2]:
df = pd.read_csv("imputed.csv")
for period in sorted(df['strain_period'].unique()):
    df_sub = df[df['strain_period'] == period]
    fname = f"imputed strain_{period}.csv"
    df_sub.to_csv(fname, index=False)
    print(f"Saved {len(df_sub)} rows to {fname}")

Saved 228 rows to imputed strain_Alpha (Dec 2020 - Jan 2021).csv
Saved 432 rows to imputed strain_Delta (July 2021 - Nov 2021).csv
Saved 157 rows to imputed strain_Later Omicron (BA.4, BA.5, XBB, July 2022+).csv
Saved 297 rows to imputed strain_Omicron (Dec 2021 - June 2022).csv
Saved 1614 rows to imputed strain_Wildtype (Jan 2020 - Nov 2020).csv


In [20]:
import pandas as pd
import numpy as np
from scipy import stats

def n_pcts(s):
    n = int(s.sum())
    pct = 100*n/len(s) if len(s)>0 else np.nan
    return f"{n} ({pct:.1f}%)"

groups = [('All', df)] + [
    (f"Period {p}", df[df['strain_period']==p])
    for p in sorted(df['strain_period'].dropna().unique())
]

rows = []
for name, sub in groups:
    age_mean, age_std = sub['age'].mean(), sub['age'].std()
    male = (sub['sex']=='M') if sub['sex'].dtype == object else (sub['sex']==1)
    rows.append({
        'Group': name,
        'N': len(sub),
        'Age, mean (SD)': f'{age_mean:.1f} ({age_std:.1f})',
        'Male, n (%)': n_pcts(male),
        'bad_hosp_dispo': n_pcts(sub['hosp_dispo_category']),
        'Inflam, n (%)': n_pcts(sub['com_neuro_inflam']),
        'Stroke, n (%)': n_pcts(sub['com_neuro_strok']),
        'MUSCC': n_pcts(sub['com_neuro_musc']),
        'dementia': n_pcts(sub['com_neuro_dem']),
        'psych': n_pcts(sub['com_neuro_psych']),
        'headache': n_pcts(sub['com_neuro_ha']),
        'TBI': n_pcts(sub['com_neuro_tbi']),
        'TME': n_pcts(sub['com_neuro_tme']),
        'movement': n_pcts(sub['com_neuro_move']),
        'BSTUM': n_pcts(sub['com_neuro_bstum']),
        'Cord': n_pcts(sub['com_neuro_cord']),
        'opthalmology': n_pcts(sub['com_neuro_opth']),
        'seizures': n_pcts(sub['com_neuro_sz']),
        'dysautonomia': n_pcts(sub['com_neuro_dysau']),
        'pain': n_pcts(sub['com_neuro_pain']),
        
    })
    
summary = pd.DataFrame(rows)

df2 = df.dropna(subset=['strain_period'])
periods = df2['strain_period'].dropna().unique()

def chi_p(col):
    table = pd.crosstab(df2['strain_period'], df2[col])
    chi2, p, _, _ = stats.chi2_contingency(table)
    return p

# mort_p = chi_p('mortality')
infl_p = chi_p('com_neuro_inflam')
stroke_p = chi_p('com_neuro_strok')

summary = summary.append({
    'Group': 'P-Value X2',
    'N': '',
}, ignore_index=True)

print(summary.to_string(index=False))

                                             Group    N Age, mean (SD)  Male, n (%) bad_hosp_dispo Inflam, n (%) Stroke, n (%)     MUSCC   dementia       psych   headache       TBI       TME  movement     BSTUM      Cord opthalmology   seizures dysautonomia      pain
                                               All 2728    53.2 (17.6) 1468 (53.8%)   1777 (65.1%)     65 (2.4%)    207 (7.6%) 69 (2.5%) 119 (4.4%) 444 (16.3%) 195 (7.1%) 29 (1.1%) 14 (0.5%) 23 (0.8%) 22 (0.8%) 23 (0.8%)     5 (0.2%) 117 (4.3%)    27 (1.0%) 80 (2.9%)
                Period Alpha (Dec 2020 - Jan 2021)  228    51.7 (16.6)  137 (60.1%)    109 (47.8%)      6 (2.6%)     17 (7.5%) 10 (4.4%)  14 (6.1%)  42 (18.4%) 33 (14.5%)  2 (0.9%)  1 (0.4%)  4 (1.8%)  6 (2.6%)  3 (1.3%)     1 (0.4%)  15 (6.6%)     2 (0.9%)  8 (3.5%)
               Period Delta (July 2021 - Nov 2021)  432    58.2 (17.2)  216 (50.0%)    426 (98.6%)     20 (4.6%)    63 (14.6%)  9 (2.1%)  32 (7.4%)  51 (11.8%)  21 (4.9%)  4 (0.9%)  2 (0.5%)  3 (0

  summary = summary.append({


In [25]:
sub = df.dropna(subset=['strain_period', 'hosp_dispo_category'])
grp = sub.groupby('strain_period')
rows = []
for period, g in grp:
    total = len(g)
    bad = int((g['hosp_dispo_category'] == 2).sum())
    pct = 100 * bad / total if total>0 else np.nan
    rows.append({
        'strain_period': period,
        'N': total,
        'n_bad': bad,
        '% bad': f"{pct:.1f}%"
    })
    
pd.DataFrame(rows).sort_values('strain_period')

Unnamed: 0,strain_period,N,n_bad,% bad
0,Alpha (Dec 2020 - Jan 2021),76,33,43.4%
1,Delta (July 2021 - Nov 2021),303,123,40.6%
2,"Later Omicron (BA.4, BA.5, XBB, July 2022+)",44,19,43.2%
3,Omicron (Dec 2021 - June 2022),199,53,26.6%
4,Wildtype (Jan 2020 - Nov 2020),642,285,44.4%
