In [4]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter 

df= pd.read_csv('Memberships.csv')
print(df.columns.tolist())

print("Rows, columns: ", df.shape)
print(df.head())

df['age'] = pd.to_numeric(df['age'], errors = 'coerce')
df['household_income'] = pd.to_numeric(df['household_income'], errors ='coerce')

df['parent_household_type'] = df['parent_household_type'].astype('string')
df['program_group'] = np.select(
    [df['age'].between(5, 12, inclusive='both'), df['age'].between(13, 18, inclusive='both')], ['Youth', 'Teen'], default=np.nan)

def simplify_race(raw):
    if pd.isna(raw):
        return np.nan
    s=str(raw).strip().lower()
    if any(k in s for k in ['White']):
        return 'White'
    if any(k in s for k in ['Black', 'African American']):
        return 'Black'
    if any(k in s for k in ['Hispanic', 'Latino']):
        return 'Hispanic'
    if any(k in s for k in ['Native American', 'Alaskan Native']):
        return 'Native American/Alaskan Native'
    if any(k in s for k in ['Bi-racial']):
        return 'Bi-racial'
    if any(k in s for k in ['Multi-racial']):
        return 'Multi-racial'
    if any(k in s for k in ['Asian']):
        return 'Asian'
    if any(k in s for k in ['Middle Eastern or North African']):
        return 'Middle Eastern or North African'

    return str(raw).strip().title()

df['race_simple'] = df['race'].apply(simplify_race)

income_non_na = df[df['household_income'].notna()]
income_non_na_and_program = income_non_na[income_non_na['program_group'].notna()]

groups = ['Youth', 'Teen']
fig, axes = plt.subplots(1,2,figsize=(12, 5), constrained_layout=True)

for ax, g in zip(axes, groups):
    sub = income_non_na_and_program[income_non_na_and_program['program_group'] ==g]
    if sub.empty:
        ax.text(0.5, 0.5, f'No data for {g}', ha='center', va = 'center')
        ax.set_title(f'{g} (n=0)')
        ax.set_xlabel('Household income')
        ax.set_ylabel('Count')
        continue

    ax.hist(sub['household_income'].dropna(), bins=20)
    ax.set_title(f'{g} (n={len(sub)})')
    ax.set_xlabel('Household income')
    ax.set_ylabel('Count')

plt.subtitle('Household Income Distribution: Youth vs Teen')
plt.savefig('hist_income_youth_teen.png')
plt.show()

pht_non_na = df[df['parent_household_type'].notna() & df['program_group'].notna()]

for g in groups:
    sub=pht_non_na[pht_non_na['program_group'] ==g]
    if sub.empty:
        print(f"No parent_household_type data for {g}.")
        continue
    counts=sub['parent_household_type'].value_counts().sort_values(ascending=True)
    plt.figure(figsize=(8, max(3, 0.3*len(counts))))
    counts.plot.barh()
    plt.title(f'Parent Household Types - {g} (n={len(sub)})')
    plt.xlabel('Count')
    plt.tight_layout()
    fn=f'bar_parent_household_{g.lower()}.png'
    plt.savefig(fn)
    plt.show()


summary_rows=[]
race_groups=df['race_simple'].dropna().unique()

for race in race_groups:
    sub=df[df['race_simple'] == race]

    incomes = sub['household_inocme'].dropna()
    if len(incomes)==0:
        mean_income = np.nan
    else:
        mean_income = incomes.mean()

    phts=sub['parent_household_type'].dropna().astype(str)
    if len(phts)==0:
        most_common_pht = np.nan
        pht_count=0
    else:
        counter=Counter(phts)
        most_common_pht, pht_count =counter.most_common(1)[0]

    n_total = len(sub)
    n_income = len(incomes)
    n_pht = len(phts)
    summary_rows.append({
        'race': race, 
        'n_total': n_total, 
        'n_household_income_nonNA': n_income, 
        'mean_household_income': mean_income, 
        'n_parent_household_nonNA': n_pht,
        'most_common_parent_household': most_common_pht,
        'most_common_parent_household_count': pht_count
    })

summary_df = pd.DataFram(summary_rows).sort_values(by='n_total', ascending=False)
print("\nSummary by race:")
print(summary_df)

summary_df.to_csv('summary_by_race.csv', index=False)

summary_df['flag_small_income_sample'] = summary_df['n_household_income_nonNA'] <= 1
summary_df['flag_small_pht_sample'] = summary_df['n_parent_household_nonNA'] <= 1
summary_df.to_csv('summary_by_race_flagged.csv', index=False)

print("\nSaved plots: hist_income_youth_teen.png, bar_parent_household_youth.png, bar_parent_household_teen.png")
print("Saved CSVs: summary_by_race.csv, summary_by_race_flagged.csv")


['Contact: Card Number  ↑', 'Membership Type', 'Membership Status', 'Contact: Age', 'Contact: Data Warehouse: Racial/Ethnic Identity', 'Contact: Data Warehouse: Gender', 'Contact: Household Income Range', 'Contact: Household Composition', 'Contact: Zip Code']
Rows, columns:  (2646, 9)
  Contact: Card Number  ↑                        Membership Type  \
0                      21  Walworth School Year 24-25 Membership   
1                      22  Walworth School Year 24-25 Membership   
2                      23  Walworth School Year 24-25 Membership   
3                      24  Walworth School Year 24-25 Membership   
4                      25  Walworth School Year 24-25 Membership   

  Membership Status  Contact: Age  \
0          Complete          14.0   
1          Complete          13.0   
2          Complete          13.0   
3          Complete          11.0   
4          Complete          15.0   

  Contact: Data Warehouse: Racial/Ethnic Identity  \
0                       Black

KeyError: 'age'