In [1]:
import pandas as pd
from collections import defaultdict

In [5]:
def preprocess_and_group_columns(df, columns_to_remove=None, prefixes_to_remove=None):
    """
    Drop unwanted columns, then group the remaining numeric columns by the
    set of distinct non-negative values each one contains.

    The input DataFrame is not modified; a cleaned copy is built internally.

    Parameters:
        df (pd.DataFrame): The full DataFrame.
        columns_to_remove (list or str, optional): Exact column names to remove.
            Names that are not present in ``df`` are ignored. A single string
            is treated as one column name.
        prefixes_to_remove (tuple, list or str, optional): Column-name prefixes
            to remove (e.g., ('DSM_', 'INT_')). A single string is treated as
            one prefix. Only string column labels are prefix-matched.

    Returns:
        grouped_columns (defaultdict(list)): Keys are tuples holding the sorted
            unique values ``>= 0`` found in a column (NaN and negative values
            are excluded); values are lists of the column names that share
            that value set, in DataFrame column order. A column with no
            non-negative values is filed under the empty tuple ``()``.
            Non-numeric and boolean columns are skipped.
    """
    # A bare string would otherwise be split into single characters by
    # set()/tuple() and silently match unrelated columns.
    if columns_to_remove is None:
        columns_to_remove = []
    elif isinstance(columns_to_remove, str):
        columns_to_remove = [columns_to_remove]
    if prefixes_to_remove is None:
        prefixes_to_remove = ()
    elif isinstance(prefixes_to_remove, str):
        prefixes_to_remove = (prefixes_to_remove,)
    prefixes = tuple(prefixes_to_remove)

    # Step 1: Build the list of columns that are actually present and must go.
    # Walking df.columns (rather than the requested names) keeps the reported
    # count honest and makes the result order deterministic.
    exact_names = set(columns_to_remove)
    to_drop = [
        col for col in df.columns
        if col in exact_names
        # str.startswith(()) is False, so an empty prefix tuple matches nothing.
        or (isinstance(col, str) and col.startswith(prefixes))
    ]

    # Step 2: Drop columns (returns a new frame; df itself is untouched)
    df_cleaned = df.drop(columns=to_drop)
    print(f"[Step 1] Dropped {len(to_drop)} columns. Remaining: {df_cleaned.shape[1]}")

    # Step 3: Group by non-negative value sets
    grouped_columns = defaultdict(list)
    n_grouped = 0

    for col in df_cleaned.columns:
        series = df_cleaned[col]
        # dtype.kind 'i'/'u'/'f' covers every signed/unsigned integer and float
        # width (and pandas' nullable Int*/Float* dtypes), not just int64/float64.
        # Booleans ('b') and objects ('O') are deliberately left out.
        if series.dtype.kind not in "iuf":
            continue
        non_neg_values = series[series >= 0].dropna().unique()
        value_set = tuple(sorted(non_neg_values))
        grouped_columns[value_set].append(col)
        n_grouped += 1

    # Report the number of columns that were really grouped; non-numeric
    # columns are skipped above and must not be counted.
    print(f"[Step 2] Grouped {n_grouped} columns into {len(grouped_columns)} value patterns.")
    return grouped_columns

In [None]:
# Load the raw CSV export.
df = pd.read_csv('./data/mental-health-comorbidity-raw.csv')

# Columns excluded by exact name before grouping.
# NOTE(review): judging by the variable names these are administrative fields
# and section "checkpoint" items - confirm against the codebook.
admin_cols = [
    'RESPID', 'NCS1YR', 'AGE', 'STR', 'CASEID', 'COMPLETE', 'SECU', 'CASEWGT',
]
checkpoint_cols = [
    'M5A',
    'IR3', 'IR11_4', 'IR36', 'IR47',
    'PD0A', 'PD2', 'PD5', 'PD14', 'PD20', 'PD23',
    'AG2', 'AG7', 'AG10',
    'FD4_1', 'FD6', 'FD7_1', 'FD9_1',
    'PR1', 'PR11_1', 'PR15',
    'FN1', 'FN4', 'FN24',
    'CN1_2', 'CN4', 'CN4_1', 'CN7_1', 'CN7_2', 'CN8', 'CN14',
    'DA36_2B', 'DA36_3A_1',
    'DE20_3', 'DE20_6',
    'CH23', 'CH38_1', 'CH74_1', 'CH104',
    'AD0', 'AD2', 'AD7', 'AD29', 'AD31', 'AD36', 'AD43_2',
    'OD2', 'OD27',
    'CD3', 'CD17_1', 'CD24',
    'SA1E_1', 'SA2', 'SA3', 'SA7A1', 'SA10', 'SA11E_1', 'SA12', 'SA18_5',
]
rem_cols = admin_cols + checkpoint_cols

# Every column whose name starts with one of these prefixes is excluded as well.
prefixes = ('IR48VALUES', 'PD27VALUES', 'PD28VALUES')

# NOTE(review): the saved output of this cell reports exactly len(rem_cols)
# dropped columns and still lists IR48VALUES*/PD27VALUES* columns, so it looks
# like it predates the prefix filter - re-run the cell to refresh it.
grouped = preprocess_and_group_columns(df, columns_to_remove=rem_cols, prefixes_to_remove=prefixes)

# List the member columns of every value pattern.
for value_set in grouped:
    cols = grouped[value_set]
    print(f"\nColumns with values {value_set}:")
    for col in cols:
        print(f"  - {col}")

[Step 1] Dropped 67 columns. Remaining: 933
[Step 2] Grouped 933 columns into 229 value patterns.

Columns with values (np.int64(1), np.int64(5)):
  - M1
  - M5
  - M9B
  - M18
  - M18B2
  - M18B3
  - M19
  - M47
  - IR1INTRO1
  - IR1INTRO2
  - IR2
  - IR4
  - IR20
  - IR20B3
  - IR20B4
  - IR21
  - IR48VALUES01
  - IR48VALUES02
  - IR48VALUES03
  - IR48VALUES04
  - IR48VALUES05
  - IR48VALUES06
  - IR48VALUES07
  - IR48VALUES08
  - IR48VALUES09
  - IR48VALUES10
  - IR48VALUES11
  - IR48VALUES12
  - IR48VALUES13
  - IR48VALUES15
  - IR48VALUES14
  - IR70
  - PD1A
  - PD1B
  - PD1C
  - PD1D
  - PD1E
  - PD1F
  - PD1G
  - PD1H
  - PD1I
  - PD1J
  - PD1K
  - PD1L
  - PD1M
  - PD1N
  - PD1O
  - PD1P
  - PD7
  - PD9
  - PD9B2
  - PD9B3
  - PD13A
  - PD13B
  - PD13C
  - PD13D
  - PD17
  - PD21B2
  - PD21B3
  - PD25A
  - PD25B
  - PD26
  - PD27VALUES01
  - PD27VALUES02
  - PD27VALUES03
  - PD27VALUES04
  - PD27VALUES05
  - PD27VALUES06
  - PD27VALUES07
  - PD27VALUES08
  - PD27VALUES09
  - PD

In practice, this issue will show up most in regression models and decision trees. One-hot encoding can help with this, and a 30% threshold will also help with weak categories.

To discuss: should we remove columns whose set of answer values is unique to that column? Should we only consider columns in the main categories?

In [7]:
# `grouped` maps each value pattern to its columns: {value_set: [column names]}.
# Rank the patterns by how many columns share them, most common first.
sorted_summary = list(grouped.items())
sorted_summary.sort(key=lambda entry: len(entry[1]), reverse=True)

# One line per pattern: the value tuple and its column count.
print("🔢 Most common non-negative value patterns by column count:\n")
for value_set, cols in sorted_summary:
    print(f"{value_set} → {len(cols)} columns")


🔢 Most common non-negative value patterns by column count:

(np.int64(1), np.int64(5)) → 406 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4)) → 121 columns
(np.int64(0), np.int64(1)) → 41 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)) → 25 columns
(np.int64(1), np.int64(2)) → 25 columns
(np.int64(1), np.int64(5), np.int64(7)) → 20 columns
(np.int64(1), np.int64(2), np.int64(3)) → 14 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9)) → 12 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(97)) → 9 columns
(np.int64(5),) → 7 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12)) → 6 columns
(np.int64(0), np.int64(1), np.int64(2