Final Preprocessing Notebook

In [4]:
from collections import defaultdict
from pathlib import Path

import pandas as pd

In [5]:
def preprocess_and_group_columns(df, columns_to_remove=None, prefixes_to_remove=None,
                                 keep_negative_codes=(-6, -8)):
    """
    Drop unwanted columns, then group the remaining numeric columns by value pattern.

    A column's value pattern is the sorted tuple of its distinct values after
    ignoring NaN and every negative value that is not listed in
    `keep_negative_codes` (by default -6, "refuses to answer", and -8,
    "don't know"). The input dataframe is never modified: negative codes are
    only left out while the patterns are built, they are not replaced.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    columns_to_remove : str or iterable of str, optional
        Exact column names to drop. Names that are not in `df` are ignored.
        A single string is treated as one column name.
    prefixes_to_remove : str or iterable of str, optional
        Columns whose name starts with any of these prefixes are dropped.
        A single string is treated as one prefix.
    keep_negative_codes : iterable of numbers, optional
        Negative values that still count as part of a column's pattern.

    Returns
    -------
    collections.defaultdict
        Maps each value pattern (tuple) to the list of column names that share
        it, in dataframe column order. Only int64/float64 columns are grouped.
    """
    if columns_to_remove is None:
        columns_to_remove = []
    elif isinstance(columns_to_remove, str):
        # set("AGE") would become {"A", "G", "E"}; keep the name whole.
        columns_to_remove = [columns_to_remove]

    if prefixes_to_remove is None:
        prefixes_to_remove = ()
    elif isinstance(prefixes_to_remove, str):
        # tuple("IR48") would become ("I", "R", "4", "8") and match far too much.
        prefixes_to_remove = (prefixes_to_remove,)
    prefixes = tuple(prefixes_to_remove)

    # Step 1: Identify the columns that are actually present and must go.
    requested = set(columns_to_remove)
    to_drop = [
        col for col in df.columns
        if col in requested or (isinstance(col, str) and col.startswith(prefixes))
    ]

    # Step 2: Drop them (returns a new frame, `df` is left untouched).
    df_cleaned = df.drop(columns=to_drop)
    print(f"[Step 1] Dropped {len(to_drop)} columns. Remaining: {df_cleaned.shape[1]}")

    # Step 3: Group numeric columns by their unique value patterns, ignoring
    # negative values other than the explicitly kept codes.
    grouped_columns = defaultdict(list)
    kept_codes = list(keep_negative_codes)
    grouped_count = 0
    for col in df_cleaned.columns:
        series = df_cleaned[col]
        if series.dtype in ['int64', 'float64']:
            allowed_values = series[(series >= 0) | (series.isin(kept_codes))].dropna().unique()
            value_set = tuple(sorted(allowed_values))
            grouped_columns[value_set].append(col)
            grouped_count += 1

    print(f"[Step 2] Grouped {grouped_count} columns into {len(grouped_columns)} value patterns.")
    return grouped_columns

In [6]:
# Load the raw survey extract.
df = pd.read_csv('./data/mental-health-comorbidity-raw.csv')

# Columns dropped by exact name, listed one questionnaire prefix per row.
admin_cols = [
    'RESPID', 'NCS1YR', 'AGE', 'STR',
    'CASEID', 'COMPLETE', 'SECU', 'CASEWGT',
]
checkpoint_cols = [
    'M5A',
    'IR3', 'IR11_4', 'IR36', 'IR47',
    'PD0A', 'PD2', 'PD5', 'PD14', 'PD20', 'PD23',
    'AG2', 'AG7', 'AG10',
    'FD4_1', 'FD6', 'FD7_1', 'FD9_1',
    'PR1', 'PR11_1', 'PR15',
    'FN1', 'FN4', 'FN24',
    'CN1_2', 'CN4', 'CN4_1', 'CN7_1', 'CN7_2', 'CN8', 'CN14',
    'DA36_2B', 'DA36_3A_1',
    'DE20_3', 'DE20_6',
    'CH23', 'CH38_1', 'CH74_1', 'CH104',
    'AD0', 'AD2', 'AD7', 'AD29', 'AD31', 'AD36', 'AD43_2',
    'OD2', 'OD27',
    'CD3', 'CD17_1', 'CD24',
    'SA1E_1', 'SA2', 'SA3', 'SA7A1', 'SA10', 'SA11E_1', 'SA12', 'SA18_5',
]
rem_cols = [*admin_cols, *checkpoint_cols]

# Columns dropped by name prefix.
prefixes = ('IR48VALUES', 'PD27VALUES', 'PD28VALUES')

# Drop the columns above and bucket what is left by value pattern.
grouped = preprocess_and_group_columns(df, columns_to_remove=rem_cols, prefixes_to_remove=prefixes)

# View the results
#for value_set, cols in grouped.items():
#    print(f"\nColumns with values {value_set}:")
#    for col in cols:
#        print(f"  - {col}")

FileNotFoundError: [Errno 2] No such file or directory: './data/mental-health-comorbidity-raw.csv'

In [None]:
# Rank the value patterns from the most to the fewest member columns
# (list.sort is stable, so ties keep their original order).
sorted_summary = list(grouped.items())
sorted_summary.sort(key=lambda entry: len(entry[1]), reverse=True)

# One line per pattern: the pattern itself and how many columns share it.
print("🔢 Most common non-negative value patterns by column count:\n")
for pattern, members in sorted_summary:
    print(f"{pattern} → {len(members)} columns")

🔢 Most common non-negative value patterns by column count:

(np.int64(-8), np.int64(-6), np.int64(1), np.int64(5)) → 174 columns
(np.int64(-6), np.int64(1), np.int64(5)) → 94 columns
(np.int64(1), np.int64(5)) → 86 columns
(np.int64(-6), np.int64(1), np.int64(2), np.int64(3), np.int64(4)) → 53 columns
(np.int64(0), np.int64(1)) → 41 columns
(np.int64(-8), np.int64(-6), np.int64(1), np.int64(2), np.int64(3), np.int64(4)) → 40 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4)) → 26 columns
(np.int64(-8), np.int64(-6), np.int64(1), np.int64(5), np.int64(7)) → 16 columns
(np.int64(-6), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)) → 15 columns
(np.int64(-8), np.int64(-6), np.int64(1), np.int64(2)) → 13 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9)) → 12 columns
(np.int64(-8), np.int64(1), np.int64(5)) → 10 columns
(np.int64(5),) → 7 columns
(np.int64(1), np.int64(2)) → 7 columns
(np.

In [None]:
def remove_sparse_value_sets(df, grouped, min_columns=4):
    """
    Keep only the columns whose value set is shared by enough columns.

    `grouped` maps a value set to the list of column names that have it.
    Every column belonging to a value set with at least `min_columns`
    members is kept; all other columns are left out. The returned frame's
    columns follow the iteration order of `grouped`.
    """
    kept = []
    for members in grouped.values():
        # Skip value sets that too few columns share.
        if len(members) < min_columns:
            continue
        kept.extend(members)

    return df[kept]

In [None]:
# Keep only the value sets that at least 4 columns share.
filtered_grouped = {}
for pattern, members in grouped.items():
    if len(members) >= 4:
        filtered_grouped[pattern] = members

# Largest value sets first.
sorted_remaining = list(filtered_grouped.items())
sorted_remaining.sort(key=lambda entry: len(entry[1]), reverse=True)

print("✅ Remaining value sets (with 4 or more columns):\n")
for pattern, members in sorted_remaining:
    print(f"{pattern} → {len(members)} columns")

✅ Remaining value sets (with 4 or more columns):

(np.int64(-8), np.int64(-6), np.int64(1), np.int64(5)) → 174 columns
(np.int64(-6), np.int64(1), np.int64(5)) → 94 columns
(np.int64(1), np.int64(5)) → 86 columns
(np.int64(-6), np.int64(1), np.int64(2), np.int64(3), np.int64(4)) → 53 columns
(np.int64(0), np.int64(1)) → 41 columns
(np.int64(-8), np.int64(-6), np.int64(1), np.int64(2), np.int64(3), np.int64(4)) → 40 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4)) → 26 columns
(np.int64(-8), np.int64(-6), np.int64(1), np.int64(5), np.int64(7)) → 16 columns
(np.int64(-6), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)) → 15 columns
(np.int64(-8), np.int64(-6), np.int64(1), np.int64(2)) → 13 columns
(np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9)) → 12 columns
(np.int64(-8), np.int64(1), np.int64(5)) → 10 columns
(np.int64(5),) → 7 columns
(np.int64(1), np.int64(2)) → 7 columns
(np.int64(-6),

In [None]:
# For every retained value set, print a header followed by its member columns.
for pattern, members in filtered_grouped.items():
    header = f"\n🔹 Columns with values {pattern} ({len(members)} columns):"
    print(header)
    for name in members:
        print(f"  - {name}")


🔹 Columns with values (np.int64(-6), np.int64(1), np.int64(5)) (94 columns):
  - M1
  - M18
  - M18B2
  - M18B3
  - M47
  - IR2
  - IR4
  - IR20
  - IR20B4
  - IR21
  - PD9
  - PD9B3
  - PD13A
  - PD13C
  - PD13D
  - PD17
  - PD21B2
  - PD21B3
  - PD25B
  - AG3INTR1
  - AG3INTR2
  - AAG3B2
  - AAG3B3
  - AAG3B4
  - AG4A
  - AG4B
  - AG4F
  - AG4G
  - AG4H
  - AG5
  - AG6
  - AAG6A2
  - AAG6A3
  - AG8
  - AG8A
  - AG9A
  - AG9B
  - AG9C
  - AG14
  - AG17
  - AG37
  - FD4A
  - PR16A
  - PR20
  - PR20B2
  - PR21
  - CN12B
  - LE3
  - LE4
  - LE9
  - LE11
  - LE12
  - CH61
  - CH61A
  - CH90
  - CH90A
  - AD3
  - AD3B2
  - AD4
  - AD6B
  - AD6C
  - AD6D
  - AD32
  - AD32B2
  - AD35A
  - AD35B
  - AD35C
  - AD35D
  - OD3B2
  - OD3B3
  - CD7B1
  - CD7B2
  - CD16
  - CD18C2
  - CD18C3
  - CD20
  - CD38
  - SA1F
  - SA1G
  - SA1H
  - SA1I
  - SA1J
  - SA1K
  - SA8
  - SA8B2
  - SA8B3
  - SA11F
  - SA11G
  - SA11H
  - SA11I
  - SA19
  - SA19B2
  - SA19B3
  - SA20

🔹 Columns with values (np.int

In [None]:
# Drop every column whose value set is shared by fewer than 4 columns.
filtered_df = remove_sparse_value_sets(df, grouped, min_columns=4)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a fresh checkout may not have it yet).
filtered_path = Path('./pp_outputs/filtered_data.csv')
filtered_path.parent.mkdir(parents=True, exist_ok=True)
filtered_df.to_csv(filtered_path, index=False)

In [None]:
# Explore the processed data: which value sets do the diagnosis columns use?
from data_columns import diagnosis

# Bucket the diagnosis columns present in filtered_df by their set of
# distinct non-null values.
value_groups = defaultdict(list)
for col in (name for name in diagnosis() if name in filtered_df.columns):
    observed = frozenset(filtered_df[col].dropna().unique())
    value_groups[observed].append(col)

# Columns whose observed values are exactly {1, 5}.
print("\n🔷 Diagnosis columns with value set [1, 5]:")
one_five_cols = [
    name
    for observed, members in value_groups.items()
    if observed == frozenset([1, 5])
    for name in members
]
for col in one_five_cols:
    print(f"  - {col}")

# Columns whose observed values include 0, 1 and 2 (possibly alongside others).
print("\n🔶 Diagnosis columns with value set [0, 1, 2] or similar:")
zero_one_two_cols = [
    name
    for observed, members in value_groups.items()
    if observed.issuperset({0, 1, 2})
    for name in members
]
for col in zero_one_two_cols:
    print(f"  - {col}")


🔷 Diagnosis columns with value set [1, 5]:
  - DSM_ADD
  - DSM_AGO
  - DSM_AGOWO
  - DSM_ALA
  - DSM_ALAH
  - DSM_ALD
  - DSM_ASA
  - DSM_BIPO1
  - DSM_BIPO2
  - DSM_CON
  - DSM_DRA
  - DSM_DRAH
  - DSM_DRD
  - DSM_DYS
  - DSM_DYSH
  - DSM_GAD
  - DSM_GADH
  - DSM_HYP
  - DSM_IED
  - DSM_IEDH
  - DSM_IMJ
  - DSM_IMN
  - DSM_MAN
  - DSM_MJD
  - DSM_MJDH
  - DSM_MND
  - DSM_MNDH
  - DSM_ODD
  - DSM_ODDH
  - DSM_PAT
  - DSM_PD
  - DSM_PD_AGO
  - DSM_PD_WOAGO
  - DSM_PMS
  - DSM_PTSD
  - DSM_SAD
  - DSM_SO
  - DSM_SP
  - DSM_TBD

🔶 Diagnosis columns with value set [0, 1, 2] or similar:
  - DSM_ASP
  - DSM_BOR
  - DSM_PEA
  - DSM_PEC


In [None]:
# Rank the diagnosis columns by how many rows carry a usable value.
TOP_N = 10  # number of columns kept from EACH coding scheme before merging

# 1/5-coded columns: count the rows equal to 1.
one_five_counts = {col: (df[col] == 1).sum() for col in one_five_cols}
# 0/1/2-coded columns: count the rows equal to 1 or 2.
zero_one_two_counts = {col: ((df[col] == 1) | (df[col] == 2)).sum() for col in zero_one_two_cols}

top_1_5 = sorted(one_five_counts.items(), key=lambda x: x[1], reverse=True)[:TOP_N]
top_0_1_2 = sorted(zero_one_two_counts.items(), key=lambda x: x[1], reverse=True)[:TOP_N]

# Merge both rankings, tagging each entry with the coding it came from.
combined_top = [(col, "1/5", count) for col, count in top_1_5]
combined_top += [(col, "0/1/2", count) for col, count in top_0_1_2]

combined_top_sorted = sorted(combined_top, key=lambda x: x[2], reverse=True)

# The merged list can hold up to 2 * TOP_N rows, so the header states that the
# limit applies per coding rather than to the whole table.
print(f"\n✅ Top {TOP_N} Diagnosis Fields per Coding (Sorted by Count):")
for col, coding, count in combined_top_sorted:
    print(f"{col:<30} | Type: {coding:<6} | Count: {count}")


✅ Top 10 Diagnosis Fields (Sorted by Count):
DSM_ASP                        | Type: 0/1/2  | Count: 4085
DSM_PEA                        | Type: 0/1/2  | Count: 3309
DSM_BOR                        | Type: 0/1/2  | Count: 3041
DSM_PEC                        | Type: 0/1/2  | Count: 2724
DSM_TBD                        | Type: 1/5    | Count: 1726
DSM_PAT                        | Type: 1/5    | Count: 1401
DSM_ALA                        | Type: 1/5    | Count: 1212
DSM_MJD                        | Type: 1/5    | Count: 995
DSM_ALAH                       | Type: 1/5    | Count: 960
DSM_SO                         | Type: 1/5    | Count: 808
DSM_MJDH                       | Type: 1/5    | Count: 804
DSM_SP                         | Type: 1/5    | Count: 762
DSM_IED                        | Type: 1/5    | Count: 500
DSM_GAD                        | Type: 1/5    | Count: 472


In [None]:
def get_filtered_features_for_diagnosis(df, diagnosis_col, positive_values, min_presence=0.3, columns_to_remove=None):
    """
    Select the rows with a given diagnosis and keep the well-populated numeric features.

    Steps:
      1. Keep the rows where `diagnosis_col` is one of `positive_values`.
      2. Drop `columns_to_remove` (e.g. the other diagnosis columns).
      3. Keep only int64/float64 columns.
      4. Keep a column only if the share of subset rows with a value >= 0 is
         at least `min_presence`. NaN and negative values both count as
         "not present".

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    diagnosis_col : str
        Column used to select the rows.
    positive_values : scalar or list-like
        Value(s) of `diagnosis_col` that mark a row as selected. Any scalar
        (Python or NumPy number, string, ...) is treated as a single value.
    min_presence : float, optional
        Minimum fraction of non-negative values a column needs to be kept.
    columns_to_remove : iterable of str, optional
        Columns to drop after the rows are selected; names missing from the
        frame are ignored. `diagnosis_col` itself may be listed here.

    Returns
    -------
    (pandas.DataFrame, list)
        The filtered feature frame and the list of retained column names.
    """
    # Series.isin rejects scalars, so wrap anything that is not list-like
    # (this also covers NumPy scalars such as np.int64, not just Python int).
    if not pd.api.types.is_list_like(positive_values):
        positive_values = [positive_values]
    if columns_to_remove is None:
        columns_to_remove = []

    diagnosed_df = df[df[diagnosis_col].isin(positive_values)]
    print(f"[Subset] {diagnosed_df.shape[0]} rows match {diagnosis_col} ∈ {positive_values}")

    # The row selection is already done, so the diagnosis column may be dropped too.
    features_df = diagnosed_df.drop(
        columns=[col for col in columns_to_remove if col in diagnosed_df.columns]
    )

    numeric_df = features_df.select_dtypes(include=['int64', 'float64'])

    # Fraction of rows per column holding a non-negative value.
    presence = (numeric_df >= 0).mean()
    valid_cols = presence[presence >= min_presence].index.tolist()

    final_df = numeric_df[valid_cols]
    print(f"[Filter] {len(valid_cols)} features retained with ≥ {min_presence*100:.0f}% non-negative values")
    return final_df, valid_cols

In [None]:
# Diagnosis used to select the rows.
target_diagnosis = 'DSM_MJD'
# Every diagnosis column is removed from the feature set after row selection.
cols_to_remove = diagnosis()

filtered_features, retained_cols = get_filtered_features_for_diagnosis(
    filtered_df,
    target_diagnosis,
    positive_values=1,
    min_presence=0.3,
    columns_to_remove=cols_to_remove,
)

[Subset] 995 rows match DSM_MJD ∈ [1]
[Filter] 301 features retained with ≥ 30% non-negative values


In [None]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
features_path = Path(f'./pp_outputs/features_for_{target_diagnosis}.csv')
features_path.parent.mkdir(parents=True, exist_ok=True)
filtered_features.to_csv(features_path, index=False)

In [None]:
# Report the number of rows and columns in each CSV produced along the
# processing steps.
csv_files = [
    './data/mental-health-comorbidity-raw.csv',
    './pp_outputs/filtered_data.csv',
    # Follows target_diagnosis so the path stays in sync with the file written above.
    f'./pp_outputs/features_for_{target_diagnosis}.csv',
]
results = []
for file_path in csv_files:
    try:
        # Use a dedicated name: rebinding `df` here would silently replace the
        # raw dataframe that earlier cells rely on.
        stage_df = pd.read_csv(file_path)
    except (OSError, ValueError) as e:
        # Missing/unreadable file (OSError) or unparsable content (pandas
        # parser errors and decode errors are ValueError subclasses): record
        # the problem and carry on with the next file.
        results.append({"file": file_path, "error": str(e)})
    else:
        results.append({"file": file_path, "rows": stage_df.shape[0], "columns": stage_df.shape[1]})
result_df = pd.DataFrame(results)
display(result_df)

Unnamed: 0,file,rows,columns
0,./data/mental-health-comorbidity-raw.csv,5001,1000
1,./pp_outputs/filtered_data.csv,5001,648
2,./pp_outputs/features_for_DSM_MJD.csv,995,301
