# YMHA Data Processing Workflow

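This notebook processes YMHA attendance and completion workbooks. In attendance files it highlights TRUE (green) and FALSE (red) values in the check-in and check-out columns; in completion files it adds a color-coded 'complete' column; finally it organizes the resulting files by date and site. By default it processes all dates, but you can set `date_to_process` in the last cell to process only one date.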

In [1]:
import os
import re
import shutil
from openpyxl import load_workbook
from openpyxl.styles import PatternFill

In [2]:
def process_attendance_files(folder_path, output_folder):
    """Highlight TRUE (green) and FALSE (red) in attendance files, save to output_folder/date/filename.

    Every file directly inside ``folder_path`` whose name is
    ``ymha_attendance-site_<site>-date_<2025-MM-DD>.xlsx`` (case-insensitive)
    is opened, the "Student Check-Ins" and "Student Check-Outs" columns of the
    active sheet are color-coded, and the workbook is saved under
    ``output_folder/<date>/`` with its original filename.

    Files lacking either column are reported and skipped. Any other error
    while handling a file is printed and processing continues with the next
    file. Returns None.
    """
    pattern = re.compile(
        r"ymha_attendance-site_(.+)-date_(2025-\d{2}-\d{2})\.xlsx$", re.IGNORECASE
    )
    red_fill = PatternFill(start_color="F4CCCC", end_color="F4CCCC", fill_type="solid")
    green_fill = PatternFill(
        start_color="D9EAD3", end_color="D9EAD3", fill_type="solid"
    )
    fill_for_value = {"FALSE": red_fill, "TRUE": green_fill}
    for filename in os.listdir(folder_path):
        # match() anchors at the start of the name, like the other functions in
        # this notebook. search() would also accept prefixed names such as
        # Excel "~$..." lock files, which cannot be loaded as workbooks.
        match = pattern.match(filename)
        if not match:
            continue
        date_str = match.group(2)
        file_path = os.path.join(folder_path, filename)
        try:
            wb = load_workbook(file_path)
            ws = wb.active
            header = [
                cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))
            ]
            try:
                # 0-based positions inside each row tuple.
                target_columns = (
                    header.index("Student Check-Ins"),
                    header.index("Student Check-Outs"),
                )
            except ValueError:
                print(f"Columns not found in {filename}")
                continue
            for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
                for col in target_columns:
                    cell = row[col]
                    # str() normalizes booleans, text and empty cells alike.
                    fill = fill_for_value.get(str(cell.value).strip().upper())
                    if fill is not None:
                        cell.fill = fill
            date_folder = os.path.join(output_folder, date_str)
            os.makedirs(date_folder, exist_ok=True)
            wb.save(os.path.join(date_folder, filename))
        except Exception as e:
            # Best-effort batch: report the failing file and keep going.
            print(f"Error processing {filename}: {e}")

In [3]:
def process_completion_files(folder_path, output_folder):
    """Create 'complete' column in completion files, highlight as needed, save to output_folder/complete/filename.

    Every file directly inside ``folder_path`` named
    ``ymha_completion-site_*.xlsx`` is opened. Up to two trailing header
    columns named "Student Week 2 Assessment" or "complete" are removed, then
    a new last column "complete" is written. For each data row its value is:

    * "TRUE" (green)            - every assessment column present reads TRUE
    * "FALSE" (red)             - no assessment column reads TRUE
    * "PARTIALLY TRUE" (yellow) - anything in between

    A file that cannot be processed is reported with a printed message and the
    remaining files are still handled, mirroring process_attendance_files.
    Returns None.
    """
    assessment_columns = [
        "AMHLQ",
        "Healthcare Access",
        "Impact and Professional Importance",
        "MAP + MH Interest",
        "MEIM-6",
        "Mental Health Attitudes and Help-Seeking",
        "Mental Healthcare",
        "PCRB",
        "PEARLS + CRISIS",
        "PSC",
        "SCCS",
        "SCCT",
        "SEHS-HE",
        "Student Week 1 Assessment",
    ]
    red_fill = PatternFill(start_color="F4CCCC", end_color="F4CCCC", fill_type="solid")
    yellow_fill = PatternFill(
        start_color="FFF2CC", end_color="FFF2CC", fill_type="solid"
    )
    green_fill = PatternFill(
        start_color="D9EAD3", end_color="D9EAD3", fill_type="solid"
    )
    fill_for_result = {
        "FALSE": red_fill,
        "PARTIALLY TRUE": yellow_fill,
        "TRUE": green_fill,
    }
    complete_folder = os.path.join(output_folder, "complete")
    os.makedirs(complete_folder, exist_ok=True)
    for file in os.listdir(folder_path):
        if not (file.startswith("ymha_completion-site_") and file.endswith(".xlsx")):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            wb = load_workbook(file_path)
            ws = wb.active
            header = [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))]
            # Remove last two columns if they match
            for _ in range(2):
                if header and header[-1] in ["Student Week 2 Assessment", "complete"]:
                    ws.delete_cols(len(header))
                    header.pop()
            ws.cell(row=1, column=len(header) + 1, value="complete")
            header.append("complete")
            # 0-based positions of the assessment columns this sheet has.
            assessment_positions = [
                header.index(col) for col in assessment_columns if col in header
            ]
            complete_col_idx = len(header)  # 1-based column of "complete"
            for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
                values = [
                    str(row[pos].value).strip().upper() == "TRUE"
                    for pos in assessment_positions
                ]
                # With no assessment columns at all the row counts as FALSE.
                if values and all(values):
                    result = "TRUE"
                elif not any(values):
                    result = "FALSE"
                else:
                    result = "PARTIALLY TRUE"
                result_cell = ws.cell(
                    row=row[0].row, column=complete_col_idx, value=result
                )
                result_cell.fill = fill_for_result[result]
            wb.save(os.path.join(complete_folder, file))
        except Exception as e:
            # Best-effort batch: one unreadable workbook must not stop the rest.
            print(f"Error processing {file}: {e}")

In [4]:
def find_all_attendance_dates(folder_path):
    """Return the set of attendance dates found anywhere under folder_path.

    Walks folder_path and all of its subfolders. A file contributes its date
    when its name starts with ``ymha_attendance-site_<site>-date_<2025-MM-DD>.xlsx``
    (case-insensitive); the ``2025-MM-DD`` part is what gets collected.
    """
    date_regex = re.compile(
        r"ymha_attendance-site_(.+)-date_(2025-\d{2}-\d{2})\.xlsx$", re.IGNORECASE
    )
    candidates = (
        date_regex.match(name)
        for _root, _subdirs, names in os.walk(folder_path)
        for name in names
    )
    # Non-matching filenames yield None and are filtered out here.
    return {hit.group(2) for hit in candidates if hit}

In [5]:
def organize_site_files_by_date(site_name=None, date_str=None, folder_path=None):
    """
    Organize all attendance and completion files for a given date into subfolders by site.

    Attendance files are read from ``folder_path/<date_str>/`` and completion
    files from ``folder_path/complete/``. For each site with an attendance file
    for the given date:
        - Create a subfolder under the date folder named after the site.
        - Copy the attendance file and the corresponding completion file into the site subfolder.
    If site_name is provided, only process that site for the given date.

    A missing source file is reported with a printed message and skipped.
    If folder_path or date_str is None, a message is printed and nothing is
    done. Returns None.
    """
    if folder_path is None or date_str is None:
        print("folder_path and date_str are required.")
        return
    # re.escape keeps date_str literal inside the regex; fullmatch (below)
    # rejects names with trailing junk such as ".xlsx.bak".
    pattern = re.compile(
        r"ymha_attendance-site_(.+)-date_{}\.xlsx".format(re.escape(date_str))
    )
    attendance_folder = os.path.join(folder_path, date_str)
    complete_folder = os.path.join(folder_path, "complete")
    if site_name:
        site_names = [site_name]
    else:
        # Look only at the top level of the date folder: source files live
        # there, while the per-site subfolders hold copies made by earlier
        # runs. Walking into them would list every site twice on a re-run.
        # A missing folder simply yields no files.
        _, _, top_level_files = next(os.walk(attendance_folder), (None, [], []))
        site_names = []
        for fname in top_level_files:
            found = pattern.fullmatch(fname)
            if found and found.group(1) not in site_names:
                site_names.append(found.group(1))
    for site in site_names:
        attendance_file = f"ymha_attendance-site_{site}-date_{date_str}.xlsx"
        completion_file = f"ymha_completion-site_{site}.xlsx"
        site_folder = os.path.join(attendance_folder, site)
        os.makedirs(site_folder, exist_ok=True)
        att_src = os.path.join(attendance_folder, attendance_file)
        att_dst = os.path.join(site_folder, attendance_file)
        if os.path.exists(att_src):
            shutil.copy2(att_src, att_dst)
            print(f"Copied {attendance_file} to {site_folder}")
        else:
            print(f"Attendance file not found: {att_src}")
        comp_src = os.path.join(complete_folder, completion_file)
        comp_dst = os.path.join(site_folder, completion_file)
        if os.path.exists(comp_src):
            shutil.copy2(comp_src, comp_dst)
            print(f"Copied {completion_file} to {site_folder}")
        else:
            print(f"Completion file not found: {comp_src}")

In [6]:
# Set your folder path here. It is used both as the input folder and as the
# output root: results go to <folder_path>/<date>/ and <folder_path>/complete/.
folder_path = "/Users/yanyi.wang/Desktop/ymha/output_7_8"  # Change as needed
# Set to a date string like "2025-07-03" to process only one date, or None for all
date_to_process = None  # e.g., "2025-07-03" or None

# Step 1: Process attendance files and save to attendance folders by date
process_attendance_files(folder_path, folder_path)
# Step 2: Process completion files and save to complete folder
process_completion_files(folder_path, folder_path)
# Step 3: Organize by site/date
if date_to_process:
    organize_site_files_by_date(date_str=date_to_process, folder_path=folder_path)
else:
    all_dates = find_all_attendance_dates(folder_path)
    # Sets have no defined order; sort so dates are handled (and logged)
    # chronologically and identically on every run.
    for date_str in sorted(all_dates):
        organize_site_files_by_date(date_str=date_str, folder_path=folder_path)

Copied ymha_attendance-site_Pierce-date_2025-06-28.xlsx to /Users/yanyi.wang/Desktop/ymha/output_7_8/2025-06-28/Pierce
Copied ymha_completion-site_Pierce.xlsx to /Users/yanyi.wang/Desktop/ymha/output_7_8/2025-06-28/Pierce
Copied ymha_attendance-site_uc_riverside-date_2025-06-28.xlsx to /Users/yanyi.wang/Desktop/ymha/output_7_8/2025-06-28/uc_riverside
Copied ymha_completion-site_uc_riverside.xlsx to /Users/yanyi.wang/Desktop/ymha/output_7_8/2025-06-28/uc_riverside
Copied ymha_attendance-site_west_la-date_2025-06-28.xlsx to /Users/yanyi.wang/Desktop/ymha/output_7_8/2025-06-28/west_la
Copied ymha_completion-site_west_la.xlsx to /Users/yanyi.wang/Desktop/ymha/output_7_8/2025-06-28/west_la
Copied ymha_attendance-site_palomar-date_2025-06-28.xlsx to /Users/yanyi.wang/Desktop/ymha/output_7_8/2025-06-28/palomar
Copied ymha_completion-site_palomar.xlsx to /Users/yanyi.wang/Desktop/ymha/output_7_8/2025-06-28/palomar
Copied ymha_attendance-site_ohlone-date_2025-06-28.xlsx to /Users/yanyi.wang/Des