# Attendance Data Processing Workflow

This notebook will:
1. Search for all files with 'attendance-site' in the filename and ending with a date in the format 2025-xx-xx.xlsx.
2. For each file, highlight in red all FALSE values in the columns 'Student Check-Ins' and 'Student Check-Outs'.
3. Organize the files into folders by date, grouping all sites for the same date together.

In [36]:
import os
import re
import shutil
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill

In [None]:
def process_attendance_files(folder_path):
    """Highlight FALSE check-ins/check-outs and save each attendance file by date.

    Looks in ``folder_path`` for workbooks named
    ``ymha_attendance-site_{site_name}-date_{2025-MM-DD}.xlsx`` (matched
    case-insensitively). In the active sheet of each workbook, every cell in
    the 'Student Check-Ins' and 'Student Check-Outs' columns whose value reads
    FALSE is given a light-red fill, and the workbook is then saved under
    ``folder_path/{date}/`` with its original filename.

    Args:
        folder_path: Directory that contains the attendance workbooks.

    Returns:
        None. Progress and per-file errors are reported with ``print``; a
        failure on one file does not stop the remaining files.
    """
    # Validate the input before doing any other work.
    if not os.path.exists(folder_path):
        print(f"Folder {folder_path} does not exist.")
        return

    # Regex to match files like ymha_attendance-site_{site_name}-date_{date}.xlsx
    # group(1) is the site name, group(2) is the date.
    pattern = re.compile(
        r"ymha_attendance-site_(.+)-date_(2025-\d{2}-\d{2})\.xlsx$", re.IGNORECASE
    )

    # List all files in the folder
    all_files = os.listdir(folder_path)
    print(f"All files in folder: {all_files}")

    # Match every name once and keep the match object alongside the filename.
    candidates = [(fname, pattern.search(fname)) for fname in all_files]
    matched = [(fname, found) for fname, found in candidates if found]
    files_of_interest = [fname for fname, _ in matched]
    print(f"Files of interest: {files_of_interest}")
    if not matched:
        return

    red_fill = PatternFill(start_color="F4CCCC", end_color="F4CCCC", fill_type="solid")

    for filename, match in matched:
        date_str = match.group(2)  # the date part names the output folder
        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")
        try:
            wb = load_workbook(file_path)
            ws = wb.active
            # Locate the two columns by their header text (0-based positions).
            header = [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))]
            try:
                checked_columns = (
                    header.index("Student Check-Ins"),
                    header.index("Student Check-Outs"),
                )
            except ValueError:
                print(f"Columns not found in {filename}")
                continue
            # Highlight FALSE in red. str() lets boolean False and the text
            # "FALSE" be treated the same way.
            for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
                for col in checked_columns:
                    cell = row[col]
                    if str(cell.value).strip().upper() == "FALSE":
                        cell.fill = red_fill
            # Save to date folder (not site)
            date_folder = os.path.join(folder_path, date_str)
            if not os.path.exists(date_folder):
                os.makedirs(date_folder, exist_ok=True)
                print(f"Created new folder: {date_folder}")
            new_path = os.path.join(date_folder, filename)
            # The highlighted workbook is written to the date folder; the
            # source file in folder_path is not removed.
            wb.save(new_path)
            print(f"Processed and moved: {filename} -> {date_folder}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"Error processing {filename}: {e}")

In [28]:
# Set your folder path here
# NOTE(review): this is an absolute path on one machine; change it before
# running the notebook anywhere else. Later cells also read folder_path.
folder_path = "/Users/yanyi.wang/Desktop/ymha/output"


# Highlight FALSE check-ins/check-outs in every matching attendance workbook
# and save the result under a per-date subfolder of folder_path.
process_attendance_files(folder_path)

All files in folder: ['ymha_attendance-site_college_track-date_2025-06-21.xlsx', 'ymha_completion_summary-site_laney.xlsx', 'ymha_completion_summary-site_azusa_pacific.xlsx', 'ymha_attendance-site_palomar-date_2025-06-25.xlsx', 'ymha_attendance-site_laney-date_2025-06-27.xlsx', 'ymha_attendance-site_manuel_dominguez-date_2025-07-03.xlsx', 'ymha_attendance-site_uc_riverside-date_2025-06-24.xlsx', 'ymha_attendance-site_ohlone-date_2025-06-18.xlsx', 'ymha_attendance-site_Pierce-date_2025-06-28.xlsx', 'ymha_attendance-site_san_diego_city-date_2025-07-03.xlsx', 'ymha_attendance-site_el_camino-date_2025-07-01.xlsx', 'ymha_attendance-site_azusa_pacific-date_2025-06-16.xlsx', 'ymha_attendance-site_west_la-date_2025-06-25.xlsx', 'ymha_attendance-site_orange coast-date_2025-06-16.xlsx', 'ymha_attendance-site_contra_costa-date_2025-06-30.xlsx', 'ymha_attendance-site_long_beach_city-date_2025-06-17.xlsx', 'ymha_attendance-site_cal_sacramento-date_2025-07-02.xlsx', 'ymha_attendance-site_grossmont-d

# Complete Data Processing Workflow

This notebook will:
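1. Search for all files whose name starts with 'ymha_completion-site_' and ends with '.xlsx'.
2. For each file, add a 'complete' column that summarizes the assessment columns as TRUE, PARTIALLY TRUE, or FALSE.
3. Save the processed files into the 'complete' folder.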

In [37]:
import openpyxl
from openpyxl.styles import PatternFill

# Assessment columns that together decide whether a row is complete.
assessment_columns = [
    "AMHLQ",
    "Healthcare Access",
    "Impact and Professional Importance",
    "MAP + MH Interest",
    "MEIM-6",
    "Mental Health Attitudes and Help-Seeking",
    "Mental Healthcare",
    "PCRB",
    "PEARLS + CRISIS",
    "PSC",
    "SCCS",
    "SCCT",
    "SEHS-HE",
    "Student Week 1 Assessment",
]

# Define highlight fills
red_fill = PatternFill(start_color="F4CCCC", end_color="F4CCCC", fill_type="solid")
yellow_fill = PatternFill(start_color="FFF2CC", end_color="FFF2CC", fill_type="solid")

# Create output folder
complete_folder = os.path.join(folder_path, "complete")
os.makedirs(complete_folder, exist_ok=True)

# List the input folder here so the cell does not depend on a `files`
# variable left over from some other cell.
files = os.listdir(folder_path)

for file in files:
    if not (file.startswith("ymha_completion-site_") and file.endswith(".xlsx")):
        continue

    file_path = os.path.join(folder_path, file)
    wb = openpyxl.load_workbook(file_path)
    ws = wb.active

    # Get header
    header = [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))]
    # Drop up to two trailing columns if they are 'Student Week 2 Assessment'
    # or a 'complete' column from an earlier run.
    for _ in range(2):
        if header and header[-1] in ["Student Week 2 Assessment", "complete"]:
            ws.delete_cols(len(header))
            header.pop()

    # 0-based positions of the assessment columns present in this file.
    # Columns missing from the header are ignored.
    assessment_positions = [
        header.index(col) for col in assessment_columns if col in header
    ]
    if not assessment_positions:
        # With nothing to check, all([]) would mark every row "TRUE".
        print(f"No assessment columns found in {file}; skipped.")
        continue

    # Add new 'complete' column (1-based index for ws.cell).
    complete_col_idx = len(header) + 1
    ws.cell(row=1, column=complete_col_idx, value="complete")
    header.append("complete")

    for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
        # str() lets boolean True and the text "TRUE" be treated the same way.
        values = [
            str(row[pos].value).strip().upper() == "TRUE"
            for pos in assessment_positions
        ]
        if all(values):
            result = "TRUE"
        elif not any(values):
            result = "FALSE"
        else:
            result = "PARTIALLY TRUE"
        status_cell = ws.cell(row=row[0].row, column=complete_col_idx, value=result)

        # Highlight as needed
        if result == "FALSE":
            status_cell.fill = red_fill
        elif result == "PARTIALLY TRUE":
            status_cell.fill = yellow_fill

    # Save to new folder
    new_path = os.path.join(complete_folder, file)
    wb.save(new_path)
    print(f"Processed and saved: {file} -> {new_path}")

Processed and saved: ymha_completion-site_compton.xlsx -> /Users/yanyi.wang/Desktop/ymha/output/complete/ymha_completion-site_compton.xlsx
Processed and saved: ymha_completion-site_orange coast.xlsx -> /Users/yanyi.wang/Desktop/ymha/output/complete/ymha_completion-site_orange coast.xlsx
Processed and saved: ymha_completion-site_grossmont.xlsx -> /Users/yanyi.wang/Desktop/ymha/output/complete/ymha_completion-site_grossmont.xlsx
Processed and saved: ymha_completion-site_redlands.xlsx -> /Users/yanyi.wang/Desktop/ymha/output/complete/ymha_completion-site_redlands.xlsx
Processed and saved: ymha_completion-site_chabot.xlsx -> /Users/yanyi.wang/Desktop/ymha/output/complete/ymha_completion-site_chabot.xlsx
Processed and saved: ymha_completion-site_uc_riverside.xlsx -> /Users/yanyi.wang/Desktop/ymha/output/complete/ymha_completion-site_uc_riverside.xlsx
Processed and saved: ymha_completion-site_cal_sacramento.xlsx -> /Users/yanyi.wang/Desktop/ymha/output/complete/ymha_completion-site_cal_sacra

In [None]:
def organize_site_files_by_date(site_name=None, date_str=None, folder_path=None):
    """
    Organize the attendance and completion files for a given date into subfolders by site.

    For each site with an attendance file for the given date:
        - Create ``folder_path/{date_str}/{site}``.
        - Copy the attendance file from ``folder_path`` and the matching
          completion file from ``folder_path/complete`` into that subfolder.
    A missing source file is reported with ``print`` and skipped.

    Args:
        site_name: If given, only this site is processed; otherwise the sites
            are discovered from the attendance filenames in ``folder_path``.
        date_str: Date exactly as it appears in the filenames, e.g. "2025-07-02". Required.
        folder_path: Directory holding the attendance files and the
            'complete' subfolder. Required.

    Returns:
        None.
    """
    if folder_path is None or date_str is None:
        print("folder_path and date_str are required.")
        return

    if site_name:
        site_names = [site_name]
    else:
        if not os.path.isdir(folder_path):
            print(f"Folder {folder_path} does not exist.")
            return
        # Find all attendance files for the date. The date is escaped so it is
        # matched literally, and fullmatch rejects names with trailing text.
        pattern = re.compile(
            r"ymha_attendance-site_(.+)-date_{}\.xlsx".format(re.escape(date_str))
        )
        # List the folder directly instead of reading a global `files` variable.
        site_names = []
        for fname in sorted(os.listdir(folder_path)):
            found = pattern.fullmatch(fname)
            if found:
                site_names.append(found.group(1))

    date_folder = os.path.join(folder_path, date_str)
    for site in site_names:
        attendance_file = f"ymha_attendance-site_{site}-date_{date_str}.xlsx"
        completion_file = f"ymha_completion-site_{site}.xlsx"

        site_folder = os.path.join(date_folder, site)
        os.makedirs(site_folder, exist_ok=True)

        # NOTE(review): the attendance file is copied from the top level of
        # folder_path, not from the date folder where process_attendance_files
        # saves its highlighted copy. Confirm which version is wanted here.
        att_src = os.path.join(folder_path, attendance_file)
        att_dst = os.path.join(site_folder, attendance_file)
        if os.path.exists(att_src):
            shutil.copy2(att_src, att_dst)
            print(f"Copied {attendance_file} to {site_folder}")
        else:
            print(f"Attendance file not found: {att_src}")

        comp_src = os.path.join(folder_path, "complete", completion_file)
        comp_dst = os.path.join(site_folder, completion_file)
        if os.path.exists(comp_src):
            shutil.copy2(comp_src, comp_dst)
            print(f"Copied {completion_file} to {site_folder}")
        else:
            print(f"Completion file not found: {comp_src}")


In [None]:
organize_site_files_by_date(date_str="2025-07-02", folder_path=folder_path)

Copied ymha_attendance-site_cal_sacramento-date_2025-07-02.xlsx to /Users/yanyi.wang/Desktop/ymha/output/2025-07-02/cal_sacramento
Copied ymha_completion-site_cal_sacramento.xlsx to /Users/yanyi.wang/Desktop/ymha/output/2025-07-02/cal_sacramento
Copied ymha_attendance-site_ohlone-date_2025-07-02.xlsx to /Users/yanyi.wang/Desktop/ymha/output/2025-07-02/ohlone
Copied ymha_completion-site_ohlone.xlsx to /Users/yanyi.wang/Desktop/ymha/output/2025-07-02/ohlone
Copied ymha_attendance-site_grossmont-date_2025-07-02.xlsx to /Users/yanyi.wang/Desktop/ymha/output/2025-07-02/grossmont
Copied ymha_completion-site_grossmont.xlsx to /Users/yanyi.wang/Desktop/ymha/output/2025-07-02/grossmont
Copied ymha_attendance-site_san_diego_city-date_2025-07-02.xlsx to /Users/yanyi.wang/Desktop/ymha/output/2025-07-02/san_diego_city
Copied ymha_completion-site_san_diego_city.xlsx to /Users/yanyi.wang/Desktop/ymha/output/2025-07-02/san_diego_city
Copied ymha_attendance-site_manuel_dominguez-date_2025-07-02.xlsx to