<a href="https://colab.research.google.com/github/stevenbowler/EmployeeSurvey/blob/main/EmployeeSurvey9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Single page, read from google drive, not huggingface

In [1]:
# CELL 1 – Install
!apt-get update -qq
!apt-get install -y poppler-utils -qq
!pip install -q opencv-python pandas matplotlib openai huggingface_hub

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 125080 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.11_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.11) ...
Setting up poppler-utils (22.02.0-2ubuntu0.11) ...
Processing triggers for man-db (2.10.2-1) ...


In [2]:
# CELL 2 – Imports & secrets
import os, glob, cv2, numpy as np, json, re
from pathlib import Path
from collections import defaultdict
from huggingface_hub import snapshot_download
from openai import OpenAI
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import userdata

In [5]:
# CELL 3a – read PNGs from Google Drive, don't use cell 3b
HF_TOKEN = userdata.get('HF_TOKEN')
REPO_ID  = userdata.get('REPO_ID')
PNG_DIR  = userdata.get('PNG_TARGET_PATH')

from google.colab import drive
drive.mount('/content/drive')

png_paths = sorted(glob.glob(f"{PNG_DIR}/**/*.png", recursive=True))
print(f"Found {len(png_paths)} PNGs — each is one full survey")

Mounted at /content/drive
Found 9 PNGs — each is one full survey


In [None]:
# CELL 3b – Download PNGs from HF
HF_TOKEN = userdata.get('HF_TOKEN')
REPO_ID  = userdata.get('REPO_ID')
PNG_DIR  = "/content/survey_pngs"

# os.makedirs(PNG_DIR, exist_ok=True)  #UNCOMMENT THIS LINE TO USE
snapshot_download(
    repo_id=REPO_ID,
    token=HF_TOKEN,
    local_dir=PNG_DIR,
    repo_type="dataset",
    allow_patterns=["**/*.png"]
)

png_paths = sorted(glob.glob(f"{PNG_DIR}/**/*.png", recursive=True))
print(f"Downloaded {len(png_paths)} PNGs — each is one full survey")

In [6]:
# CELL 4 – Single-page ROI (23 questions)
COLS = {'A':750, 'B':910, 'C':1070, 'D':1230}
WIDTH, HEIGHT = 38, 38
Y_START = 208
Y_STEP  = 45   # exact spacing in your forms

ROIS = {}
for q in range(1, 24):
    y = Y_START + (q-1) * Y_STEP
    ROIS[q] = {c: (COLS[c], y, WIDTH, HEIGHT) for c in 'ABCD'}

In [7]:
# CELL 5 – Ultra-sensitive preprocessing
def preprocess(img):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    eq = cv2.equalizeHist(gray)
    clahe = cv2.createCLAHE(clipLimit=7.0, tileGridSize=(8,8))
    enhanced = clahe.apply(eq)
    kernel = np.ones((13,13), np.uint8)   # catches microscopic dots
    closed = cv2.morphologyEx(enhanced, cv2.MORPH_CLOSE, kernel)
    thresh = cv2.adaptiveThreshold(closed, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 81, 35)
    return thresh

In [8]:
# CELL 6 – Detect any mark
def is_filled(roi, thresh, ratio=0.15):
    x,y,w,h = roi
    crop = thresh[y:y+h, x:x+w]
    if crop.size == 0: return False
    return np.sum(crop == 0) / (w*h) >= ratio

In [9]:
# CELL 7 – Extract one PNG
def extract(png_path):
    img = cv2.imread(png_path)
    t = preprocess(img)
    answers = ['X'] * 23
    for q in range(1, 24):
        for c in 'ABCD':
            if is_filled(ROIS[q][c], t):
                answers[q-1] = c
                break
    return answers

In [10]:
# CELL 8 – RUN
all_answers = []
for p in png_paths:
    name = Path(p).stem
    print(f"{name}")
    ans = extract(p)
    all_answers.append(ans)
    print("".join(ans))

print(f"\n{len(all_answers)} surveys parsed")

Accounting_1_page001
AAAAAAAAAAAAAAAAAAAAAAA
Accounting_2_page001
AAAAAAAAAAAAAAAAAAAAAAA
Accounting_3_page001
AAAAAAAAAAAAAAAAAAAAAAA
Area A Shift 1_1_page001
AAAAAAAAAAAAAAAAAAAAAAA
Area A Shift 1_2_page001
AAAAAAAAAAAAAAAAAAAAAAA
Area A Shift 2_1_page001
AAAAAAAAAAAAAAAAAAAAAAA
Area A Shift 2_2_page001
AAAAAAAAAAAAAAAAAAAAAAA
Area A Shift 3_1_page001
AAAAAAAAAAAAAAAAAAAAAAA
Area A Shift 3_2_page001
AAAAAAAAAAAAAAAAAAAAAAA

9 surveys parsed


In [None]:
# CELL 9 – Aggregate + Grok summary
stats = defaultdict(lambda: defaultdict(int))
for row in all_answers:
    for q,ch in enumerate(row,1):
        if ch!='X': stats[q][ch] += 1

percentages = {}
for q,cnt in stats.items():
    tot = sum(cnt.values())
    percentages[q] = {c: round(v/tot*100,1) for c,v in cnt.items()}

client = OpenAI(api_key=userdata.get('XAI_API_KEY'), base_url="https://api.x.ai/v1")
prompt = f"""23-question single-page survey results ({len(all_answers)} responses)
A=Strongly Agree, B=Agree, C=Disagree, D=Strongly Disagree

Top % per question:
{json.dumps(percentages, indent=2)}

Write a 1-page markdown report with:
1. Top 3 strongest / weakest areas
2. Table: Q | Top answer | %
3. One surprising pattern
4. 3 bullet recommendations
"""

resp = client.chat.completions.create(model="grok-4", messages=[{"role":"user","content":prompt}], max_tokens=1200, temperature=0.3)
print("\n### Grok-4 Summary")
print(resp.choices[0].message.content)

In [None]:
# CELL 10 – Charts + CSV
fig, axs = plt.subplots(5,5,figsize=(20,18))
axs = axs.ravel()
colors = {'A':'#2ca02c','B':'#1f77b4','C':'#ff7f0e','D':'#d62728'}
for i,(q,per) in enumerate(sorted(percentages.items())):
    ch,val = list(per.keys()), list(per.values())
    axs[i].bar(ch,val,color=[colors.get(c,'gray') for c in ch])
    axs[i].set_title(f'Q{q}'); axs[i].set_ylim(0,100)
for ax in axs[len(percentages):]: ax.axis('off')
plt.tight_layout(); plt.show()

df = pd.DataFrame.from_dict(percentages,orient='index').fillna(0)
df.to_csv("single_page_results.csv")
from google.colab import files
files.download("single_page_results.csv")

In [None]:
def debug(png_path):
    img = cv2.imread(png_path)
    t = preprocess(img)
    for q in range(1,24):
        for c in 'ABCD':
            x,y,w,h = ROIS[q][c]
            fill = is_filled((x,y,w,h), t)
            cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0) if fill else (0,0,255),2)
            cv2.putText(img,f"{q}{c}",(x,y-8),0,0.5,(0,255,0) if fill else (0,0,255),1)
    from google.colab.patches import cv2_imshow
    cv2_imshow(img)

debug(png_paths[0])