<a href="https://colab.research.google.com/github/stevenbowler/EmployeeSurvey/blob/main/EmployeeSurvey5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get update -qq
!apt-get install -y poppler-utils -qq
!pip install -q pdf2image huggingface_hub opencv-python pandas matplotlib openai

In [None]:
import os, glob, cv2, numpy as np, json
from pathlib import Path
from collections import defaultdict
from pdf2image import convert_from_path
from huggingface_hub import snapshot_download
from openai import OpenAI
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import userdata

In [None]:
# --- Configuration ----------------------------------------------------------
# Credentials and repo id come from the Colab `userdata` store, not the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')   # Hugging Face access token
REPO_ID = userdata.get('REPO_ID')     # dataset repo id, e.g. "yourname/survey-pdfs"
PDF_DIR = "/content/survey_pdfs"      # local download target

os.makedirs(PDF_DIR, exist_ok=True)

# --- Download ---------------------------------------------------------------
# Pull only the PDF files of the dataset repo into PDF_DIR.
pdf_dir = snapshot_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    token=HF_TOKEN,
    local_dir=PDF_DIR,
    allow_patterns=["*.pdf"],
    ignore_patterns=["*.json", "*.md", "*config*"],
)

# Every PDF below the download directory, in sorted path order.
pdf_paths = glob.glob(f"{pdf_dir}/**/*.pdf", recursive=True)
pdf_paths = sorted(pdf_paths)
print(f"Downloaded {len(pdf_paths)} PDFs")

In [None]:
# Answer-box geometry in pixels for pages rendered at 300 DPI
# (page image size ≈ 2480 × 3500 px).
# Per answer column: (x of the box's left edge, box width, box height).
COLS = {
    'A': (750, 38, 38),   # x, w, h
    'B': (910, 38, 38),
    'C': (1070, 38, 38),
    'D': (1230, 38, 38)
}

# Top-edge y position of each answer row for Q1-20 (page 1), 45 px apart.
Y_PAGE1 = [
    208, 253, 298, 343, 388,
    433, 478, 523, 568, 613,
    658, 703, 748, 793, 838,
    883, 928, 973, 1018, 1063
]

# Top-edge y positions for Q21-23 (page 2) – same spacing as page 1.
Y_PAGE2 = [208, 253, 298]


def _row_rois(y):
    """Return ``{choice: (x, y, w, h)}`` for the answer row whose top edge is ``y``."""
    return {choice: (x, y, w, h) for choice, (x, w, h) in COLS.items()}


# Page 1: question number -> {choice: (x, y, w, h)}
ROIS_PAGE1 = {q: _row_rois(y) for q, y in enumerate(Y_PAGE1, 1)}

# Page 2 (Q21-23).
# Built from COLS directly: a page-1 ROI is already (x, y, w, h), so re-slicing
# it as (x, w, h) would put y into the width slot and give 208-px-wide boxes
# that overlap the neighbouring columns.
ROIS_PAGE2 = {q: _row_rois(y) for q, y in enumerate(Y_PAGE2, 21)}

In [None]:
def preprocess(img):
    """Turn a BGR page image into an inverted binary mask.

    Steps, in order: grayscale conversion, global histogram equalisation,
    CLAHE (clip limit 3.0, 8x8 tiles), 5x5 morphological closing, then
    Gaussian adaptive thresholding (block size 41, C = 12) using
    ``THRESH_BINARY_INV`` with a maximum value of 255.

    Args:
        img: page image in BGR channel order.

    Returns:
        The thresholded single-channel image.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Contrast: global equalisation first, then tile-based CLAHE on top of it.
    contrast = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(
        cv2.equalizeHist(grayscale)
    )

    # Closing with a 5x5 all-ones structuring element.
    smoothed = cv2.morphologyEx(
        contrast, cv2.MORPH_CLOSE, np.ones((5, 5), dtype=np.uint8)
    )

    return cv2.adaptiveThreshold(
        smoothed, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,
        41, 12,
    )

In [None]:
def is_filled(roi, thresh, ratio=0.35):
    """Decide whether the answer box ``roi`` counts as marked.

    Args:
        roi: ``(x, y, w, h)`` box in pixel coordinates of ``thresh``.
        thresh: 2-D thresholded page image (e.g. the output of ``preprocess``).
        ratio: minimum fraction of zero-valued pixels inside the box for it
            to count as filled.

    Returns:
        ``True`` when the share of zero-valued pixels in the part of the box
        that lies inside the image is at least ``ratio``; ``False`` otherwise,
        including when the box lies entirely outside the image.
    """
    x, y, w, h = roi
    # Clamp to >= 0 so a negative coordinate cannot wrap around and select
    # pixels from the opposite edge of the image.
    top, left = max(y, 0), max(x, 0)
    bottom, right = max(y + h, 0), max(x + w, 0)
    crop = thresh[top:bottom, left:right]
    if crop.size == 0:
        return False
    # Normalise by the pixels actually cropped, not the nominal w*h, so a box
    # clipped by the image border is not under-counted.
    # NOTE(review): preprocess() thresholds with THRESH_BINARY_INV, which
    # usually maps dark ink to 255; counting `== 0` therefore looks like it
    # measures non-ink pixels. Polarity left as-is — confirm against real scans.
    return bool(np.count_nonzero(crop == 0) / crop.size >= ratio)

In [None]:
def _first_filled_choice(rois, thresh):
    """Return the first of 'A'-'D' whose box is filled in ``thresh``, else 'X'."""
    for choice in 'ABCD':
        if is_filled(rois[choice], thresh):
            return choice
    return 'X'


def extract(pdf_path):
    """Read the 23 multiple-choice answers from one survey PDF.

    The first page is scanned with ``ROIS_PAGE1`` (Q1-20) and the second
    page, when present, with ``ROIS_PAGE2`` (Q21-23).

    Args:
        pdf_path: path of the PDF to read.

    Returns:
        A list of 23 one-character strings. Each entry is the first choice
        among ``'A'``..``'D'`` whose box is detected as filled, or ``'X'``
        when no box is detected or the page is missing.
    """
    # Only the first two pages carry answers, so don't rasterise the rest.
    pages = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=2)
    answers = ['X'] * 23

    # (ROI table, question numbers) per page, in page order. zip() stops at
    # the shorter sequence, so PDFs with fewer pages keep 'X' for the rest.
    layouts = (
        (ROIS_PAGE1, range(1, 21)),
        (ROIS_PAGE2, range(21, 24)),
    )
    for page, (rois_by_question, questions) in zip(pages, layouts):
        # pdf2image yields RGB images; preprocess() expects BGR.
        img = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
        thresh = preprocess(img)
        for q in questions:
            answers[q - 1] = _first_filled_choice(rois_by_question[q], thresh)

    return answers

In [None]:
# Run the extractor over every downloaded PDF, echoing each answer string.
all_answers = []
for i, p in enumerate(pdf_paths, start=1):
    print(f"[{i}/{len(pdf_paths)}] {Path(p).name}")
    ans = extract(p)
    print(" →", "".join(ans))
    all_answers += [ans]

print(f"\nParsed {len(all_answers)} PDFs")

Expected output for `Accounting_1.pdf` (`X` = no filled box detected for that question):
 → ABABCBDXAADBCAXXACBDBAD

In [None]:
# Tally how often each choice was picked per question. 'X' entries are left
# out, so the percentages below are relative to detected answers only.
stats = defaultdict(lambda: defaultdict(int))
for row in all_answers:
    for q, ch in enumerate(row, 1):
        if ch == 'X':
            continue
        stats[q][ch] += 1

# Counts -> percentages rounded to one decimal, highest percentage first
# (ties keep their first-seen order).
percentages = {}
for q, cnt in stats.items():
    tot = sum(cnt.values())
    per = [(c, round(v/tot*100, 1)) for c, v in cnt.items()]
    per.sort(key=lambda pair: pair[1], reverse=True)
    percentages[q] = dict(per)

print("\nRESULTS:")
for q in sorted(percentages):
    print(f"Q{q:02d}: {percentages[q]}")

In [None]:
# Ask Grok, through the OpenAI client pointed at the x.ai endpoint, to write
# up the aggregated percentages as a markdown report.
XAI_API_KEY = userdata.get('XAI_API_KEY')
client = OpenAI(base_url="https://api.x.ai/v1", api_key=XAI_API_KEY)

summary_prompt = f"""You are a data-analyst.
Aggregated results from {len(all_answers)} employees (23 questions, A=Strongly Agree … D=Strongly Disagree).

Percentages (top first):
{json.dumps(percentages, indent=2)}

Write a concise markdown report with:
1. Overall insights
2. Table of top answer per question
3. Patterns / anomalies
4. Visual description
5. 3 actionable recommendations
"""

# Single-turn chat request carrying the prompt as the only user message.
messages = [{"role": "user", "content": summary_prompt}]
resp = client.chat.completions.create(
    model="grok-4",
    temperature=0.3,
    max_tokens=1500,
    messages=messages,
)
print("\n### Grok-4 Report")
print(resp.choices[0].message.content)

In [None]:
# One bar chart per question on a 5x5 grid; unused panels are hidden.
CHOICES = ['A', 'B', 'C', 'D']
fig, axs = plt.subplots(5, 5, figsize=(20, 20))
axs = axs.ravel()
colors = {'A':'#2ca02c','B':'#1f77b4','C':'#ff7f0e','D':'#d62728'}
for ax, (q, per) in zip(axs, sorted(percentages.items())):
    # Fixed A-D order (0 for choices nobody picked) keeps every panel's
    # x-axis identical, so panels can be compared at a glance.
    val = [per.get(c, 0) for c in CHOICES]
    ax.bar(CHOICES, val, color=[colors.get(c, 'gray') for c in CHOICES])
    ax.set_title(f'Q{q}')
    ax.set_ylim(0, 100)
for ax in axs[len(percentages):]: ax.axis('off')
plt.tight_layout()
plt.show()

# Export the table: rows = questions in numeric order, columns = A-D.
# Without the explicit ordering, rows and columns follow first-seen order.
df = (
    pd.DataFrame.from_dict(percentages, orient='index')
    .reindex(columns=CHOICES)
    .fillna(0)
    .sort_index()
)
csv_path = f"{PDF_DIR}/survey_results.csv"
df.to_csv(csv_path)
print(f"CSV saved → {csv_path}")

In [None]:
def debug(pdf_path, page=0):
    """Show one page with every answer box outlined by detection result.

    Args:
        pdf_path: PDF to render (at 300 DPI).
        page: zero-based page index. 0 uses ``ROIS_PAGE1`` (Q1-20); any other
            value uses ``ROIS_PAGE2`` (Q21-23).

    Boxes that ``is_filled`` reports as filled are drawn in green, all others
    in red, each labelled with question number and choice. The annotated
    image is displayed with ``cv2_imshow``.
    """
    rendered = convert_from_path(pdf_path, dpi=300)
    img = cv2.cvtColor(np.array(rendered[page]), cv2.COLOR_RGB2BGR)
    # Threshold before drawing so the overlays cannot influence detection.
    thresh = preprocess(img)

    if page == 0:
        rois, questions = ROIS_PAGE1, range(1, 21)
    else:
        rois, questions = ROIS_PAGE2, range(21, 24)

    green, red = (0, 255, 0), (0, 0, 255)  # BGR colour tuples
    for q in questions:
        boxes = rois[q]
        for ch in 'ABCD':
            x, y, w, h = boxes[ch]
            color = green if is_filled((x, y, w, h), thresh) else red
            cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
            cv2.putText(img, f"{q}{ch}", (x, y - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

    from google.colab.patches import cv2_imshow
    cv2_imshow(img)

# Visual check of the page-1 answer boxes on the first PDF in pdf_paths.
debug(pdf_paths[0])  # Green = filled, red = not filled