<a href="https://colab.research.google.com/github/stevenbowler/EmployeeSurvey/blob/main/EmployeeSurvey4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1 – Install everything
# The leading "!" runs each line as a shell command (notebook magic), not as Python.
# System packages, installed quietly (-qq):
!apt-get update -qq
!apt-get install -y tesseract-ocr poppler-utils -qq
# Python packages, installed quietly (-q).
# NOTE(review): no visible cell imports pymupdf or calls tesseract — confirm they are still needed.
!pip install -q pdf2image huggingface_hub openai pillow pymupdf opencv-python pandas matplotlib

In [None]:
# Cell 2 – Imports & secrets (skip HF download for samples; use local)
import os, glob, json, cv2, numpy as np
from io import BytesIO
from pathlib import Path
from collections import defaultdict
from pdf2image import convert_from_path
from openai import OpenAI
from huggingface_hub import snapshot_download
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import userdata, files  # For optional upload

# Configuration is read from Colab secrets (userdata).
# The first three are required: userdata.get() raises if one is not defined,
# which is the desired fail-fast behaviour for credentials.
HF_TOKEN   = userdata.get('HF_TOKEN')      # HF token for private repo
REPO_ID    = userdata.get('REPO_ID')       # e.g. "yourname/survey-pdfs"
XAI_API_KEY= userdata.get('XAI_API_KEY')   # xAI API key

# PDF_FOLDER_PATH is optional. userdata.get() raises SecretNotFoundError for
# an undefined secret, so a plain `or` fallback only covers an *empty* value;
# catch the exception so an undefined secret also falls back to the default.
try:
    PDF_DIR = userdata.get('PDF_FOLDER_PATH') or "/content/pdfs"
except userdata.SecretNotFoundError:
    PDF_DIR = "/content/pdfs"

In [None]:
# DON'T USE THIS
# Cell 3 – Local alternative to the Hugging Face download: collect the sample
# PDFs from the working directory, or ask for an upload when none are present.
local_matches = glob.glob("Accounting_*.pdf")
local_matches.extend(glob.glob("Employee Survey v4.pdf"))
pdf_paths = sorted(local_matches)

if len(pdf_paths) == 0:
    print("No PDFs found. Upload via Colab: files.upload()")
    uploaded = files.upload()
    # After the upload, take every PDF in the working directory.
    pdf_paths = sorted(glob.glob("*.pdf"))

pdf_names = [Path(p).name for p in pdf_paths]
print(f"Found {len(pdf_paths)} PDF files: {pdf_names}")

In [None]:
# Fetch the survey PDFs from the Hugging Face dataset repo into PDF_DIR,
# then index every PDF found under the returned snapshot directory.
os.makedirs(PDF_DIR, exist_ok=True)

download_options = dict(
    repo_id=REPO_ID,
    token=HF_TOKEN,
    local_dir=PDF_DIR,
    repo_type="dataset",
    allow_patterns=["*.pdf"],
    ignore_patterns=["*.json","*.md","*config*"],
)
pdf_dir = snapshot_download(**download_options)

pdf_glob = f"{pdf_dir}/**/*.pdf"
pdf_paths = sorted(glob.glob(pdf_glob, recursive=True))
print(f"Found {len(pdf_paths)} PDF files")

In [None]:
# NOTE(review): this cell repeats the preceding download cell verbatim;
# running both just performs the same snapshot_download call twice.
# Consider deleting one of them.
os.makedirs(PDF_DIR, exist_ok=True)
# Download only *.pdf files of the dataset repo into PDF_DIR.
pdf_dir = snapshot_download(
    repo_id=REPO_ID,
    token=HF_TOKEN,
    local_dir=PDF_DIR,
    repo_type="dataset",
    allow_patterns=["*.pdf"],
    ignore_patterns=["*.json","*.md","*config*"]
)
# Index every PDF below the returned directory, in sorted order.
pdf_paths = sorted(glob.glob(f"{pdf_dir}/**/*.pdf", recursive=True))
print(f"Found {len(pdf_paths)} PDF files")

In [None]:
# Cell 4 – Checkbox ROI table (pixel coordinates for pages rendered at DPI=300)
NUM_QUESTIONS = 23

# Column templates: (x, y_offset_added_to_row_y, width, height).
COL_A = (650, 0, 35, 35)   # Strongly Agree
COL_B = (810, 0, 35, 35)   # Agree
COL_C = (970, 0, 35, 35)   # Disagree
COL_D = (1130, 0, 35, 35)  # Strongly Disagree

# Row y-coordinates on page 1 (Q1 at 205px, then +45px per row).
Y_OFFSETS_PAGE1 = [
    205, 250, 295, 340, 385,   # Q1-5
    430, 475, 520, 565, 610,   # Q6-10
    655, 700, 745, 790, 835,   # Q11-15
    880, 925, 970, 1015, 1060  # Q16-20
]

# Row y-coordinates on page 2 (assumed to match the top rows of page 1).
Y_OFFSETS_PAGE2 = [205, 250, 295]  # Q21-23

_ANSWER_COLUMNS = (COL_A, COL_B, COL_C, COL_D)


def _row_rois(row_y):
    """Return the four (x, y, w, h) boxes (A..D) for a row starting at row_y."""
    return [(cx, row_y + cy, cw, ch) for cx, cy, cw, ch in _ANSWER_COLUMNS]


# CHECKBOX_ROIS[page_index][question_number] -> list of four (x, y, w, h) boxes.
CHECKBOX_ROIS = {
    0: {q: _row_rois(y) for q, y in enumerate(Y_OFFSETS_PAGE1, 1)},
    1: {q: _row_rois(y) for q, y in enumerate(Y_OFFSETS_PAGE2, 21)},
}

print("ROIs calibrated. Ready to process.")

In [None]:
# Cell 5 – Enhanced preprocessing + detection (extra EQ for low contrast)
def preprocess_image(img_cv):
    """Turn a BGR page image into a binary image for checkbox inspection.

    Steps, in order: grayscale conversion, global histogram equalization,
    CLAHE, morphological closing, inverted adaptive Gaussian threshold.

    Args:
        img_cv: Colour image in BGR channel order (it is converted with
            cv2.COLOR_BGR2GRAY).

    Returns:
        The single-channel output of cv2.adaptiveThreshold (values 0 / 255).
        With THRESH_BINARY_INV, pixels below their local threshold become 255
        and all others become 0.
    """
    contrast_limiter = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (4,4))

    # Contrast boost: global equalization first, then local (tile-based) CLAHE.
    boosted = contrast_limiter.apply(
        cv2.equalizeHist(cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY))
    )

    # Morphological closing with a 4x4 elliptical kernel.
    bridged = cv2.morphologyEx(boosted, cv2.MORPH_CLOSE, closing_kernel)

    # Gaussian-weighted local threshold: block size 35, constant 10, inverted.
    return cv2.adaptiveThreshold(
        bridged, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 35, 10
    )

def is_filled(roi, thresh_img, fill_ratio=0.40):
    """Decide whether a checkbox region counts as filled.

    The region is considered filled when the share of zero-valued pixels
    inside it is at least ``fill_ratio``.

    Args:
        roi: ``(x, y, w, h)`` box in pixel coordinates of ``thresh_img``.
        thresh_img: 2-D thresholded image (NumPy array).
        fill_ratio: Minimum fraction of zero pixels required (default 0.40).

    Returns:
        bool: True if the ratio is reached. False if it is not, or if the box
        has a non-positive size or does not lie fully inside the image.

    NOTE(review): preprocess_image in this file thresholds with
    THRESH_BINARY_INV, under which pixels below their local threshold become
    255 rather than 0 — confirm that counting zeros is the intended polarity.
    """
    x, y, w, h = roi

    # Reject degenerate boxes, and negative origins: NumPy would interpret a
    # negative start index as "from the end" and silently crop another area.
    if w <= 0 or h <= 0 or x < 0 or y < 0:
        return False

    img_h, img_w = thresh_img.shape[:2]
    if y + h > img_h or x + w > img_w:
        return False

    crop = thresh_img[y:y + h, x:x + w]
    zero_pixels = np.count_nonzero(crop == 0)
    return bool(zero_pixels / (w * h) >= fill_ratio)

def extract_answers(pdf_path):
    """Read the ticked answer for every question of one survey PDF.

    Each page is rendered at 300 DPI, preprocessed with preprocess_image, and
    every question row registered for that page in CHECKBOX_ROIS is probed
    with is_filled. The first filled column (in A, B, C, ... order) wins.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of NUM_QUESTIONS single-character strings: the column letter
        for each question, or 'X' where no filled box was detected. Returns
        None if the PDF could not be converted to images.
    """
    try:
        pages = convert_from_path(pdf_path, dpi=300)
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not abort the batch.
        print(f"Conversion error {pdf_path}: {e}")
        return None

    answers = ['X'] * NUM_QUESTIONS
    for page_idx, pil_img in enumerate(pages):
        # The ROI table is the single source of truth for which questions
        # live on which page; pages without ROIs are skipped before the
        # (comparatively expensive) preprocessing.
        page_rois = CHECKBOX_ROIS.get(page_idx)
        if not page_rois:
            continue

        img_cv = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
        thresh = preprocess_image(img_cv)

        for q, rois in page_rois.items():
            # Ignore ROI entries that do not map to a slot in `answers`.
            if not rois or not 1 <= q <= NUM_QUESTIONS:
                continue
            for col_idx, roi in enumerate(rois, 1):
                if is_filled(roi, thresh):
                    answers[q - 1] = chr(64 + col_idx)  # 1 -> 'A', 2 -> 'B', ...
                    break  # Single fill per row
    return answers

In [None]:
# Cell 6 – Run extraction on every PDF; keep a sheet only when all of the
# first 20 answers (page 1) were detected.
all_answers = []
total_pdfs = len(pdf_paths)
for i, p in enumerate(pdf_paths, 1):
    print(f"[{i}/{total_pdfs}] {Path(p).name}")
    ans = extract_answers(p)

    # No result at all, or at least one undetected ('X') answer on page 1.
    if not ans or 'X' in ans[:20]:
        print("   → Partial/fail (check page 2 or tweak ROIs)")
        continue

    all_answers.append(ans)
    print(f"   → Success: {''.join(ans[:5])}... (first 5)")

print(f"\nParsed {len(all_answers)} / {len(pdf_paths)} PDFs with valid data")

In [None]:
# Cell 7 – Per-question answer shares, in percent
# stats[question][choice] -> number of sheets with that choice ('X' is ignored).
stats = defaultdict(lambda: defaultdict(int))
for row in all_answers:
    for q, choice in enumerate(row, 1):
        if choice == 'X':
            continue
        stats[q][choice] += 1

total_resp = len(all_answers)

# percentages[question] -> {choice: percent}, largest share first.
# Shares are relative to the answers detected for that question.
percentages = {}
for q, cnt in stats.items():
    tot = sum(cnt.values())
    if tot <= 0:
        continue
    shares = [(c, round(v / tot * 100, 1)) for c, v in cnt.items()]
    shares.sort(key=lambda pair: pair[1], reverse=True)
    percentages[q] = dict(shares)

print(f"Total responses: {total_resp}")
for q in sorted(percentages):
    p = percentages[q]
    print(f"Q{q:2d}: {p}")

In [None]:
# Cell 8 – Grok-4 summary (1 call)
# The OpenAI client is pointed at the xAI endpoint through base_url.
client = OpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")
MODEL = "grok-4"  # Or "grok-4-fast-reasoning"

# Prompt embeds the respondent count and the per-question percentages as JSON.
# json.dumps writes the integer question numbers as string keys.
summary_prompt = f"""You are a data analyst.
Survey results from {total_resp} respondents (23 Qs: A=Strongly Agree, B=Agree, C=Disagree, D=Strongly Disagree).

Percentages (top first):
{json.dumps(percentages, indent=2)}

Professional markdown report:
1. Key insights (e.g., strong agreement areas).
2. Table: Q | Top Choice | %
3. Patterns/anomalies (e.g., management fairness).
4. Chart description.
5. 3 recommendations.

Concise (<600 words)."""

# Single chat-completion request; the reply text is printed as-is.
resp = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": summary_prompt}],
    max_tokens=1500,
    temperature=0.3
)
print("\n### Grok-4 Report")
print(resp.choices[0].message.content)

In [None]:
# Cell 9 – One bar chart per question, then CSV export of the percentages
n_cols = 5
# Ceil-divide so the grid always has room for every question (at least one
# row); 23 questions still give the original 5x5 grid and 20x20 figure.
rows = max(1, -(-len(percentages) // n_cols))
fig, axs = plt.subplots(rows, n_cols, figsize=(20, 4 * rows), squeeze=False)
axs = axs.ravel()

colors = {'A': '#2ca02c', 'B': '#1f77b4', 'C': '#ff7f0e', 'D': '#d62728'}
for idx, (q, per) in enumerate(sorted(percentages.items())):
    ax = axs[idx]
    choices = list(per.keys())
    vals = list(per.values())
    bars = ax.bar(choices, vals, color=[colors.get(c, 'gray') for c in choices])
    ax.set_title(f'Q{q}')
    ax.set_ylim(0, 100)
    # Add % labels just above each bar
    for bar, v in zip(bars, vals):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{v}%', ha='center', va='bottom')

# Hide the grid cells that have no question assigned.
for ax in axs[len(percentages):]:
    ax.axis('off')
plt.tight_layout()
plt.show()

# CSV: one row per question, one column per choice; missing choices become 0.
df = pd.DataFrame.from_dict(percentages, orient='index').fillna(0)
# Join with a real path separator (plain concatenation produced
# "<PDF_DIR>survey_results_fixed.csv") and make sure the folder exists.
os.makedirs(PDF_DIR, exist_ok=True)
csv_path = os.path.join(PDF_DIR, "survey_results_fixed.csv")
df.to_csv(csv_path)
print(f"Exported: {csv_path}")
files.download(csv_path)  # Auto-download

In [None]:
#Cell 10 - Visualize ROIs
def debug_extract(pdf_path, page_idx=0, save_thresh=False):
    """Display one PDF page with its checkbox ROIs and the thresholded image.

    Every ROI registered for ``page_idx`` in CHECKBOX_ROIS is drawn on the
    page and labelled "Q<number><letter>": green when is_filled reports it as
    filled, red otherwise. The thresholded image from preprocess_image is
    shown afterwards.

    Args:
        pdf_path: Path of the PDF to inspect.
        page_idx: Zero-based page index (negative values index from the end).
        save_thresh: When True, also write the thresholded image to
            'thresh_debug.png' and trigger a Colab download of it.

    Raises:
        IndexError: If the PDF has no page at ``page_idx``.
    """
    from google.colab.patches import cv2_imshow

    if page_idx >= 0:
        # Render only the requested page instead of the whole document;
        # pdf2image page numbers are 1-based.
        pages = convert_from_path(
            pdf_path, dpi=300, first_page=page_idx + 1, last_page=page_idx + 1
        )
        if not pages:
            raise IndexError(f"{pdf_path} has no page at index {page_idx}")
        page = pages[0]
    else:
        # Negative indices need the full page list to count from the end.
        page = convert_from_path(pdf_path, dpi=300)[page_idx]

    img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
    thresh = preprocess_image(img_cv)

    # Draw ROIs (colours are BGR: (0, 255, 0) = green, (0, 0, 255) = red)
    for q, rois in CHECKBOX_ROIS.get(page_idx, {}).items():
        for col, roi in enumerate(rois, 1):
            x, y, w, h = roi
            color = (0, 255, 0) if is_filled(roi, thresh) else (0, 0, 255)
            cv2.rectangle(img_cv, (x, y), (x+w, y+h), color, 2)
            cv2.putText(img_cv, f"Q{q}{chr(64+col)}", (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 1)

    # Show original with boxes
    print("Original with ROIs (green=filled, red=empty):")
    cv2_imshow(img_cv)

    # Show the thresholded image that is_filled inspects
    print("Threshold image (black areas = detected marks):")
    cv2_imshow(thresh)

    if save_thresh:
        cv2.imwrite('thresh_debug.png', thresh)
        files.download('thresh_debug.png')

# Run on the first PDF, if any were found (indexing an empty list would raise).
if pdf_paths:
    debug_extract(pdf_paths[0])
else:
    print("No PDFs available to debug; run a PDF discovery/download cell first.")