<a href="https://colab.research.google.com/github/stevenbowler/EmployeeSurvey/blob/main/EmployeeSurvey8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Two-page survey, read from a Google Drive folder rather than from Hugging Face.

In [None]:
# CELL 1 – Install
!apt-get update -qq
!apt-get install -y poppler-utils -qq
!pip install -q opencv-python pandas matplotlib openai huggingface_hub

In [None]:
# CELL 2 – Imports & secrets
import os, glob, cv2, numpy as np, json, re
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from google.colab import files
from google.colab import userdata
from huggingface_hub import snapshot_download
from openai import OpenAI

In [None]:
# CELL  3a – link to PNGs in google drive
# All three settings come from the Colab secret store. HF_TOKEN and REPO_ID
# are read here but not used by this cell.
HF_TOKEN = userdata.get('HF_TOKEN')      # your HF token
REPO_ID  = userdata.get('REPO_ID')       # e.g. "you/survey-pngs"
PNG_DIR  = userdata.get('PNG_TARGET_PATH')

# Gather every .png below PNG_DIR (any depth) and put the paths in sorted order.
png_paths = glob.glob(f"{PNG_DIR}/**/*.png", recursive=True)
png_paths.sort()
print(f"Found {len(png_paths)} PNGs")

In [None]:
# CELL 3b – alternative to 3a: download the PNGs from a Hugging Face dataset repo.
# Do not run this cell if 3a was run; both cells define PNG_DIR and png_paths.
HF_TOKEN = userdata.get('HF_TOKEN')      # your HF token
REPO_ID  = userdata.get('REPO_ID')       # e.g. "you/survey-pngs"
PNG_DIR  = "/content/survey_pngs"        # local download target (not a Drive mount)

os.makedirs(PNG_DIR, exist_ok=True)
snapshot_download(
    repo_id=REPO_ID,
    token=HF_TOKEN,
    local_dir=PNG_DIR,
    repo_type="dataset",
    # Patterns are matched fnmatch-style against the repo-relative path, where
    # "*" also crosses "/". "*.png" therefore selects PNGs at every depth,
    # whereas "**/*.png" would require a "/" and skip PNGs at the repo root.
    allow_patterns=["*.png"],
    ignore_patterns=["*.json","*.md"]
)

# Index what was downloaded, recursively, in sorted order.
png_paths = sorted(glob.glob(f"{PNG_DIR}/**/*.png", recursive=True))
print(f"Downloaded {len(png_paths)} PNGs")

In [None]:
# CELL 4 – Pair pages
# survey_dict: survey name -> {page number: path of that page's PNG}
survey_dict = defaultdict(dict)
# Expected file name: "<name>" + optional "_" + optional space/hyphen
# + "page<digits>.png" (case-insensitive); leading zeros of the page number are dropped.
pattern = re.compile(r"(.+?)_?[ -]?page0*(\d{1,3})\.png$", re.IGNORECASE)

for png_path in png_paths:
    match = pattern.search(Path(png_path).name)
    if match is None:
        continue  # file name does not follow the page naming scheme
    name = match.group(1).strip()
    page = int(match.group(2))
    survey_dict[name][page] = png_path

# A survey is usable only when both page 1 and page 2 were found.
valid_surveys = {
    survey: pages
    for survey, pages in survey_dict.items()
    if pages.keys() >= {1, 2}
}
print(f"{len(valid_surveys)} complete surveys")

In [None]:
# CELL 5 – Pixel-perfect ROI
# Every ROI is an (x, y, w, h) rectangle in page pixel coordinates, where
# (x, y) is the top-left corner of the answer box.

# Answer column -> (x of the box, box width, box height)
COLS = {'A':(750,38,38), 'B':(910,38,38), 'C':(1070,38,38), 'D':(1230,38,38)}

# y of each answer row on page 1 (questions 1-20, in order)
Y_PAGE1 = [208,253,298,343,388,433,478,523,568,613,
           658,703,748,793,838,883,928,973,1018,1063]

# y of each answer row on page 2 (questions 21-23, in order)
Y_PAGE2 = [208,253,298]


def _build_rois(row_ys, first_question):
    """Return {question: {choice: (x, y, w, h)}} for consecutive questions.

    row_ys         -- y coordinate of each question row, in question order
    first_question -- question number assigned to row_ys[0]
    """
    return {
        q: {c: (x, y, w, h) for c, (x, w, h) in COLS.items()}
        for q, y in enumerate(row_ys, first_question)
    }


ROIS_PAGE1 = _build_rois(Y_PAGE1, 1)
# Built straight from COLS: deriving these from ROIS_PAGE1[1] tuples mixed up
# the y and width fields and produced 208-px-wide boxes.
ROIS_PAGE2 = _build_rois(Y_PAGE2, 21)

In [None]:
# CELL 6 – Ultra-aggressive preprocessing
def preprocess(img):
    """Convert a BGR page image into an inverted binary mask.

    Steps: grayscale -> global histogram equalisation -> CLAHE contrast
    boost -> 11x11 morphological closing -> Gaussian adaptive threshold.

    Returns a single-channel image. Because the threshold type is
    THRESH_BINARY_INV, pixels darker than their local threshold come out
    as 255 and all other pixels as 0.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Two-stage contrast stretch: global equalisation, then local CLAHE.
    local_eq = cv2.createCLAHE(clipLimit=6.0, tileGridSize=(8,8))
    boosted = local_eq.apply(cv2.equalizeHist(grayscale))

    # NOTE(review): a grayscale closing tends to wipe out dark details narrower
    # than the 11x11 kernel — confirm this really helps faint marks survive.
    square_11 = np.ones((11,11), np.uint8)
    smoothed = cv2.morphologyEx(boosted, cv2.MORPH_CLOSE, square_11)

    # Local threshold over a 71-px neighbourhood with an offset of 30.
    return cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        71,
        30,
    )

In [None]:
# CELL 7 – Detect faint marks
def is_filled(roi, thresh, ratio=0.18):   # 18 % ink = mark
    """Decide whether the answer box `roi` contains a mark.

    roi    -- (x, y, w, h) rectangle, top-left corner plus size, in pixels
    thresh -- inverted binary page mask as returned by preprocess():
              ink pixels are non-zero (255), background pixels are 0
    ratio  -- minimum fraction of ink pixels for the box to count as marked

    Returns True when at least `ratio` of the pixels inside the ROI are ink.
    Returns False when the ROI lies completely outside the image.
    """
    x, y, w, h = roi
    crop = thresh[y:y+h, x:x+w]
    if crop.size == 0:
        return False
    # Count ink (non-zero) pixels: the mask is inverted, so zeros are blank
    # paper and counting them would flag every empty box as filled.
    # Divide by the real crop size so a ROI clipped by the image border is
    # judged on the pixels that actually exist.
    return bool(np.count_nonzero(crop) / crop.size >= ratio)

In [None]:
# CELL 8 – Extract one survey
def _load_mask(path):
    """Read one page image from `path` and return its preprocess() mask."""
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports a missing/undecodable file by returning None;
        # fail here with the path instead of deep inside cvtColor.
        raise OSError(f"Could not read survey page image: {path}")
    return preprocess(img)


def _first_marked(choice_rois, mask):
    """Return the first choice (A-D order) whose box is filled, else None."""
    for c in 'ABCD':
        if is_filled(choice_rois[c], mask):
            return c
    return None


def extract(p1, p2):
    """Read the answers of one two-page survey.

    p1, p2 -- paths of the page-1 and page-2 PNG files

    Returns a list with one entry per question (question 1 first): the
    letter 'A'-'D' of the first box detected as filled, or 'X' when no box
    of that question is filled.

    Raises OSError when either page image cannot be read.
    """
    answers = ['X'] * (len(ROIS_PAGE1) + len(ROIS_PAGE2))
    for path, page_rois in ((p1, ROIS_PAGE1), (p2, ROIS_PAGE2)):
        mask = _load_mask(path)
        for q, choice_rois in page_rois.items():
            choice = _first_marked(choice_rois, mask)
            if choice is not None:
                answers[q-1] = choice   # question numbers are 1-based
    return answers

In [None]:
# CELL 9 – RUN
# Parse every complete survey; echo its name and its answer string as we go.
all_answers = []
for name, pages in valid_surveys.items():
    print(f"{name}")
    page_one, page_two = pages[1], pages[2]
    ans = extract(page_one, page_two)
    print("".join(ans))
    all_answers.append(ans)

print(f"\n{len(all_answers)} surveys parsed")

In [None]:
# CELL 10 – Aggregate
# stats[question][choice] -> number of surveys that gave that answer
stats = defaultdict(lambda: defaultdict(int))
for answers in all_answers:
    for question, choice in enumerate(answers, 1):
        if choice == 'X':
            continue  # 'X' = no answer detected; left out of the counts
        stats[question][choice] += 1

# percentages[question] -> {choice: percent}, highest percentage first.
# Percentages are relative to the answered surveys of that question.
percentages = {}
for question, counts in stats.items():
    answered = sum(counts.values())
    shares = [(choice, round(n/answered*100,1)) for choice, n in counts.items()]
    shares.sort(key=lambda pair: -pair[1])
    percentages[question] = dict(shares)

print("\nRESULTS:")
for q, distribution in sorted(percentages.items()):
    print(f"Q{q:02d}: {distribution}")

In [None]:
# CELL 11 – Grok-4 summary (one call)
# The OpenAI client is pointed at the x.ai endpoint; the key comes from Colab secrets.
client = OpenAI(
    base_url="https://api.x.ai/v1",
    api_key=userdata.get('XAI_API_KEY'),
)

# Single user prompt carrying the survey count and the percentage table as JSON.
prompt = f"""You are a data-analyst.
{len(all_answers)} employees answered 23 questions (A=Strongly Agree … D=Strongly Disagree).

Percentages:
{json.dumps(percentages, indent=2)}

Write a concise markdown report with:
1. Overall insights
2. Table of top answer per question
3. Patterns / anomalies
4. Visual description
5. 3 actionable recommendations
"""

chat_messages = [{"role":"user","content":prompt}]
resp = client.chat.completions.create(
    model="grok-4",
    messages=chat_messages,
    max_tokens=1500,
    temperature=0.3,
)

print("\n### Grok-4 Report")
first_choice = resp.choices[0]
print(first_choice.message.content)

In [None]:
# CELL 12 – Charts + CSV
# One bar chart per question, laid out on a grid that grows with the number
# of questions (5 rows x 5 columns for the 23 questions of this survey).
CHOICES = 'ABCD'
N_COLS = 5
n_rows = max(1, -(-len(percentages) // N_COLS))   # ceiling division, at least one row
fig, axs = plt.subplots(n_rows, N_COLS, figsize=(4*N_COLS, 4*n_rows), squeeze=False)
axs = axs.ravel()
colors = {'A':'#2ca02c','B':'#1f77b4','C':'#ff7f0e','D':'#d62728'}
for ax, (q, per) in zip(axs, sorted(percentages.items())):
    # Always draw A-D in the same order (0 for choices nobody picked) so the
    # subplots can be compared at a glance.
    heights = [per.get(c, 0) for c in CHOICES]
    ax.bar(list(CHOICES), heights, color=[colors.get(c,'gray') for c in CHOICES])
    ax.set_title(f'Q{q}')
    ax.set_ylim(0,100)
    ax.set_ylabel('% of answers')
# Hide the grid cells that have no question.
for ax in axs[len(percentages):]: ax.axis('off')
plt.tight_layout(); plt.show()

# One row per question, one column per choice; missing choices become 0.
df = pd.DataFrame.from_dict(percentages,orient='index').fillna(0)
df.to_csv("hf_png_results.csv")
# `files` is imported from google.colab in the imports cell.
files.download("hf_png_results.csv")