<a href="https://colab.research.google.com/github/stevenbowler/EmployeeSurvey/blob/main/EmployeeSurvey2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install dependencies
!apt-get update -qq
!apt-get install -y tesseract-ocr poppler-utils -qq
!pip install -q pdf2image huggingface_hub openai pillow pymupdf

In [None]:
# Cell 2: Imports
import os
import glob
import base64
import json
from io import BytesIO
from pathlib import Path
from collections import defaultdict
from pdf2image import convert_from_path
from openai import OpenAI
from huggingface_hub import snapshot_download
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import userdata

In [None]:
# Cell 3: Configuration
# --- Plain constants (edit these directly) ---
MODEL = "grok-4"  # alternative: "grok-4-fast-reasoning" for cheaper/faster runs
NUM_QUESTIONS = 20  # number of numbered questions on each survey form

# --- Values read through google.colab's userdata (replace with your own) ---
HF_TOKEN = userdata.get('HF_TOKEN')  # Hugging Face token for the private repo
REPO_ID = userdata.get('REPO_ID')  # e.g., "user/private-survey-pdfs"
XAI_API_KEY = userdata.get('XAI_API_KEY')  # obtain from https://console.x.ai/
PDF_DIR = userdata.get('PDF_FOLDER_PATH')  # local download folder, e.g. "/content/pdfs"

In [None]:
# Cell 4: Fetch the survey PDFs from the private Hugging Face dataset repo
os.makedirs(PDF_DIR, exist_ok=True)

# Keyword arguments for snapshot_download, gathered in one place for readability
download_options = {
    "repo_id": REPO_ID,
    "token": HF_TOKEN,
    "local_dir": PDF_DIR,
    "repo_type": "dataset",
    "allow_patterns": ["*.pdf"],
    "ignore_patterns": ["*.json", "*.md", "*config*"],
}
pdf_dir = snapshot_download(**download_options)
print(f"Downloaded PDFs to {pdf_dir}")

# Collect every PDF below the download folder (recursively), in sorted order
pdf_paths = glob.glob(f"{pdf_dir}/**/*.pdf", recursive=True)
pdf_paths.sort()
print(f"Found {len(pdf_paths)} PDF files")

In [None]:
# Cell 5: Build the OpenAI-compatible client that talks to the xAI endpoint
XAI_BASE_URL = "https://api.x.ai/v1"
client = OpenAI(api_key=XAI_API_KEY, base_url=XAI_BASE_URL)

In [None]:
# Cell 6: Extract answers using Grok-4 Vision
def pdf_to_base64(pdf_path, dpi=300):
    """Render every page of a PDF and return the pages as base64-encoded PNGs.

    Args:
        pdf_path: Path of the PDF file to rasterise.
        dpi: Resolution passed to ``convert_from_path``. Defaults to 300, the
            value that used to be hard-coded here; lower it to shrink the
            image payload, raise it if small checkboxes are missed.

    Returns:
        A list with one base64 string (PNG bytes, decoded to ``str``) per
        page, in page order. An empty list is returned when conversion fails;
        the failure is printed instead of raised so a batch run keeps going.
    """
    try:
        pages = convert_from_path(pdf_path, dpi=dpi)
        encoded_pages = []
        for page in pages:
            # The context manager releases each in-memory buffer right after encoding.
            with BytesIO() as buffer:
                page.save(buffer, format="PNG")
                encoded_pages.append(base64.b64encode(buffer.getvalue()).decode())
        return encoded_pages
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller skip the file.
        print(f"Failed to convert {pdf_path}: {e}")
        return []

# Letters the prompt allows: four agreement levels plus "X" for unclear/none/multiple.
_VALID_CHOICES = ("A", "B", "C", "D", "X")


def _extract_json_array(text):
    """Return the outermost ``[...]`` span of text, dropping code fences or stray prose around it."""
    cleaned = text.strip()
    start = cleaned.find("[")
    end = cleaned.rfind("]")
    if 0 <= start < end:
        return cleaned[start:end + 1]
    return cleaned


def _normalize_choice(value):
    """Map one raw model answer to 'A'-'D', or to 'X' when it is not a recognised letter."""
    if isinstance(value, str):
        letter = value.strip().upper()
        if letter in _VALID_CHOICES:
            return letter
    return "X"


def extract_answers(pdf_path):
    """Ask the vision model which checkbox is selected for every survey question.

    All pages of the PDF are rendered via ``pdf_to_base64`` and sent, together
    with the instruction prompt, in a single chat-completion request.

    Args:
        pdf_path: Path of the survey PDF to analyse.

    Returns:
        A list of ``NUM_QUESTIONS`` uppercase letters ('A'-'D', or 'X' for
        unclear), or ``None`` when the PDF could not be rendered, the API call
        failed, or the reply was not a JSON array of the expected length.
        Failures are printed, never raised.
    """
    page_images = pdf_to_base64(pdf_path)
    if not page_images:
        return None

    # The question count comes from NUM_QUESTIONS so the prompt and the length
    # check below can never disagree.
    prompt_text = f"""You are an expert at reading checkbox-based survey forms.
This is a scanned/image employee survey form spanning up to 2 pages with exactly {NUM_QUESTIONS} numbered questions (1-{NUM_QUESTIONS}).
Ignore section headers (e.g., "Safe Working Environment") even if they have checkboxes.
Each question is in a table row: question text on the left, then 4 square checkboxes on the right.
From left to right: Strongly Agree (A), Agree (B), Disagree (C), Strongly Disagree (D).
The selected choice is a filled black square (■); unselected are empty (☐).
Identify the single selected choice for each question 1-{NUM_QUESTIONS}.
If none selected, multiple selected, or unclear, use "X".
Return ONLY a valid JSON array of {NUM_QUESTIONS} uppercase letters, e.g., ["A", "B", "C", ..., "D"].
Do not add any other text or explanations."""

    # One text part followed by one image part per page.
    message_parts = [{"type": "text", "text": prompt_text}]
    message_parts.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
        for b64 in page_images
    )

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": message_parts}],
            max_tokens=200,  # Slightly increased for safety
            temperature=0  # Deterministic for vision tasks
        )
        raw_reply = response.choices[0].message.content
    except Exception as e:
        print(f"API error for {pdf_path}: {e}")
        return None

    # A missing or blank reply is a format problem, not an API error.
    if not isinstance(raw_reply, str) or not raw_reply.strip():
        print(f"Invalid format from {pdf_path}: empty response")
        return None

    # Models sometimes wrap the array in a Markdown code fence; unwrap before parsing.
    reply_text = _extract_json_array(raw_reply)
    try:
        answers = json.loads(reply_text)
    except json.JSONDecodeError:
        print(f"JSON parse error from {pdf_path}: {raw_reply.strip()}")
        return None

    if not isinstance(answers, list) or len(answers) != NUM_QUESTIONS:
        print(f"Invalid format from {pdf_path}: {raw_reply.strip()}")
        return None

    # Anything that is not a recognised letter (null, numbers, "E", ...) counts as unclear.
    return [_normalize_choice(answer) for answer in answers]

# Run the extraction over every downloaded PDF, keeping only the successful results
all_answers = []
total_pdfs = len(pdf_paths)
for position, pdf_path in enumerate(pdf_paths, start=1):
    print(f"Processing {position}/{total_pdfs}: {Path(pdf_path).name}")
    extracted = extract_answers(pdf_path)
    if not extracted:
        # None (or an empty list) means the extraction failed for this file
        print(f"Skipped {pdf_path} due to extraction failure")
        continue
    all_answers.append(extracted)

print(f"Successfully processed {len(all_answers)} / {total_pdfs} PDFs")

In [None]:
# Cell 7: Tally the answers per question and turn the tallies into percentages
# question_stats[question_number][choice] -> number of respondents who picked it
question_stats = defaultdict(lambda: defaultdict(int))
for response_answers in all_answers:
    for q_number, letter in enumerate(response_answers, start=1):
        question_stats[q_number][letter] += 1

total_responses = len(all_answers)

# percentages[question_number] -> {choice: percent}, most popular choice first
percentages = {}
for q_number, tally in question_stats.items():
    answered = sum(tally.values())
    if answered <= 0:
        continue
    ranked = sorted(
        ((letter, hits / answered * 100) for letter, hits in tally.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    percentages[q_number] = dict(ranked)

print(f"Total responses: {total_responses}")
print("Percentages per question:")
for q_number, ranked_percs in percentages.items():
    print(f"Q{q_number}: {ranked_percs}")

In [None]:
# Cell 8: Generate a narrative summary report from the aggregated percentages
if total_responses == 0:
    # With no extracted responses the percentage table is empty; asking the
    # model to "summarise" it would spend an API call on a report with no data.
    print("No survey responses were extracted; skipping the summary report.")
else:
    summary_prompt = f"""You are a data analyst expert.
Here is the survey results from {total_responses} respondents for {NUM_QUESTIONS} multiple-choice questions.

Percentages (most popular first):
{json.dumps(percentages, indent=2)}

Generate a professional summary report:
1. Overall insights: most consistent questions, popular choices.
2. Table of top answer % per question.
3. Any patterns or anomalies.
4. Visual description (as if charts).
5. Recommendations if it were a survey.

Use markdown with tables. Be concise yet insightful."""

    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": summary_prompt}],
        max_tokens=2000,
        temperature=0.3
    )

    print("### Grok-4 Expert Summary Report")
    print(response.choices[0].message.content)

In [None]:
# Cell 9: Local visualization (one bar chart per question) and CSV export
GRID_COLS = 5
# Size the grid for whichever is larger, the configured question count or the
# questions actually present, and always keep at least one row.
n_panels = max(NUM_QUESTIONS, len(percentages), 1)
rows = -(-n_panels // GRID_COLS)  # ceiling division
# squeeze=False guarantees a 2-D axes array, so ravel() works for any grid shape.
fig, axes = plt.subplots(rows, GRID_COLS, figsize=(20, 4 * rows), squeeze=False)
axes = axes.ravel()

# Colours are keyed by answer, not by bar position: bars are ordered by
# popularity, so positional colours would give the same answer a different
# colour in every subplot.
choice_colors = {
    'A': '#FF6B6B',
    'B': '#4ECDC4',
    'C': '#45B7D1',
    'D': '#96CEB4',
    'X': '#FFEAA7',
}
fallback_color = '#B0B0B0'  # used for any unexpected answer label

for idx, (q, percs) in enumerate(percentages.items()):
    choices = list(percs.keys())
    vals = list(percs.values())
    bar_colors = [choice_colors.get(choice, fallback_color) for choice in choices]
    axes[idx].bar(choices, vals, color=bar_colors)
    axes[idx].set_title(f'Q{q} (%)')
    axes[idx].set_ylim(0, 100)

for idx in range(len(percentages), len(axes)):
    axes[idx].axis('off')  # Hide unused subplots

plt.tight_layout()
plt.show()

# Export to CSV: one row per question, one column per answer, 0 where nobody chose it
df = pd.DataFrame(percentages).T.fillna(0)
df.to_csv(f'{PDF_DIR}/survey_percentages.csv')
print("Exported to survey_percentages.csv")