<a href="https://colab.research.google.com/github/stevenbowler/EmployeeSurvey/blob/main/EmployeeSurvey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Employee Survey Processor Using Grok-4

Instructions:

Run cells sequentially.
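Tokens: Add HF_TOKEN, REPO_ID, and XAI_API_KEY as Colab secrets (Cell 3 reads them with userdata.get), or replace those lines in Cell 3 with your own values.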
HF Repo: Ensure your private repo contains only PDFs (or patterns match).
Cost: One API call per PDF for extraction (~1000 calls for 1000 PDFs) + 1 call for the summary. Use grok-4-fast-reasoning to save ~90%.
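Multi-page PDFs: Only the first page of each PDF is sent. To handle more pages, call pdf_to_base64 once per page (page_num=0, 1, ...) and append one extra image_url entry per page to the message built in extract_answers.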
Errors: Check console for failed parses; manually review if needed.
Scale: For 1000 PDFs, ~1-2 hours runtime.

This code automatically:

Downloads 1000+ PDFs.
Uses Grok-4 Vision to parse scanned sheets.
Computes exact % per choice/question.
Generates AI expert summary.
Plots bar charts (static matplotlib figures) and exports the percentages to CSV.

In [None]:
# Cell 1: Install dependencies
# These are IPython shell commands (leading "!"), so this cell only runs inside
# a notebook environment such as Colab.
# Refresh the apt package index; -qq keeps the output quiet.
!apt-get update -qq
# System packages. poppler-utils presumably supplies the PDF renderer that
# pdf2image relies on - confirm.
# NOTE(review): tesseract-ocr is not referenced by any visible cell - confirm it is still needed.
!apt-get install -y tesseract-ocr poppler-utils -qq
# Python packages for the cells below; -q keeps pip output short.
# NOTE(review): pymupdf is installed but never imported in the visible cells - confirm it is still needed.
!pip install -q pdf2image huggingface_hub openai pillow pymupdf

In [None]:
# Cell 2: Imports
import os
import glob
import base64
import json
from io import BytesIO
from pathlib import Path
from collections import defaultdict, Counter
from pdf2image import convert_from_path
from openai import OpenAI
from huggingface_hub import snapshot_download
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import userdata

In [None]:
# Cell 3: Configuration
# Replace with your values

# --- Plain settings -------------------------------------------------------
# Model name; "grok-4-fast-reasoning" is the cheaper/faster alternative
MODEL = "grok-4"
# Number of answers expected on every sheet
NUM_QUESTIONS = 25
# Local folder that receives the downloaded PDFs
PDF_DIR = "/content/pdfs"

# --- Values fetched through google.colab's userdata.get --------------------
# Your HF token for the private repo
HF_TOKEN = userdata.get('HF_TOKEN')
# Repository id, e.g., "user/private-survey-pdfs"
REPO_ID = userdata.get('REPO_ID')
# xAI API key; get one from https://console.x.ai/
XAI_API_KEY = userdata.get('XAI_API_KEY')

In [None]:
# Cell 4: Download PDFs from private HF repo
# Make sure the target folder exists before the snapshot is written into it
os.makedirs(PDF_DIR, exist_ok=True)

# Keyword arguments for the snapshot call, collected in one place
download_options = {
    "repo_id": REPO_ID,
    "token": HF_TOKEN,
    "local_dir": PDF_DIR,
    "repo_type": "dataset",  # Change to "model" if it's a model repo
    "allow_patterns": ["*.pdf"],
    "ignore_patterns": ["*.json", "*.md", "*config*"],
}
pdf_dir = snapshot_download(**download_options)
print(f"Downloaded PDFs to {pdf_dir}")

# Collect every PDF under the download folder (any depth) in sorted order
pdf_pattern = f"{pdf_dir}/**/*.pdf"
pdf_paths = glob.glob(pdf_pattern, recursive=True)
pdf_paths.sort()
print(f"Found {len(pdf_paths)} PDF files")

In [None]:
# Cell 5: Initialize xAI client
# The OpenAI SDK client is pointed at the xAI endpoint via base_url
XAI_BASE_URL = "https://api.x.ai/v1"
client = OpenAI(base_url=XAI_BASE_URL, api_key=XAI_API_KEY)

In [None]:
# Cell 6: Extract answers using Grok-4 Vision
def pdf_to_base64(pdf_path, page_num=0):
    """Render a single PDF page as a PNG and return it base64-encoded.

    Args:
        pdf_path: Path of the PDF file to render.
        page_num: Zero-based index of the page to render; the default (0)
            selects the first page.

    Returns:
        The PNG image as a base64 text string, or None when the converter
        returned no image for the requested page.
    """
    # The converter is asked for exactly one page: page_num shifted by one
    target_page = page_num + 1
    rendered = convert_from_path(
        pdf_path,
        first_page=target_page,
        last_page=target_page,
    )
    if not rendered:
        return None

    # Serialize the page image to PNG in memory, then base64-encode the bytes
    png_buffer = BytesIO()
    rendered[0].save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode()

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    if not text.startswith("```"):
        return text
    # Drop the opening fence line (it may carry a language tag such as "json")
    _, _, body = text.partition("\n")
    body = body.strip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def extract_answers(pdf_path):
    """Ask the vision model for the marked answers on one scanned answer sheet.

    The first page of the PDF is rendered to a PNG with ``pdf_to_base64`` and
    sent to the chat-completions endpoint together with instructions to reply
    with a JSON array of ``NUM_QUESTIONS`` letters.

    Args:
        pdf_path: Path of the PDF answer sheet.

    Returns:
        A list of ``NUM_QUESTIONS`` uppercase strings (the prompt asks the
        model to use "X" for unclear marks), or None when the page could not
        be rendered, the API call failed, or the reply was not a well-formed
        answer list. Every failure is reported with ``print`` so that a batch
        loop over many PDFs can keep going.
    """
    # Function-scope import: the file only imports the OpenAI client class
    from openai import OpenAIError

    b64 = pdf_to_base64(pdf_path)
    if not b64:
        print(f"Failed to process {pdf_path}")
        return None

    # The question count comes from NUM_QUESTIONS so the prompt always agrees
    # with the length check applied to the reply below.
    prompt = f"""You are an expert at reading multiple-choice answer sheets.
This is a scanned/image answer sheet for a {NUM_QUESTIONS}-question survey.
Each question has choices A, B, C, D (possibly E).
Identify the marked/bubbled/chosen answer for each question 1-{NUM_QUESTIONS}.
Return ONLY a valid JSON array of {NUM_QUESTIONS} uppercase letters, e.g., ["A", "B", "C", ...].
If unclear, use "X". Do not add any other text."""

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{b64}"},
                        },
                    ],
                }
            ],
            # NOTE(review): 100 tokens leaves little headroom for the JSON
            # array; a truncated reply shows up as a JSON parse error below.
            # Confirm the limit is sufficient for the chosen model.
            max_tokens=100,
            temperature=0.1,
        )
    except OpenAIError as err:
        # One failed request must not abort the whole batch
        print(f"API error for {pdf_path}: {err}")
        return None

    # message.content can be None (e.g. an empty completion); treat it as ""
    content = (response.choices[0].message.content or "").strip()

    try:
        # Tolerate replies wrapped in a ```json ... ``` fence
        answers = json.loads(_strip_code_fence(content))
    except json.JSONDecodeError:
        print(f"JSON parse error from {pdf_path}: {content}")
        return None

    well_formed = (
        isinstance(answers, list)
        and len(answers) == NUM_QUESTIONS
        # Non-string items (null, numbers, nested lists) would break .upper()
        and all(isinstance(item, str) for item in answers)
    )
    if not well_formed:
        print(f"Invalid format from {pdf_path}: {content}")
        return None

    return [item.strip().upper() for item in answers]

# Process all PDFs
# Each PDF is sent through extract_answers; sheets that yield no usable
# answer list are skipped and only show up in the final success count.
all_answers = []
total_pdfs = len(pdf_paths)
for i, pdf_path in enumerate(pdf_paths):
    file_name = Path(pdf_path).name
    print(f"Processing {i+1}/{total_pdfs}: {file_name}")
    answers = extract_answers(pdf_path)
    if not answers:
        continue
    all_answers.append(answers)

print(f"Successfully processed {len(all_answers)} / {total_pdfs} PDFs")

In [None]:
# Cell 7: Aggregate percentages
# Tally how often each choice was given, keyed by 1-based question number
question_stats = defaultdict(lambda: defaultdict(int))
for sheet in all_answers:
    for offset, choice in enumerate(sheet):
        question_stats[offset + 1][choice] += 1

# Compute percentages
# For every question, convert the counts to shares of that question's total
# and store them ordered from the most to the least popular choice.
total_responses = len(all_answers)
percentages = {}
for q, counts in question_stats.items():
    total_q = sum(counts.values())
    shares = [(choice, count / total_q * 100) for choice, count in counts.items()]
    shares.sort(key=lambda pair: pair[1], reverse=True)
    percentages[q] = dict(shares)

print(f"Total responses: {total_responses}")
print("Percentages per question:")
for q in percentages:
    print(f"Q{q}: {percentages[q]}")

In [None]:
# Cell 8: Generate summary report with Grok-4
# The per-question percentages are embedded in the prompt as indented JSON
percentages_json = json.dumps(percentages, indent=2)
summary_prompt = f"""You are a data analyst expert.
Here is the survey results from {total_responses} respondents for 25 multiple-choice questions.

Percentages (most popular first):
{percentages_json}

Generate a professional summary report:
1. Overall insights: most consistent questions, popular choices.
2. Table of top answer % per question.
3. Any patterns or anomalies.
4. Visual description (as if charts).
5. Recommendations if it were a survey.

Use markdown with tables. Be concise yet insightful."""

# Single text-only user message; no image is attached for the summary
summary_messages = [{"role": "user", "content": summary_prompt}]
response = client.chat.completions.create(
    model=MODEL,
    messages=summary_messages,
    max_tokens=2000,
    temperature=0.3,
)

print("### Grok-4 Expert Summary Report")
summary_text = response.choices[0].message.content
print(summary_text)

In [None]:
# Cell 9: Local visualization (bar charts for each question)
# One subplot per question, five per row. The grid is sized from the data so
# that more than 25 questions do not overflow the axes array and fewer do not
# leave blank panels; with exactly 25 questions this is the same 5x5, 20x20 figure.
N_COLS = 5
n_rows = max(1, -(-len(percentages) // N_COLS))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, N_COLS, figsize=(20, 4 * n_rows), squeeze=False)
axes = axes.ravel()  # squeeze=False guarantees a 2-D array, so ravel() is always valid

# Fixed colour per choice so a given letter looks the same in every subplot;
# anything outside A-E (such as the "X" placeholder) falls back to grey.
CHOICE_COLORS = {
    'A': '#FF6B6B',
    'B': '#4ECDC4',
    'C': '#45B7D1',
    'D': '#96CEB4',
    'E': '#FFEAA7',
}
FALLBACK_COLOR = '#B2BEC3'

for idx, (q, percs) in enumerate(percentages.items()):
    # Alphabetical order keeps each choice at the same x position across
    # questions (percentages itself is ordered by popularity).
    choices = sorted(percs)
    vals = [percs[choice] for choice in choices]
    bar_colors = [CHOICE_COLORS.get(choice, FALLBACK_COLOR) for choice in choices]
    axes[idx].bar(choices, vals, color=bar_colors)
    axes[idx].set_title(f'Q{q} (%)')
    axes[idx].set_ylim(0, 100)

# Hide panels that have no question assigned to them
for unused_ax in axes[len(percentages):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Export to CSV
# Rows are questions, columns are choices; a choice nobody picked for a
# question is written as 0 instead of being left empty.
df = pd.DataFrame(percentages).T.fillna(0)
df.to_csv('/content/survey_percentages.csv')
print("Exported to /content/survey_percentages.csv")