# In [None]:
"""

text2trait: Big Five Personality Trait Classification using GPT-4o

This script processes a semantically chunked stream-of-consciousness text dataset to classify the presence or absence
of the Big Five personality traits (Openness, Conscientiousness, Extroversion, Agreeableness, Neuroticism)
in text chunks using OpenAI's GPT-4o model.

Condition: TEXT ONLY (baseline) - zero-shot prompting using the text alone

The process involves:
- Building a detailed prompt that asks GPT-4o to classify all Big Five traits in a single JSON response.
- Sending one request per text chunk, in which each of the five traits is judged individually.
- Appending the model's structured JSON output to the results file one record at a time, so an
  interrupted run can resume without losing earlier results.

"""

import os
import json
import openai
from tqdm.auto import tqdm
import time

###############################################################################
# CONFIGURE OPENAI CLIENT FOR V1.0+ USAGE
###############################################################################

# API key: taken from the OPENAI_API_KEY environment variable so that a real
# secret never has to be written into this file. When the variable is unset the
# original placeholder is used, so the previous edit-in-place workflow still works.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI-API-KEY")

# Model name passed to every chat-completions request
MODEL_NAME = "gpt-4o-2024-11-20"

# Optional: Set custom endpoint or organization ID if required
# (the v1.0+ client names the endpoint setting `base_url`, not `api_base`)
# openai.base_url = "https://api.openai.com/v1"
# openai.organization = "<ORG_ID>"

###############################################################################
# TRAIT LIST AND PROMPT
###############################################################################

# The Big Five traits, in the order they are listed in the prompt.
TRAIT_LIST = [
    "Openness to Experience",
    "Conscientiousness",
    "Extroversion",
    "Agreeableness",
    "Neuroticism",
]


def build_prompt_for_all_traits(text_chunk: str) -> str:
    """Compose the single prompt that asks for every trait in ``TRAIT_LIST``.

    The traits are rendered as a "- name" bullet list and spliced into a fixed
    instruction template together with ``text_chunk``. The bullets are joined
    with bare newlines, so only the first one inherits the template's
    four-space indent.

    Args:
        text_chunk: The text to be classified; inserted verbatim after
            "The text: ".

    Returns:
        The filled-in prompt with leading and trailing whitespace removed
        (which also removes any whitespace at the end of ``text_chunk``).
    """
    bullet_lines = "\n".join(map("- {}".format, TRAIT_LIST))

    # Plain (non-f) template: doubled braces are literal JSON braces, the two
    # named fields are filled in below. Substituted values are not re-parsed,
    # so braces inside the text chunk are safe.
    template = """You are an intelligent and disciplined assistant trained to determine
    the presence or absence of each of the Big Five personality traits in
    a stream-of-consciousness text. The traits are:
    {traits_str}

    **Your output must be valid JSON** with the structure:

    {{
      "traits": [
        {{
          "trait": "Openness to Experience",
          "result": "y|n|NaN",
          "result_justification": "...",
          "confidence_score": 0.0,
          "confidence_score_justification": "..."
        }},
        {{
          "trait": "Extroversion",
          "result": "y|n|NaN",
          "result_justification": "...",
          "confidence_score": 0.0,
          "confidence_score_justification": "..."
        }},
        ...
      ]
    }}

    Rules:
    1. If insufficient data, set 'result'='NaN', 'result_justification'='NaN',
       'confidence_score'=0.0, 'confidence_score_justification'='NaN'.
    2. No extra text, no code fences, no keys beyond what is shown.
    3. The 'result' MUST be 'y', 'n', or 'NaN'.

    The text: {text_chunk}
    """
    filled = template.format(traits_str=bullet_lines, text_chunk=text_chunk)
    return filled.strip()

def classify_text_chunk(text_chunk: str) -> str:
    """Ask the model to classify one text chunk and return its raw reply.

    Builds the all-traits prompt for ``text_chunk`` and sends it, together with
    a fixed system message, through the v1.0+ ``openai.chat.completions.create``
    call with ``response_format`` set to JSON-object mode.

    Args:
        text_chunk: The text to classify.

    Returns:
        The message content of the first choice, unparsed.
        NOTE(review): presumably a JSON string; confirm whether the SDK can
        return ``None`` here, since callers treat the value as ``str``.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful, disciplined assistant that outputs JSON only.",
    }
    user_message = {
        "role": "user",
        "content": build_prompt_for_all_traits(text_chunk),
    }

    completion = openai.chat.completions.create(
        model=MODEL_NAME,
        messages=[system_message, user_message],
        response_format={"type": "json_object"},
        temperature=1.0,
        max_tokens=1200,
    )

    # Only the first choice is requested and used.
    return completion.choices[0].message.content

###############################################################################
# MAIN LOGIC: READ LINES, CALL MODEL, SAVE RESULTS INCREMENTALLY
###############################################################################

def _count_lines(path: str) -> int:
    """Return the number of lines in *path*, or 0 if the file does not exist."""
    if not os.path.exists(path):
        return 0
    with open(path, "r", encoding="utf-8") as handle:
        return sum(1 for _ in handle)


def main(
    input_json: str = "full_chunked_local_minima_pass_2_0.40.json",
    output_file: str = "structured_big5_results_gpt-4o_temp_1.0.json",
    total_lines_to_process: int = 2000,
) -> None:
    """Classify each input row with the model and append one record per row.

    The input is read as JSON Lines (one JSON object per line). For every row a
    record ``{"author_id", "chunk_number", "model_output"}`` is appended to
    ``output_file``; failures are stored as records whose ``model_output``
    holds an ``"error"`` key. Because each consumed input line yields exactly
    one output line, the number of lines already in ``output_file`` tells a
    resumed run how many input lines to skip.

    Args:
        input_json: Path of the JSON Lines input file.
        output_file: Path of the JSON Lines results file (opened in append mode).
        total_lines_to_process: Stop once this many records (successful or
            failed, including those from earlier runs) exist in ``output_file``.

    Raises:
        json.JSONDecodeError: If an input line is not valid JSON.
    """
    if not os.path.exists(input_json):
        print(f"[ERROR] Input file does not exist: {input_json}")
        return

    # Records written by earlier runs == input lines already consumed.
    processed_count = _count_lines(output_file)
    print(f"[INFO] Found {processed_count} lines already processed.")

    with open(output_file, "a", encoding="utf-8") as out_f:
        with open(input_json, "r", encoding="utf-8") as f:
            with tqdm(
                total=total_lines_to_process,
                desc="Processing Chunks",
                initial=processed_count,
            ) as pbar:
                for i, line in enumerate(f):
                    if i < processed_count:
                        continue  # Skip already processed lines

                    if processed_count >= total_lines_to_process:
                        break  # Stop if total lines reached

                    row = json.loads(line)
                    author_id = row.get("#AUTHID", "")
                    chunk_number = row.get("Chunk Number", "")
                    text_chunk = row.get("TEXT", "")

                    print(f"\n[PROCESSING] Row {i} => Author: {author_id}, Chunk: {chunk_number}")

                    try:
                        raw_json = classify_text_chunk(text_chunk)
                        print("[RAW MODEL OUTPUT]\n", raw_json)

                        # Parse the JSON to verify it's valid
                        try:
                            model_output = json.loads(raw_json)
                        except json.JSONDecodeError as e:
                            print(f"[ERROR] Could not parse JSON for row {i}: {e}")
                            model_output = {"error": "Invalid JSON", "exception": str(e)}
                    except Exception as e:
                        # Top-level boundary for a single row: one failed request
                        # must not abort the whole run, so it is logged and stored.
                        print(f"[ERROR] In call to model: {e}")
                        model_output = {"error": str(e)}

                    record = {
                        "author_id": author_id,
                        "chunk_number": chunk_number,
                        "model_output": model_output,
                    }
                    out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                    # Push the record to the OS right away so a killed process
                    # loses at most the row in flight.
                    out_f.flush()

                    # Count every written record, failed ones included: the
                    # resume logic counts output lines, so the in-run counter,
                    # the cap and the progress bar must use the same definition.
                    processed_count += 1
                    pbar.update(1)

                    # Optional: Add a small delay to avoid rate limits
                    time.sleep(0.5)

    print(f"\n[DONE] Wrote results to {output_file}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
