In [None]:
import os

# Project directory, expressed as a WSL path.
wsl_path = "/mnt/g/My Drive/Sophia/MSC thesis"

# Make the project directory the current working directory, so that relative
# file names used afterwards resolve against it. os.chdir raises an OSError
# subclass (e.g. FileNotFoundError) if the directory is not accessible.
os.chdir(wsl_path)

# Echo the resulting working directory as a sanity check.
print("Current working directory:", os.getcwd())


In [None]:
import os
import json
import openai
from tqdm.auto import tqdm
import time

###############################################################################
# CONFIGURE OPENAI CLIENT FOR V1.0+ USAGE
###############################################################################

# API key: read from the OPENAI_API_KEY environment variable so the secret does
# not have to be stored in the source file. The literal below is only a
# fallback placeholder for when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI-API-KEY")  # Replace fallback with actual API key

# Model name passed as `model=` on every chat-completions request.
MODEL_NAME = "gpt-4o-2024-11-20"

###############################################################################
# TRAIT LIST AND PROMPT
###############################################################################

# The five traits the model is asked to judge. Each entry is rendered as one
# "- <trait>" bullet in the prompt built by build_prompt_for_all_traits.
TRAIT_LIST = [
    "Openness to Experience",
    "Conscientiousness",
    "Extroversion",
    "Agreeableness",
    "Neuroticism"
]

def load_programmatic_features(filepath: str) -> dict:
    """Read the features JSON file and index its entries for fast lookup.

    The file must contain a JSON array of objects, each carrying the keys
    "#AUTHID", "Chunk Number" and "features_text".

    Returns a dict mapping ``(author id, chunk number)`` to the entry's
    features text. When several entries share a key, the last one wins.
    Raises KeyError if an entry lacks one of the required keys.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    return {
        (entry["#AUTHID"], entry["Chunk Number"]): entry["features_text"]
        for entry in entries
    }

def build_prompt_for_all_traits(text_chunk: str, programmatic_features: str) -> str:
    """
    Build the user prompt that requests a classification of all Big Five
    traits in a single JSON response, including programmatic features.

    Args:
        text_chunk: The stream-of-consciousness text to classify.
        programmatic_features: Textual description of the features extracted
            from the text.

    Returns:
        The complete prompt with surrounding whitespace stripped.
    """
    traits_str = "\n".join(f"- {t}" for t in TRAIT_LIST)

    # The inputs are interpolated verbatim. Values substituted into an f-string
    # are not re-parsed as format text, so braces inside them need no escaping;
    # doubling them would send literal "{{" / "}}" to the model. Only the
    # literal braces of the JSON skeleton in the template below are doubled.
    # The template's leading indentation is part of the prompt text.
    prompt = f"""You are an intelligent and disciplined assistant trained to determine
    the presence or absence of each of the Big Five personality traits in
    a stream-of-consciousness text. The traits are:
    {traits_str}

    **You will be provided with the text and programmatic features extracted from the text. Consider these inputs in your analysis.**

    **Your task is to reason through each trait step-by-step, explaining how the text and the relevant programmatic features provide evidence (or lack thereof) for each trait before determining the final result and confidence score.**

    **Your output must be valid JSON** with the structure:

    {{
      "traits": [
        {{
          "trait": "Openness to Experience",
          "reasoning_steps": [
            "..."
          ],
          "result": "y|n|NaN",
          "result_justification": "...",
          "confidence_score": 0.0,
          "confidence_score_justification": "..."
        }},
        {{
          "trait": "Conscientiousness",
          "reasoning_steps": [
            "..."
          ],
          "result": "y|n|NaN",
          "result_justification": "...",
          "confidence_score": 0.0,
          "confidence_score_justification": "..."
        }},
        ...
      ]
    }}

    Rules:
    1. For each trait, reason step-by-step as described, then provide the final evaluation in the JSON format.
    2. Consider the provided programmatic features in your analysis and reasoning.
    3. If insufficient data, set 'result'='NaN', 'result_justification'='NaN',
       'confidence_score'=0.0, 'confidence_score_justification'='NaN'.
    4. No extra text, no code fences, no keys beyond what is shown.
    5. The 'result' MUST be 'y', 'n', or 'NaN'.
    6. Provide detailed justifications for the reasoning, results, and confidence scores.

    The text: {text_chunk}

    The programmatic features: {programmatic_features}
    """
    return prompt.strip()

def classify_text_chunk(text_chunk: str, programmatic_features: str) -> str:
    """
    Ask the model to classify one text chunk and return its raw reply.

    Uses the v1.0+ call openai.chat.completions.create with a JSON-object
    response_format. The reply is the message content of the first choice
    and is returned unparsed; validating the JSON is left to the caller.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful, disciplined assistant that outputs JSON only.",
    }
    user_message = {
        "role": "user",
        "content": build_prompt_for_all_traits(text_chunk, programmatic_features),
    }

    completion = openai.chat.completions.create(
        model=MODEL_NAME,
        messages=[system_message, user_message],
        response_format={"type": "json_object"},
        temperature=1.0,
        max_tokens=2000,
    )
    return completion.choices[0].message.content

###############################################################################
# MAIN LOGIC: READ LINES, CALL MODEL, SAVE RESULTS INCREMENTALLY
###############################################################################

def _build_record(row_index: int, line: str, features_lookup: dict) -> dict:
    """Classify one input line and return the record to append to the output.

    Failures while reading the line or calling the model are reported inside
    the record's "model_output" instead of being raised, so every input line
    yields exactly one output record.
    """
    author_id = ""
    chunk_number = ""
    try:
        row = json.loads(line)
        author_id = row.get("#AUTHID", "")
        chunk_number = row.get("Chunk Number", "")
        text_chunk = row.get("TEXT", "")
    except (json.JSONDecodeError, AttributeError) as e:
        # Blank / malformed line, or a JSON value that is not an object.
        print(f"[ERROR] Could not read input row {row_index}: {e}")
        return {
            "author_id": author_id,
            "chunk_number": chunk_number,
            "model_output": {"error": "Invalid input row", "exception": str(e)},
        }

    print(f"\n[PROCESSING] Row {row_index} => Author: {author_id}, Chunk: {chunk_number}")

    try:
        # Get programmatic features for this chunk
        programmatic_features = features_lookup.get(
            (author_id, chunk_number), "No programmatic features found."
        )
        raw_json = classify_text_chunk(text_chunk, programmatic_features)
        print("[RAW MODEL OUTPUT]\n", raw_json)

        # Parse the JSON to verify it's valid
        try:
            model_output = json.loads(raw_json)
        except json.JSONDecodeError as e:
            print(f"[ERROR] Could not parse JSON for row {row_index}: {e}")
            model_output = {"error": "Invalid JSON", "exception": str(e)}
    except Exception as e:
        # Deliberately broad: any failure of the model call is recorded for
        # this row so that one bad request does not abort the whole run.
        print(f"[ERROR] In call to model: {e}")
        model_output = {"error": str(e)}

    return {
        "author_id": author_id,
        "chunk_number": chunk_number,
        "model_output": model_output,
    }


def main():
    """Classify the chunks of the input file and append one record per chunk.

    The input file is read line by line, each line being a JSON object with
    the keys "#AUTHID", "Chunk Number" and "TEXT". For every line one JSON
    record (author id, chunk number, model output or error) is appended to the
    output file.

    The run is resumable: the number of lines already present in the output
    file is taken as the number of input lines already handled, and those
    lines are skipped. Processing stops once the output holds
    `total_lines_to_process` records. Error records count as handled lines,
    both when resuming and within a run, so the input index, the output line
    count and the progress bar always agree.

    Returns None; prints an error and returns early if an input file is
    missing.
    """
    input_json = "full_chunked_local_minima_pass_2_0.40.json"
    programmatic_features_file = "full_programmatic_features_extracted_updated_2.json"
    output_file = "cot_prog_features_big5-4o_temp_1.0.json"
    total_lines_to_process = 2000

    if not os.path.exists(input_json):
        print(f"[ERROR] Input file does not exist: {input_json}")
        return

    if not os.path.exists(programmatic_features_file):
        print(f"[ERROR] Programmatic features file does not exist: {programmatic_features_file}")
        return

    # Load programmatic features
    programmatic_features_lookup = load_programmatic_features(programmatic_features_file)

    # --- Check processed lines at the start of the script ---
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            processed_count = sum(1 for _ in f)
        print(f"[INFO] Found {processed_count} lines already processed.")
    else:
        processed_count = 0

    # Store results line-by-line (output opened in append mode)
    with open(output_file, "a", encoding="utf-8") as out_f, \
            open(input_json, "r", encoding="utf-8") as in_f, \
            tqdm(total=total_lines_to_process, desc="Processing Chunks", initial=processed_count) as pbar:
        for i, line in enumerate(in_f):
            # Exit loop if total lines to process is reached
            if processed_count >= total_lines_to_process:
                break
            # Skip lines that already have a record in the output file
            if i < processed_count:
                continue

            record = _build_record(i, line, programmatic_features_lookup)
            out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Flush so the record is handed to the OS right away and an
            # interrupted run loses as little finished work as possible.
            out_f.flush()
            processed_count += 1
            pbar.update(1)

    print(f"\n[DONE] Wrote results to {output_file}")

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
