In [None]:
import os

# WSL path to project directory
wsl_path = "/mnt/g/My Drive/Sophia/MSC thesis/final_datasets"

# Change current working directory to the WSL path.
# The input/output file names used further down in this file are relative,
# so they resolve against this directory. os.chdir raises an OSError
# (e.g. FileNotFoundError) if the directory is not available.
os.chdir(wsl_path)

# Print the current working directory to verify the change
print("Current working directory:", os.getcwd())


In [None]:
import os
import json
import openai
from tqdm.auto import tqdm
import time

###############################################################################
# CONFIGURE OPENAI CLIENT FOR V1.0+ USAGE
###############################################################################

# API key: taken from the OPENAI_API_KEY environment variable so the secret
# never has to be written into (and committed with) this file. The literal
# below is only a placeholder fallback; requests will be rejected until the
# environment variable is set or the placeholder is replaced with a real key.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI-API-KEY")

# Model name: identifier sent as `model` with every chat-completion request.
MODEL_NAME = "gpt-4o-2024-11-20"

###############################################################################
# TRAIT LIST AND PROMPT
###############################################################################

# The Big Five personality traits to classify. Each name is rendered as one
# "- <trait>" bullet line when the classification prompt is built.
TRAIT_LIST = [
    "Openness to Experience",
    "Conscientiousness",
    "Extroversion",
    "Agreeableness",
    "Neuroticism"
]

def load_semantic_features(filepath: str) -> dict:
    """Load semantic features from a JSON Lines file into a lookup dictionary.

    Every non-blank line of the file is parsed as one JSON object that is
    expected to carry ``author_id``, ``chunk_number`` and
    ``model_output.features``.

    Lines that cannot be used (invalid JSON, a record that is not an object,
    a missing key, or a ``model_output`` that is not an object) are reported
    on stdout and skipped, so a single bad record never aborts the load.

    Args:
        filepath: Path to the UTF-8 encoded JSON Lines file.

    Returns:
        A dict mapping ``(author_id, chunk_number)`` to that record's
        ``features`` value. If a key occurs more than once, the last
        occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                print(f"Problematic line: {line.strip()}")
                continue

            if not isinstance(record, dict):
                print(f"Warning: Skipping record that is not a JSON object: {record}")
                continue

            try:
                key = (record['author_id'], record['chunk_number'])
            except KeyError as e:
                # Without both id fields the record cannot be looked up later.
                print(f"Warning: Missing {e} in record: {record}")
                continue

            # Guard with isinstance: on a string, `in` would be a substring
            # test and the subsequent indexing would raise TypeError.
            model_output = record.get('model_output')
            if isinstance(model_output, dict) and 'features' in model_output:
                data[key] = model_output['features']
            else:
                print(f"Warning: Missing 'model_output' or 'features' in record: {record}")
    return data

def build_prompt_for_all_traits(
    text_chunk: str,
    semantic_features_list: list,
    trait_list: "list[str] | None" = None,
) -> str:
    """
    Build the prompt requesting classification of all traits in a single JSON response.

    The prompt contains the trait names, the required JSON output structure,
    the rules, the text itself and a bulleted rendering of its semantic
    features.

    Args:
        text_chunk: The text to classify. It is inserted verbatim; values
            substituted into an f-string are not re-parsed, so braces in the
            text need (and get) no escaping.
        semantic_features_list: Feature dicts, each providing the keys
            ``feature``, ``reasoning_steps`` (a list of steps, or a single
            string), ``result``, ``result_justification``,
            ``confidence_score`` and ``confidence_score_justification``.
            May be empty.
        trait_list: Trait names to list in the prompt. Defaults to the
            module-level ``TRAIT_LIST`` when omitted.

    Returns:
        The prompt with leading and trailing whitespace stripped.

    Raises:
        KeyError: If a feature dict lacks one of the required keys.
    """
    traits = TRAIT_LIST if trait_list is None else trait_list
    traits_str = "\n".join(f"- {t}" for t in traits)

    # Render each feature as a bullet with indented sub-bullets, separated by
    # real newlines (not the two-character sequence backslash + "n").
    feature_lines = []
    for feature in semantic_features_list:
        feature_name = feature['feature']
        steps = feature['reasoning_steps']
        if isinstance(steps, str):
            # A bare string would otherwise be joined character by character.
            steps = [steps]
        reasoning_steps = "\n".join(str(step) for step in steps)
        result = feature['result']
        result_justification = feature['result_justification']
        confidence_score = feature['confidence_score']
        confidence_score_justification = feature['confidence_score_justification']

        feature_lines.extend([
            f"  - **{feature_name}:**",
            f"    - Reasoning Steps: {reasoning_steps}",
            f"    - Result: {result}",
            f"    - Result Justification: {result_justification}",
            f"    - Confidence Score: {confidence_score}",
            f"    - Confidence Score Justification: {confidence_score_justification}",
        ])
    semantic_features_str = "".join(f"{line}\n" for line in feature_lines)

    # Doubled braces below are f-string escapes for the literal JSON braces
    # of the example structure shown to the model.
    prompt = f"""You are an intelligent and disciplined assistant trained to determine
    the presence or absence of each of the Big Five personality traits in
    a stream-of-consciousness text. The traits are: {traits_str}

    **You will be provided with the text and semantic features extracted from the text. Consider these inputs in your analysis.**

    **Your task is to reason through each trait step-by-step, explaining how the text and the relevant semantic features provide evidence (or lack thereof) for each trait before determining the final result and confidence score.**

    **Your output must be valid JSON** with the structure:

    {{
      "traits": [
        {{
          "trait": "Openness to Experience",
          "reasoning_steps": [
            "..."
          ],
          "result": "y|n|NaN",
          "result_justification": "...",
          "confidence_score": 0.0,
          "confidence_score_justification": "..."
        }},
        {{
          "trait": "Conscientiousness",
          "reasoning_steps": [
            "..."
          ],
          "result": "y|n|NaN",
          "result_justification": "...",
          "confidence_score": 0.0,
          "confidence_score_justification": "..."
        }},
        ...
      ]
    }}

    Rules:
    1. For each trait, reason step-by-step as described, then provide the final evaluation in the JSON format.
    2. Consider the provided semantic features in your analysis and reasoning.
    3. If insufficient data, set 'result'='NaN', 'result_justification'='NaN',
       'confidence_score'=0.0, 'confidence_score_justification'='NaN'.
    4. No extra text, no code fences, no keys beyond what is shown.
    5. The 'result' must be 'y', 'n', or 'NaN'.
    6. Provide detailed justifications for the reasoning, results, and confidence scores.

    The text: {text_chunk}

    The semantic features: {semantic_features_str}
    """
    return prompt.strip()

def classify_text_chunk(text_chunk: str, semantic_features: list) -> str:
    """
    Ask the model to classify one text chunk and return its raw reply.

    Builds the all-traits prompt from the chunk and its semantic features,
    sends it through the v1.0+ ``openai.chat.completions.create`` method with
    ``response_format`` requesting a JSON object, and returns the message
    content of the first choice without parsing it.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful, disciplined assistant that outputs JSON only.",
    }
    user_message = {
        "role": "user",
        "content": build_prompt_for_all_traits(text_chunk, semantic_features),
    }

    completion = openai.chat.completions.create(
        model=MODEL_NAME,
        messages=[system_message, user_message],
        response_format={"type": "json_object"},
        temperature=1.0,
        max_tokens=2000,
    )

    # Only the first choice is used; its content is handed back unparsed.
    first_choice = completion.choices[0]
    return first_choice.message.content

###############################################################################
# MAIN LOGIC: READ LINES, CALL MODEL, SAVE RESULTS INCREMENTALLY
###############################################################################

def _count_lines(path: str) -> int:
    """Return the number of lines in the UTF-8 text file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def _write_record(out_f, record: dict) -> None:
    """Append *record* as one JSON line and flush so progress survives a crash."""
    out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
    out_f.flush()


def _process_row(row_index: int, line: str, semantic_features_lookup: dict) -> dict:
    """Classify one input line and return the output record for it.

    Per-row failures (unparsable input, model/API errors, unparsable model
    output) are printed and encoded in the record's ``model_output`` instead
    of being raised, so every input line yields exactly one record.
    """
    try:
        row = json.loads(line)
        if not isinstance(row, dict):
            raise ValueError("input row is not a JSON object")
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f"[ERROR] Could not parse input row {row_index}: {e}")
        return {
            "author_id": "",
            "chunk_number": "",
            "model_output": {"error": "Invalid input JSON", "exception": str(e)},
        }

    author_id = row.get("#AUTHID", "")
    chunk_number = row.get("Chunk Number", "")
    text_chunk = row.get("TEXT", "")

    print(f"\n[PROCESSING] Row {row_index} => Author: {author_id}, Chunk: {chunk_number}")

    try:
        sem_features_key = (author_id, chunk_number)
        semantic_features = semantic_features_lookup.get(sem_features_key)
        if semantic_features is None:
            # Make the fallback visible instead of silently prompting
            # without any semantic features.
            print(f"[WARN] No semantic features found for {sem_features_key}; using an empty list.")
            semantic_features = []

        raw_json = classify_text_chunk(text_chunk, semantic_features)
        print("[RAW MODEL OUTPUT]\n", raw_json)

        try:
            model_output = json.loads(raw_json)
        except json.JSONDecodeError as e:
            print(f"[ERROR] Could not parse JSON for row {row_index}: {e}")
            model_output = {"error": "Invalid JSON", "exception": str(e)}
    except Exception as e:
        # Per-row boundary: record the failure and keep the run going.
        print(f"[ERROR] In call to model: {e}")
        model_output = {"error": str(e)}

    return {
        "author_id": author_id,
        "chunk_number": chunk_number,
        "model_output": model_output,
    }


def main():
    """Classify input chunks with the model and append results incrementally.

    Reads the chunked input file (one JSON object per line), looks up each
    chunk's semantic features, calls the model and appends one JSON line per
    input line to the output file.

    The run is resumable: because every processed input line produces exactly
    one output line (a result or an error record), the number of lines
    already in the output file is the index of the next input line to
    process. Error records therefore count as processed, both when resuming
    and towards the ``total_lines_to_process`` limit.
    """
    input_json = "full_chunked_local_minima_pass_2_0.40.json"
    semantic_features_file = "semantic_feature_extraction/cot-structured_feature_extraction_openai_gpt-4o_temp_1.0.json"
    output_file = "big5_classification/big5_semantic_features_classification/semfeat_cot_big5_openai_gpt-4o_temp_1.0.json"
    total_lines_to_process = 2000

    if not os.path.exists(input_json):
        print(f"[ERROR] Input file does not exist: {input_json}")
        return

    if not os.path.exists(semantic_features_file):
        print(f"[ERROR] Semantic features file does not exist: {semantic_features_file}")
        return

    semantic_features_lookup = load_semantic_features(semantic_features_file)

    if os.path.exists(output_file):
        processed_count = _count_lines(output_file)
        print(f"[INFO] Found {processed_count} lines already processed.")
    else:
        processed_count = 0

    # Opening in append mode creates the file but not its parent directories.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "a", encoding="utf-8") as out_f, \
            open(input_json, "r", encoding="utf-8") as f, \
            tqdm(total=total_lines_to_process, desc="Processing Chunks", initial=processed_count) as pbar:
        for i, line in enumerate(f):
            if processed_count >= total_lines_to_process:
                break
            if i < processed_count:
                # Already handled in a previous run.
                continue

            record = _process_row(i, line, semantic_features_lookup)
            _write_record(out_f, record)
            # Advance for error records too, keeping the counter and the
            # progress bar equal to the number of lines written.
            processed_count += 1
            pbar.update(1)

    print(f"\n[DONE] Wrote results to {output_file}")

# Run the classification pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
