In [None]:
import os

# Project directory, expressed as a path inside WSL
wsl_path = "/mnt/g/My Drive/Sophia/MSC thesis"

# Make the project directory the working directory so that relative
# file names used afterwards are resolved against it
os.chdir(wsl_path)

# Echo the resulting working directory so the switch can be checked by eye
print(f"Current working directory: {os.getcwd()}")


In [None]:
import json
import openai
from tqdm.auto import tqdm
import time

###############################################################################
# CONFIGURE OPENAI CLIENT FOR V1.0+ USAGE
###############################################################################

# API key:
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# written into (and possibly committed with) this file. The original
# placeholder is kept as the fallback, so behaviour is unchanged when the
# variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI-API-KEY")

# Model name (pinned, dated snapshot passed to every chat-completion request)
MODEL_NAME = "gpt-4o-2024-11-20"

###############################################################################
# FEATURE LIST AND PROMPT STRUCTURE
###############################################################################

# Psycholinguistic features the model is asked to evaluate, in prompt order.
FEATURE_LIST = [
    "Cognitive Flexibility",
    "Narrative and Discourse Coherence",
    "Emotional Tone",
    "Self-Reflection Depth",
    "Analytical Thinking",
]

def build_prompt_for_all_features(text_chunk: str) -> str:
    """
    Build the chain-of-thought prompt that asks for every feature at once.

    The prompt lists all entries of FEATURE_LIST, shows the JSON structure
    the model must return, states the output rules, and ends with the text
    to analyse.

    Args:
        text_chunk: The raw text to classify. Surrounding whitespace is
            removed; everything else is passed through unchanged.

    Returns:
        The complete prompt, with leading and trailing whitespace stripped.
    """
    features_str = "\n".join(f"- {f}" for f in FEATURE_LIST)

    # The chunk is inserted through an f-string replacement field, and
    # interpolated values are never re-scanned for further fields. Braces in
    # the text therefore need no escaping: doubling them (as one would for a
    # str.format template) would send literal "{{" / "}}" to the model and
    # alter the text being analysed.
    text_for_prompt = text_chunk.strip()

    prompt = f"""CoT Semantic Feature Extraction Prompt

You are an intelligent and disciplined assistant trained to determine the presence and degree of various nuanced psycholinguistic features in a stream-of-consciousness text. The features are:
{features_str}.

*Your task is to reason through each feature step-by-step, explaining how the text provides evidence (or lack thereof) for each feature before determining the final result and confidence score.*

*Your output must be valid JSON* with the structure:

{{
  "features": [
    {{
      "feature": "Cognitive Flexibility",
      "reasoning_steps": [
        "..."
      ],
      "result": "...",
      "result_justification": "...",
      "confidence_score": 0.0,
      "confidence_score_justification": "..."
    }},
    {{
      "feature": "Narrative and Discourse Coherence",
      "reasoning_steps": [
        "..."
      ],
      "result": "...",
      "result_justification": "...",
      "confidence_score": 0.0,
      "confidence_score_justification": "..."
    }},
    ...
  ]
}}

Rules:
1. For each feature, reason step-by-step as described, then provide the final evaluation in the JSON format.
2. If insufficient data, set 'result'='Not able to evaluate', 'result_justification'='Not able to evaluate', 'confidence_score'=0.0, 'confidence_score_justification'='Not able to evaluate'.
3. No extra text, no code fences, no keys beyond what's shown.
4. Provide detailed justifications for the reasoning, results, and confidence scores.

The text: {text_for_prompt}
"""
    return prompt.strip()

def classify_text_chunk(text_chunk: str) -> str:
    """
    Send one text chunk to the chat-completions endpoint and return the reply.

    The user message is the prompt produced by build_prompt_for_all_features;
    the request asks for a JSON-object response from MODEL_NAME.

    Returns the message content of the first choice, exactly as received
    (no parsing or validation happens here).
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful, disciplined assistant that outputs JSON only.",
    }
    user_message = {
        "role": "user",
        "content": build_prompt_for_all_features(text_chunk),
    }

    # Request settings gathered in one place before the call
    request_options = {
        "model": MODEL_NAME,
        "messages": [system_message, user_message],
        "response_format": {"type": "json_object"},
        "temperature": 1.0,
        "max_tokens": 1500,
    }
    completion = openai.chat.completions.create(**request_options)

    first_choice = completion.choices[0]
    return first_choice.message.content

###############################################################################
# MAIN LOGIC: READ LINES, CALL MODEL, SAVE RESULTS INCREMENTALLY
###############################################################################

def main(
    input_json: str = "full_chunked_local_minima_pass_2_0.40.json",
    output_file: str = "cot-structured_feature_extraction_openai_gpt-4o_temp_1.0.json",
    total_lines_to_process: int = 2000,
) -> None:
    """
    Classify input chunks one by one and append each result to the output file.

    The input is read line by line, each line being one JSON object from
    which the keys "#AUTHID", "Chunk Number" and "TEXT" are taken. For every
    input row handled, exactly one JSON line is appended to the output:
    {"author_id", "chunk_number", "model_output"}, where "model_output" is
    the parsed model reply or an {"error": ...} object if the call failed or
    the reply was not valid JSON.

    Resuming: input row i always corresponds to output line i, so a rerun
    counts the lines already in the output file and skips that many input
    rows. Error records count as processed and are not retried.

    Args:
        input_json: Path of the line-delimited JSON input file.
        output_file: Path of the line-delimited JSON output file (appended to).
        total_lines_to_process: Stop once this many rows, including rows from
            earlier runs, have a record in the output file.
    """
    if not os.path.exists(input_json):
        print(f"[ERROR] Input file does not exist: {input_json}")
        return

    # Check previously processed lines
    processed_count = 0
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            processed_count = sum(1 for _ in f)
        print(f"[INFO] Found {processed_count} lines already processed.")

    # Fixed number of leading input rows to skip; kept separate from the
    # running counter so the skip test cannot drift while the counter grows.
    already_done = processed_count

    # Process lines and save results incrementally
    with open(output_file, "a", encoding="utf-8") as out_f, \
            open(input_json, "r", encoding="utf-8") as in_f:
        with tqdm(total=total_lines_to_process, desc="Processing Chunks", initial=processed_count) as pbar:
            for i, line in enumerate(in_f):
                if processed_count >= total_lines_to_process:
                    break
                if i < already_done:
                    continue

                row = json.loads(line)
                author_id = row.get("#AUTHID", "")
                chunk_number = row.get("Chunk Number", "")
                text_chunk = row.get("TEXT", "")

                print(f"\n[PROCESSING] Row {i} => Author: {author_id}, Chunk: {chunk_number}")

                try:
                    raw_json = classify_text_chunk(text_chunk)
                    print("[RAW MODEL OUTPUT]\n", raw_json)

                    try:
                        parsed = json.loads(raw_json)
                    except json.JSONDecodeError as e:
                        print(f"[ERROR] Could not parse JSON for row {i}: {e}")
                        parsed = {"error": "Invalid JSON", "exception": str(e)}
                except Exception as e:
                    # Deliberately broad: any failure of a single request is
                    # recorded for that row so the batch can keep going.
                    print(f"[ERROR] In call to model: {e}")
                    parsed = {"error": str(e)}

                record = {
                    "author_id": author_id,
                    "chunk_number": chunk_number,
                    "model_output": parsed,
                }
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                # Flush per row so an interrupted run loses at most the
                # record currently being written.
                out_f.flush()

                # Every written line counts, success or error. This keeps the
                # in-run limit and the progress bar consistent with the
                # line count used when resuming.
                processed_count += 1
                pbar.update(1)

    print(f"\n[DONE] Wrote results to {output_file}")

# Run the extraction only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()