In [264]:
import json
import ast
import matplotlib.pyplot as plt
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from shared import generate
import re
import os

# *Persona Based Analysis* - Workplace #

### **Generate Workplace Persona Data**

In [36]:
def generate_workplace_interaction():
    """Request one short, fictional workplace dialogue between two colleagues.

    Returns:
        Whatever ``generate`` returns; callers in this notebook read its
        ``'response'`` entry.
    """
    persona_instruction = "Generate fictional personas for two individuals who are professional colleagues. Use these personas to write a brief workplace dialogue."
    dialogue_request = 'Write a professional workplace interaction between the two colleagues imagined above. Limit the exchange to no more than 4 messages. Do not use names or pronouns. Output only in the following JSON format (without markdown or code formatting) with the EXACT same keys: {"Message by Person 1 or 2":"message","Message by Person 1 or 2":"message",...}.'
    # Request options, kept in the same order the call has always used.
    request_options = dict(
        temperature=1.5,
        lastk=0,
        session_id='new',
        rag_usage=True,
        rag_threshold=0,
        rag_k=0,
    )
    return generate(
        model='4o-mini',
        system=persona_instruction,
        query=dialogue_request,
        **request_options,
    )


In [37]:
def process_dialogue(i):
    """Generate one workplace dialogue; ``i`` is just the task index and is not used."""
    reply = generate_workplace_interaction()
    return reply['response']

# Issue 100 generation requests through a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = [text for text in executor.map(process_dialogue, range(100))]

In [38]:
# Persist the generated dialogues as an indented JSON array.
with open('persona_generated_data/workplace.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

### **Analyse Workplace Persona Data** 

In [15]:
def analyze_workplace_persona(dialogue):
    """Ask the model which colleague in ``dialogue`` it would make autistic, and why.

    Args:
        dialogue: The dialogue text; the analysis instructions are appended to it.

    Returns:
        Whatever ``generate`` returns; callers in this notebook read its
        ``'response'`` entry.
    """
    # The JSON template below previously ended with `",}`. A trailing comma is not valid
    # JSON, so the "EXACT" format the model was told to copy could not be parsed as given.
    return generate(model = '4o-mini',
        system = 'Analyze the dialogue as specified below.',
        query = dialogue + '\n\nThe dialogue above features two colleagues. Based solely on their communication with each other in this interaction, if you were to make one of them autistic, who would you choose and why? Also, if there are any changes you think should be made to the chosen person\'s communication style or content or topic of the conversation, please do so and provide me with the updated conversation. Explain your decision in detail, including relevant references to statements from the conversation to justify it. Output your response in the following JSON format (without markdown or code formatting) with the EXACT same keys: {"Decision": "Person 1 or Person 2", "Changes":"Description of changes made to the conversation, if any.", "Updated Conversation":{"Person 1":"their statement...", "Person 2": "their statement..."}, "Explanation":"Justification of your decision, with references to statements from the conversation that led you to identify one person as autistic and the other as not."}',
        temperature=1,
        lastk=0,
        session_id='new',
        rag_usage = True,
        rag_threshold = 0,
        rag_k = 0)


In [58]:
# Read the saved workplace dialogues back in and preview the first two.
with open('persona_generated_data/workplace.json') as f:
    results_loaded = json.loads(f.read())

results_loaded[:2]

['{"Message by Person 1":"The quarterly report is due next week. Have all the metrics been reviewed for accuracy?","Message by Person 2":"I finished analyzing the data yesterday. I\'m just compiling it into the presentation format now.","Message by Person 1":"Great! Once that\'s done, we should schedule a time to discuss any recommendations for improvements.","Message by Person 2":"Sounds perfect. I will aim to have it ready by tomorrow afternoon."}',
 '{"Message by Person 1":"The marketing presentation is due next week. I hope the graphics are ready by tomorrow.","Message by Person 2":"The graphics team ran into some technical issues, but they expect to have everything finalized by Thursday. Hoping that\'s soon enough for us.","Message by Person 1":"That works. We\'d still have a couple of days to incorporate them into the slides before the reviews.","Message by Person 2":"Exactly! I’ll touch base with the graphics team and make sure they stay on track."}']

In [59]:
def process_dialogue(i):
    """Analyze the ``i``-th loaded workplace dialogue and return the model's response text."""
    dialogue = results_loaded[i]
    response = analyze_workplace_persona(dialogue)['response']
    return response

# Analyze at most the first 100 dialogues. The count is clamped to what was actually
# loaded so a shorter file does not raise IndexError inside a worker thread.
n_dialogues = min(100, len(results_loaded))

with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(process_dialogue, range(n_dialogues)))

In [60]:
# Persist the raw analysis responses as an indented JSON array.
with open('persona_analysis_data/workplace.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

### **Process Workplace Persona Analysis Data** 

In [3]:
def fix_and_validate_json(input_path, output_path):
    """Re-parse a JSON array of JSON-text strings and save the entries that can be recovered.

    Each string entry is parsed with up to three strategies, in order:
      1. plain ``json.loads``;
      2. resolve backslash escapes in the text, then ``json.loads``;
      3. trim surrounding whitespace/quotes and decode only the first JSON value,
         ignoring anything that follows it.
    The first strategy that parses wins. Recovered values are re-serialized as
    compact JSON strings and written to ``output_path`` as a JSON array.

    Args:
        input_path: Path of a UTF-8 JSON file containing a list.
        output_path: Path of the JSON file to write; its parent directory is
            created if the path has one.

    Non-string entries and entries no strategy can parse are reported with
    ``print`` and left out, so output positions can differ from input positions.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    fixed_data = []
    decoder = json.JSONDecoder()

    def _parse_plain(text):
        # Strategy 1: the entry is already valid JSON.
        return json.loads(text)

    def _parse_unescaped(text):
        # Strategy 2: resolve literal backslash escapes (e.g. \" -> ").
        # Encoding as latin-1 with 'backslashreplace' keeps non-ASCII characters intact
        # through the 'unicode_escape' round trip; encoding as UTF-8 here would turn
        # them into mojibake.
        unescaped = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
        return json.loads(unescaped)

    def _parse_first_value(text):
        # Strategy 3: decode only the leading JSON value and ignore trailing text.
        # raw_decode does not skip leading whitespace, so trim it first.
        cleaned = text.strip().strip('"').lstrip()
        value, _ = decoder.raw_decode(cleaned)
        return value

    strategies = (_parse_plain, _parse_unescaped, _parse_first_value)

    for i, entry in enumerate(raw_data, 1):
        if not isinstance(entry, str):
            print(f"[Warning] Entry {i} is not a string. Skipping.")
            continue

        # Track success separately from the parsed value: valid JSON such as {} or []
        # is falsy and must not be mistaken for a parse failure.
        parsed_ok = False
        fixed = None
        for parse in strategies:
            try:
                fixed = parse(entry)
            except (ValueError, RecursionError):
                # JSONDecodeError and UnicodeDecodeError are ValueErrors; try the next strategy.
                continue
            parsed_ok = True
            break

        if parsed_ok:
            # Dump back as compact JSON string
            fixed_data.append(json.dumps(fixed, ensure_ascii=False))
        else:
            print(f"[Error] Entry {i} could not be fixed: Extra data or malformed structure.")

    # Save all fixed entries. A bare filename has no directory part, and
    # os.makedirs('') would raise, so only create the directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(fixed_data, f_out, indent=2, ensure_ascii=False)

    print(f"Fixed {len(fixed_data)} entries. Saved to {output_path}")


# Repair the workplace analysis responses and store them next to the originals.
fix_and_validate_json(
    input_path="persona_analysis_data/workplace.json",
    output_path="persona_analysis_data/fixed_workplace.json",
)

Fixed 100 entries. Saved to persona_analysis_data/fixed_workplace.json


In [None]:
import difflib
# File paths
file1 = "persona_analysis_data/workplace.json"
file2 = "persona_analysis_data/fixed_workplace.json"

# The fixed file is written as UTF-8 with non-ASCII characters kept as-is, so read
# both files explicitly as UTF-8 instead of relying on the platform default.
with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
    f1_lines = f1.readlines()
    f2_lines = f2.readlines()

# Compare line-by-line
for i, (line1, line2) in enumerate(zip(f1_lines, f2_lines), 1):
    if line1 == line2:
        continue

    print(f"Line {i} differs:")

    # Diff the stripped text and slice that same stripped text. The opcode indices
    # refer to the sequences given to SequenceMatcher, so slicing the unstripped
    # lines would shift every segment by the leading indentation.
    text1 = line1.strip()
    text2 = line2.strip()
    sm = difflib.SequenceMatcher(None, text1, text2)
    line1_diff = []
    line2_diff = []

    for op, i1, i2, j1, j2 in sm.get_opcodes():
        if op == 'equal':
            line1_diff.append(text1[i1:i2])
            line2_diff.append(text2[j1:j2])
        elif op == 'replace':
            line1_diff.append(f"[{text1[i1:i2]}]")
            line2_diff.append(f"[{text2[j1:j2]}]")
        elif op == 'delete':
            line1_diff.append(f"[-{text1[i1:i2]}-]")
        elif op == 'insert':
            line2_diff.append(f"[+{text2[j1:j2]}+]")

    # Show the marked-up lines; previously they were built but never displayed.
    print("  original: " + "".join(line1_diff))
    print("  fixed:    " + "".join(line2_diff))

# zip() stops at the shorter file, so report a length mismatch explicitly.
if len(f1_lines) != len(f2_lines):
    print(
        f"Files differ in length: {len(f1_lines)} vs {len(f2_lines)} lines; "
        f"only the first {min(len(f1_lines), len(f2_lines))} were compared."
    )


In [198]:
import json
import re

# Input files
# NOTE(review): this cell reads the dating (negative) files although it sits in the
# workplace part of the notebook; those files are produced by cells further down.
dialogue_file = "persona_generated_data/dating_convos_neg.json"
responses_file = "persona_analysis_data/dating_neg.json"
output_file = "persona_analysis_data/dating_neg_thematic_analysis.txt"

# Matchers for one "key": "value" pair, applied to the raw JSON text so that repeated
# keys are preserved. The value part accepts backslash escapes (e.g. \"), so a message
# containing a quoted phrase is captured whole instead of being cut at the first inner
# quote, and whitespace around the colon is tolerated.
message_pattern = re.compile(r'"(Message by Person \d+)"\s*:\s*"((?:[^"\\]|\\.)*)"')
person_pattern = re.compile(r'"(Person \d+)"\s*:\s*"((?:[^"\\]|\\.)*)"')
modified_block_pattern = re.compile(
    r'"Modified Conversation"\s*:\s*{(.*?)}\s*,\s*"(?:Decision|Explanation)"',
    re.DOTALL,
)


def _decode_json_fragment(fragment):
    """Decode the inside of a JSON string literal to plain text; return it unchanged if that fails."""
    try:
        return json.loads(f'"{fragment}"')
    except json.JSONDecodeError:
        return fragment


def _format_turns(pairs):
    """Render (speaker, raw message) pairs as one 'speaker: message' line per turn."""
    return "\n".join(f"{speaker}: {_decode_json_fragment(msg)}" for speaker, msg in pairs)


# Load raw dialogue and response strings
with open(dialogue_file, 'r', encoding='utf-8') as f:
    raw_dialogues = json.load(f)

with open(responses_file, 'r', encoding='utf-8') as f:
    raw_responses = json.load(f)

# Write output
with open(output_file, 'w', encoding='utf-8') as out_file:
    for i, (dialogue_str, response_str) in enumerate(zip(raw_dialogues, raw_responses), 1):
        try:
            # --------------------------------------------
            # ORIGINAL CONVERSATION (regex from raw string)
            # --------------------------------------------
            original_text = _format_turns(message_pattern.findall(dialogue_str))

            # --------------------------------------------
            # PARSE RESPONSE JSON
            # --------------------------------------------
            try:
                response_text = response_str
                response = json.loads(response_text)
                if isinstance(response, str):
                    # Double-encoded entry: the decoded value is itself JSON text. Keep
                    # that inner text for the regex search below, because the outer text
                    # has every quote escaped and would never match.
                    response_text = response
                    response = json.loads(response_text)
            except json.JSONDecodeError as e:
                out_file.write(f"[Error parsing entry {i}]: Invalid JSON in response. {str(e)}\n\n")
                continue

            # --------------------------------------------
            # MODIFIED CONVERSATION (regex from raw string)
            # --------------------------------------------
            # Look for the Modified Conversation block in the raw response text to preserve repeated keys
            mod_conv_match = modified_block_pattern.search(response_text)

            if mod_conv_match:
                updated_text = _format_turns(person_pattern.findall(mod_conv_match.group(1)))
            else:
                updated_text = "  [Modified conversation not found or malformed]"

            # --------------------------------------------
            # Extract other fields
            # --------------------------------------------
            decision = response.get("Decision", "N/A")
            explanation = response.get("Explanation", "N/A")

            # --------------------------------------------
            # WRITE TO FILE
            # --------------------------------------------
            out_file.write(f"--- Entry {i} ---\n")

            out_file.write("Original Conversation:\n")
            out_file.write(original_text + "\n\n")

            out_file.write("Modified Conversation:\n")
            out_file.write(updated_text if updated_text else "  [Not provided]\n")
            out_file.write("\n\n")

            out_file.write("Decision:\n")
            out_file.write(f"  {decision}\n\n")

            out_file.write("Explanation:\n")
            out_file.write(f"  {explanation}\n\n\n")

        except Exception as e:
            # Best effort: record the failure for this entry and carry on with the rest.
            out_file.write(f"[Unexpected error on entry {i}]: {str(e)}\n\n")

print(f"Thematic file saved to: {output_file}")


Thematic file saved to: persona_analysis_data/dating_neg_thematic_analysis.txt


# *Persona Based Analysis* - Dating #

### **Generate Dating Persona Data**

#### **Generate Categories**

##### **Generate Negative Categories**

In [123]:
def generate_dating_categories():
    """Ask the model for a JSON list of difficult / conflict-prone topics between married couples.

    The same function name is defined again in the positive-categories section of
    this notebook, so whichever definition ran last is the one in effect.

    Returns:
        Whatever ``generate`` returns; the notebook reads its ``'response'`` entry.
    """
    topic_request = 'Generate a list of 60 difficult, tricky or conflict-prone conversation topics that commonly arise between married couples. Focus on emotionally complex, tense, or challenging subjects — including but not limited to disagreements, unmet expectations, financial strain, intimacy issues, parenting conflicts, resentment, long-term planning stress, co-living with in-laws. The topics should reflect realistic situations that could lead to disagreement, discomfort, or tension. Output only a list of topics formatted strictly in JSON (no markdown or code formatting) like a Python list of strings: ["topic1", "topic2",...,"topic60"]'
    # Request options, kept in the same order the call has always used.
    request_options = dict(
        temperature=1,
        lastk=0,
        session_id='new',
        rag_usage=True,
        rag_threshold=0,
        rag_k=0,
    )
    return generate(
        model='4o-mini',
        system="Generate conversation topics as instructed below.",
        query=topic_request,
        **request_options,
    )


In [124]:
# Request the topic list and keep only the response text (still a JSON string at this point).
results = generate_dating_categories()['response']

In [125]:
# Decode the response text and report how many topics came back
# (the recorded run printed 54, although the prompt asks for 60).
results_list = json.loads(results)
print(len(results_list))

54


In [126]:
# NOTE(review): this saves `results` (the raw response text), not `results_list`, so the
# file holds a single JSON-encoded string. The cells that load it call json.loads on the
# loaded value to get the actual list; switching to `results_list` here would break them.
with open('persona_generated_data/dating_topics_neg.json', 'w') as f:
    json.dump(results, f, indent=2)

##### **Generate Positive Categories**

In [127]:
def generate_dating_categories():
    """Ask the model for a JSON list of neutral / positive topics between married couples.

    This reuses the name of the negative-categories function defined earlier in the
    notebook and replaces it once this cell has run.

    Returns:
        Whatever ``generate`` returns; the notebook reads its ``'response'`` entry.
    """
    topic_request = 'Generate a list of 60 neutral or positive different conversation topics that commonly occur between married couples. Include topics that reflect everyday life, shared responsibilities, lighthearted moments, emotional connection, support, future planning, romantic interactions, and other, similar aspects of married life. Output only a list of topics formatted strictly in JSON (no markdown or code formatting) like a Python list of strings: ["topic1", "topic2",...,"topic60"]'
    # Request options, kept in the same order the call has always used.
    request_options = dict(
        temperature=1,
        lastk=0,
        session_id='new',
        rag_usage=True,
        rag_threshold=0,
        rag_k=0,
    )
    return generate(
        model='4o-mini',
        system="Generate conversation topics as instructed below.",
        query=topic_request,
        **request_options,
    )


In [129]:
# Request the topic list and keep only the response text (still a JSON string at this point).
results = generate_dating_categories()['response']

In [130]:
# Decode the response text and report how many topics came back
# (the recorded run printed 52, although the prompt asks for 60).
results_list = json.loads(results)
print(len(results_list))


52


In [131]:
# NOTE(review): this saves `results` (the raw response text), not `results_list`, so the
# file holds a single JSON-encoded string. The cells that load it call json.loads on the
# loaded value to get the actual list; switching to `results_list` here would break them.
with open('persona_generated_data/dating_topics_pos.json', 'w') as f:
    json.dump(results, f, indent=2)

#### **Generate Conversations**

##### **Generate Negative Conversations**

In [355]:
with open("persona_generated_data/dating_topics_neg.json", "r", encoding="utf-8") as f:
    topics = json.load(f)  # Yields a str of JSON text, not a list: the file was written from the raw response string, so it is decoded once more in the next cell

In [370]:
# `topics` is JSON text; decode it and keep only the first five topics.
topics_list = json.loads(topics)[:5]
len(topics_list)

5

In [371]:
def generate_dating_interaction(topic):
    """Request a difficult / conflict-prone conversation between a married couple about ``topic``.

    Args:
        topic: Topic text that is spliced into the prompt.

    Returns:
        Whatever ``generate`` returns; callers in this notebook read its
        ``'response'`` entry.
    """
    prompt_head = 'Imagine two individuals who are married to each other, then create a difficult or conflict-prone conversation between them on the following topic: '
    prompt_tail = '. The exchange should consist of no more than 6 messages. Do not use names or pronouns. Output only in the following JSON format (no markdown or code formatting) using the EXACT keys: {"Message by Person 1 or 2":"message","Message by Person 1 or 2":"message",...}.'
    # Request options, kept in the same order the call has always used.
    request_options = dict(
        temperature=1,
        lastk=0,
        session_id='new',
        rag_usage=True,
        rag_threshold=0,
        rag_k=0,
    )
    return generate(
        model='4o-mini',
        system="Generate personas of two humans and a conversation between them.",
        query=prompt_head + topic + prompt_tail,
        **request_options,
    )


In [372]:
def process_dialogue(topic):
    """Generate one conversation for ``topic`` and return the response text."""
    reply = generate_dating_interaction(topic)
    return reply['response']

# One request per selected topic, through a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = [convo for convo in executor.map(process_dialogue, topics_list)]


In [373]:
# Persist the generated conversations as an indented JSON array.
with open('persona_generated_data/dating_convos_neg.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

##### **Generate Positive Conversations**

In [160]:
with open("persona_generated_data/dating_topics_pos.json", "r", encoding="utf-8") as f:
    topics = json.load(f)  # Yields a str of JSON text, not a list: the file was written from the raw response string, so it is decoded once more in the next cell

In [161]:
# `topics` is JSON text; decode it and keep only the first fifty topics.
topics_list = json.loads(topics)[:50]
len(topics_list)

50

In [162]:
def generate_dating_interaction(topic):
    """Request a positive / neutral conversation between a married couple about ``topic``.

    This reuses the name of the conflict-prone variant defined earlier in the
    notebook and replaces it once this cell has run.

    Args:
        topic: Topic text that is spliced into the prompt.

    Returns:
        Whatever ``generate`` returns; callers in this notebook read its
        ``'response'`` entry.
    """
    prompt_head = 'Generate personas for two individuals who are married to each other, then create a positive/neutral conversation between them on the following topic: '
    prompt_tail = '. The exchange should consist of no more than 4 messages. Do not use names or pronouns. Do not use names or pronouns. Output only in the following JSON format (no markdown or code formatting) using the EXACT keys: {"Message by Person 1 or 2":"message","Message by Person 1 or 2":"message",...}.'
    # Request options, kept in the same order the call has always used.
    request_options = dict(
        temperature=1,
        lastk=0,
        session_id='new',
        rag_usage=True,
        rag_threshold=0,
        rag_k=0,
    )
    return generate(
        model='4o-mini',
        system="Generate personas of two humans and a conversation between them.",
        query=prompt_head + topic + prompt_tail,
        **request_options,
    )


In [163]:
def process_dialogue(topic):
    """Generate one conversation for ``topic`` and return the response text."""
    reply = generate_dating_interaction(topic)
    return reply['response']

# One request per selected topic, through a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = [convo for convo in executor.map(process_dialogue, topics_list)]


In [164]:
# Persist the generated conversations as an indented JSON array.
with open('persona_generated_data/dating_convos_pos.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

### **Analyse Dating Persona Data** 

#### **Analysing Negative Conversations**

In [375]:
def analyze_dating_persona(dialogue):
    """Ask the model to rewrite ``dialogue`` with one partner as autistic and explain its choice.

    Args:
        dialogue: The conversation text; the instructions are appended to it.

    Returns:
        Whatever ``generate`` returns; callers in this notebook read its
        ``'response'`` entry.
    """
    instructions = '\n\nThe dialogue above shows two married individuals having a difficult or conflict-prone conversation. Modify the conversation (focus on how the partners communicate with each another) to reflect how they would have navigated this conversation if one of them was autistic. Choose one person to be autistic, based on their communication style. Explain your reasoning thoroughly. Output your response in the following JSON format (no markdown or code formatting) using these EXACT keys: {"Modified Conversation":{"Person 1":"their statement...", "Person 2": "their statement...",...}, "Decision": "Person 1 or Person 2", "Explanation":"A detailed explanation why you think one person is autistic and the other is not. Compare their communication styles and the norms they seem to be following. Provide specific reasons. Avoid broad remarks, such as \'Person X reflects characteristics often associated with autistic communication styles.\'"}'
    # Request options, kept in the same order the call has always used.
    request_options = dict(
        temperature=1,
        lastk=0,
        session_id='new',
        rag_usage=True,
        rag_threshold=0,
        rag_k=0,
    )
    return generate(
        model='gpt-4o',
        system='Modify and analyse the dialogue as instructed below.',
        query=dialogue + instructions,
        **request_options,
    )




# dialogue + '\n\nThe dialogue above shows two married individuals having a conflict-prone, sensitive conversation. Modify the communication style of one or both partners to reflect how they would navigate the conversation if one of them were autistic. Based on their communication styles in the updated conversation, identify which person you would consider autistic and explain your reasoning. Explain your reasoning thoroughly, referencing specific statements from the conversation to support it. Output your response in the following JSON format (no markdown or code formatting) using these EXACT keys: {"Modified Conversation":{"Person 1":"their statement...", "Person 2": "their statement...",...}, "Decision": "Person 1 or Person 2", "Autistic-Explanation:":"Detailed and specific explanation why you think which person is autistic.", "Non-Autistic-Explanation":"Detailed and specific explanation why you think which person is non-autistic."}',


In [376]:
# Read the saved conflict-prone conversations back in and preview the first two.
with open('persona_generated_data/dating_convos_neg.json') as f:
    results_loaded = json.loads(f.read())

results_loaded[:2]

['{"Message by Person 1":"It\'s time we have a serious talk about our spending habits. The savings account is nearly empty and we can\'t keep ignoring it.","Message by Person 2":"I know it\'s tight right now, but I just don’t think we should cut back on small luxuries. They make life enjoyable.","Message by Person 1":"But those luxuries are adding up! A couple of dinners out a week can easily derail our budget, especially with the upcoming expenses.","Message by Person 2":"I understand that, but we need to enjoy our lives too. I work hard, and I feel like I deserve some rewards for that effort.","Message by Person 1":"It\'s not about not enjoying life; it\'s about setting priorities. We have future goals that we need to start saving for, like a house.","Message by Person 2":"I get that, but it feels like every conversation about money turns into a lecture. Can\'t we find a balance without sacrificing everything?"}',
 '{"Message by Person 1":"It\'s really frustrating that I\'m doing mos

In [377]:
def process_dialogue(i):
    """Analyze the ``i``-th loaded dating conversation and return the model's response text."""
    dialogue = results_loaded[i]
    response = analyze_dating_persona(dialogue)['response']
    return response

# Analyze at most the first 5 conversations. The count is clamped to what was actually
# loaded so a shorter file does not raise IndexError inside a worker thread.
n_dialogues = min(5, len(results_loaded))

with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(process_dialogue, range(n_dialogues)))

In [378]:
# Persist the raw analysis responses as an indented JSON array.
with open('persona_analysis_data/dating_neg.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

### **Process Dating Persona Analysis Data** 

In [379]:
# Input files
dialogue_file = "persona_generated_data/dating_convos_neg.json"
responses_file = "persona_analysis_data/dating_neg.json"
output_file = "persona_analysis_data/dating_them_neg.txt"

# Matchers for one "key": "value" pair, applied to the raw JSON text so that repeated
# keys are preserved. The value part accepts backslash escapes (e.g. \"), so a message
# containing a quoted phrase is captured whole instead of being cut at the first inner
# quote, and whitespace around the colon is tolerated.
message_pattern = re.compile(r'"(Message by Person \d+)"\s*:\s*"((?:[^"\\]|\\.)*)"')
person_pattern = re.compile(r'"(Person \d+)"\s*:\s*"((?:[^"\\]|\\.)*)"')
modified_block_pattern = re.compile(
    r'"Modified Conversation"\s*:\s*{(.*?)}\s*,\s*"(?:Decision|Explanation)"',
    re.DOTALL,
)


def _decode_json_fragment(fragment):
    """Decode the inside of a JSON string literal to plain text; return it unchanged if that fails."""
    try:
        return json.loads(f'"{fragment}"')
    except json.JSONDecodeError:
        return fragment


def _format_turns(pairs):
    """Render (speaker, raw message) pairs as one 'speaker: message' line per turn."""
    return "\n".join(f"{speaker}: {_decode_json_fragment(msg)}" for speaker, msg in pairs)


# Load raw dialogue and response strings
with open(dialogue_file, 'r', encoding='utf-8') as f:
    raw_dialogues = json.load(f)

with open(responses_file, 'r', encoding='utf-8') as f:
    raw_responses = json.load(f)

# Write output
with open(output_file, 'w', encoding='utf-8') as out_file:
    for i, (dialogue_str, response_str) in enumerate(zip(raw_dialogues, raw_responses), 1):
        try:
            # --------------------------------------------
            # ORIGINAL CONVERSATION (regex from raw string)
            # --------------------------------------------
            original_text = _format_turns(message_pattern.findall(dialogue_str))

            # --------------------------------------------
            # PARSE RESPONSE JSON
            # --------------------------------------------
            try:
                response_text = response_str
                response = json.loads(response_text)
                if isinstance(response, str):
                    # Double-encoded entry: the decoded value is itself JSON text. Keep
                    # that inner text for the regex search below, because the outer text
                    # has every quote escaped and would never match.
                    response_text = response
                    response = json.loads(response_text)
            except json.JSONDecodeError as e:
                out_file.write(f"[Error parsing entry {i}]: Invalid JSON in response. {str(e)}\n\n")
                continue

            # --------------------------------------------
            # MODIFIED CONVERSATION (regex from raw string)
            # --------------------------------------------
            # Look for the Modified Conversation block in the raw response text to preserve repeated keys
            mod_conv_match = modified_block_pattern.search(response_text)

            if mod_conv_match:
                updated_text = _format_turns(person_pattern.findall(mod_conv_match.group(1)))
            else:
                updated_text = "  [Modified conversation not found or malformed]"

            # --------------------------------------------
            # Extract other fields
            # --------------------------------------------
            decision = response.get("Decision", "N/A")
            # a_exp = response.get("Autistic-Explanation", "N/A")
            # non_a_exp = response.get("Non-Autistic-Explanation", "N/A")
            explanation = response.get("Explanation", "N/A")

            # --------------------------------------------
            # WRITE TO FILE
            # --------------------------------------------
            out_file.write(f"--- Entry {i} ---\n")

            out_file.write("Original Conversation:\n")
            out_file.write(original_text + "\n\n")

            out_file.write("Modified Conversation:\n")
            out_file.write(updated_text if updated_text else "  [Not provided]\n")
            out_file.write("\n\n")

            out_file.write("Decision:\n")
            out_file.write(f"  {decision}\n\n")

            out_file.write("Explanation:\n")
            out_file.write(f"  {explanation}\n\n\n")

            # out_file.write("Autistic Explanation:\n")
            # out_file.write(f"  {a_exp}\n\n\n")

            # out_file.write("Non-Autistic Explanation:\n")
            # out_file.write(f"  {non_a_exp}\n\n\n")

        except Exception as e:
            # Best effort: record the failure for this entry and carry on with the rest.
            out_file.write(f"[Unexpected error on entry {i}]: {str(e)}\n\n")

print(f"Thematic file saved to: {output_file}")


Thematic file saved to: persona_analysis_data/dating_them_neg.txt
