# COMPSCI703 Project Code
- Author: Xiaolin Zhao
- Environment: Google Colab (with OpenAI API)
- Dataset: Social IQA


In [None]:
from google.colab import drive
# Attach Google Drive to the Colab runtime at a fixed mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import tarfile

# Unpack the Social IQA archive stored on the mounted Drive into a local folder.
ARCHIVE_PATH = "/content/drive/MyDrive/social_iqa/socialIQa_v1.4.tgz"
EXTRACT_DIR = "social_iqa_data"

with tarfile.open(ARCHIVE_PATH, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The "data" filter refuses members that would be written outside
        # EXTRACT_DIR (absolute paths, ".." components, unsafe links).
        tar.extractall(EXTRACT_DIR, filter="data")
    else:
        # Interpreters without extraction filters: fall back to plain extraction.
        tar.extractall(EXTRACT_DIR)


In [None]:
import os

# List the extracted files as a quick sanity check (displayed as the cell result).
DATA_DIR = "social_iqa_data"
os.listdir(DATA_DIR)


['socialIQa_v1.4_dev.jsonl',
 'socialIQa_v1.4_tst.jsonl',
 'socialIQa_v1.4_trn.jsonl',
 'LICENCE-CC-BY-4.txt',
 'README.md']

In [None]:
import pandas as pd

# Load the dev split (JSON Lines: one record per line) and preview the first rows.
DEV_SPLIT_PATH = "social_iqa_data/socialIQa_v1.4_dev.jsonl"
dev_df = pd.read_json(DEV_SPLIT_PATH, lines=True)
dev_df.head()


Unnamed: 0,context,question,answerA,answerB,answerC,correct
0,Tracy didn't go home that evening and resisted...,What does Tracy need to do before this?,make a new plan,Go home and see Riley,Find somewhere to go,C
1,Sydney walked past a homeless woman asking for...,How would you describe Sydney?,sympathetic,like a person who was unable to help,incredulous,A
2,Sasha protected the patients' rights by making...,What will patients want to do next?,write new laws,get petitions signed,live longer,B
3,Jordan was in charge of taking the food on the...,How would Jordan feel afterwards?,horrible that he let his friends down on the c...,happy that he doesn't need to do the cooking o...,very proud and accomplished about the camping ...,A
4,Kendall opened their mouth to speak and what c...,How would you describe Kendall?,a very quiet person,a very passive person,a very aggressive and talkative person,C


In [None]:
def _format_question(row):
    """Render one item's question, context and options, ending with the CoT cue line."""
    return (
        f"Q: {row['question']}\n"
        f"Context: {row['context']}\n"
        f"A. {row['answerA']}\n"
        f"B. {row['answerB']}\n"
        f"C. {row['answerC']}\n"
        "Let's think step by step.\n"
    )


def build_cot_prompt(df, n_shot=3, test_idx=5):
    """Build a few-shot chain-of-thought prompt for one test item.

    The first ``n_shot`` rows of ``df`` (by position) are used as solved
    exemplars, each with a hand-written rationale and its gold answer. The row
    at position ``test_idx`` is appended as the open question, ending with the
    "Let's think step by step." cue.

    Args:
        df: DataFrame with the columns ``question``, ``context``, ``answerA``,
            ``answerB``, ``answerC`` and ``correct``.
        n_shot: Number of leading rows to include as solved exemplars.
        test_idx: Position (``iloc``) of the row to ask about.

    Returns:
        The complete prompt as a single string.

    Raises:
        ValueError: If the test row is also one of the exemplar rows, because
            the prompt would then contain the answer to its own question.
    """
    # Hand-written rationales keyed by row position. Each one must argue for
    # the gold label of its row, otherwise the exemplar teaches the model to
    # reason towards one option and then answer another.
    sample_reasonings = {
        0: "Tracy didn’t go home and resisted doing so, which suggests she’s avoiding returning. Before anything else, she needs to secure a temporary place to stay. So finding somewhere to go is her immediate need.",
        # Rewritten: the previous text argued Sydney was NOT sympathetic, which
        # contradicted both the context ("felt bad afterwards") and gold label A.
        1: "Sydney could not give the homeless woman any change because Sydney had no money, yet Sydney felt bad afterwards. Feeling bad about being unable to help shows that Sydney cares about the woman's situation. That concern for someone else's hardship is what being sympathetic means.",
        2: "Sasha protected patients' rights. People who care about rights often organize support rather than make laws directly. Getting petitions signed is a common and accessible action that aligns with this goal.",
        3: "Jordan was responsible for food and later felt something. If he failed, he might feel guilt or shame. The phrasing ‘let his friends down’ suggests disappointment. Therefore, he likely feels horrible afterward.",
        # Rewritten: the previous text concluded "passive", contradicting gold label C.
        # NOTE(review): written from the visible start of the context only;
        # confirm the wording against the full row 4 context.
        4: "Kendall opened their mouth to speak, so Kendall is actively speaking up rather than staying silent. A very quiet or very passive person would be more likely to hold back. Of the three descriptions, only the aggressive and talkative one fits someone who speaks out."
    }

    # Guard against label leakage: the test row must not be one of the exemplars.
    test_pos = test_idx if test_idx >= 0 else len(df) + test_idx
    if 0 <= test_pos < n_shot:
        raise ValueError(
            f"test_idx={test_idx} is one of the first {n_shot} rows used as "
            "few-shot exemplars; the prompt would leak its answer"
        )

    parts = []

    # Solved exemplars: question block + rationale + gold answer
    for i in range(n_shot):
        row = df.iloc[i]
        reasoning = sample_reasonings.get(i, "Let's think step by step...")  # fallback
        parts.append(
            f"{_format_question(row)}{reasoning}\nSo the answer is: {row['correct']}\n\n"
        )

    # Open test question (no rationale, no answer)
    parts.append(_format_question(df.iloc[test_idx]))

    return "".join(parts)


In [None]:
# Build a 3-shot prompt for the row at position 5 and print it for inspection.
prompt = build_cot_prompt(df=dev_df, test_idx=5, n_shot=3)
print(prompt)


Q: What does Tracy need to do before this?
Context: Tracy didn't go home that evening and resisted Riley's attacks.
A. make a new plan
B. Go home and see Riley
C. Find somewhere to go
Let's think step by step.
Tracy didn’t go home and resisted doing so, which suggests she’s avoiding returning. Before anything else, she needs to secure a temporary place to stay. So finding somewhere to go is her immediate need.
So the answer is: C

Q: How would you describe Sydney?
Context: Sydney walked past a homeless woman asking for change but did not have any money they could give to her. Sydney felt bad afterwards.
A. sympathetic
B. like a person who was unable to help
C. incredulous
Let's think step by step.
Sydney walked past a homeless woman asking for help, which implies a lack of emotional response or action. Someone sympathetic would likely have stopped or acknowledged the woman. Sydney’s behavior suggests she felt nothing or ignored the situation.
So the answer is: A

Q: What will patients 

In [None]:
import openai

# Send the demo prompt to the chat model once and print the raw completion.
import os
from getpass import getpass

# Keep the secret out of the notebook: read it from the environment, or ask
# for it interactively when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
openai.api_key = api_key  # kept for any code that still reads the module-level setting

client = openai.OpenAI(api_key=api_key)

response = client.chat.completions.create(
    model="gpt-4.1-mini",
    messages=[
        {"role": "user", "content": prompt}
    ],
    temperature=0  # consistency
)

output = response.choices[0].message.content

print(output)


Let's think step by step.

Cameron left the Halloween party early and drove home quickly. This suggests urgency and concern. Among the options:

- Going to bed (C) is unlikely to require rushing.
- Taking a child to the doctor (B) is urgent and would explain the quick departure.
- Getting home to a sick dog (A) could also be urgent, but generally, taking a child to the doctor is a more common urgent reason.

Therefore, the most plausible reason is B.

So the answer is: B


In [None]:
import re
from tqdm import tqdm

# Final-answer cue followed by an option letter. Optional "*", "(" or "["
# wrappers allow "**B**" and "(B)"; the trailing \b keeps a word such as
# "Cameron" from being read as option C.
_ANSWER_PATTERN = re.compile(r"So the answer is:\s*[*(\[]*([ABC])\b")


def extract_choice(output_text):
    """Return the option letter announced by the first "So the answer is:" cue.

    Args:
        output_text: Model completion text; may be ``None`` or empty.

    Returns:
        ``"A"``, ``"B"`` or ``"C"``, or ``None`` when the text is missing or
        contains no cue followed by a standalone option letter.
    """
    if not output_text:
        return None
    match = _ANSWER_PATTERN.search(output_text)
    return match.group(1) if match else None

def evaluate_accuracy_batch(df, test_indices, n_shot_list):
    """Measure chain-of-thought accuracy for several shot counts on the same items.

    For every shot count, a prompt is built for each test position, sent to
    the chat model through the module-level ``client``, and the parsed option
    letter is compared with the row's ``correct`` label.

    Args:
        df: DataFrame accepted by ``build_cot_prompt``; gold labels are read
            from its ``correct`` column.
        test_indices: Iterable of row positions (``iloc``) to evaluate.
        n_shot_list: Iterable of shot counts to evaluate.

    Returns:
        Dict mapping each shot count to ``{"accuracy": float, "details": list}``
        where ``details`` holds ``(idx, pred, gold)`` tuples. ``pred`` is
        ``None`` when the request failed or no answer could be parsed. With no
        test items the accuracy is reported as ``0.0``.
    """
    # Materialise once so a one-shot iterable can be reused for every shot count.
    indices = list(test_indices)
    results = {}

    for n_shot in n_shot_list:
        correct_count = 0
        predictions = []

        print(f"\nEvaluating {n_shot}-shot prompting:")
        if not indices:
            # Nothing to evaluate: avoid dividing by zero below.
            results[n_shot] = {"accuracy": 0.0, "details": predictions}
            continue

        for idx in tqdm(indices):
            prompt = build_cot_prompt(df, n_shot=n_shot, test_idx=idx)
            # Positional lookup, matching how build_cot_prompt selects the row;
            # a label-based lookup could pick a different row on a
            # non-default index.
            gold = df.iloc[idx]['correct']
            pred = None

            try:
                response = client.chat.completions.create(
                    model="gpt-4.1-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0
                )
                output = response.choices[0].message.content
            except Exception as e:
                # Best effort: record the item as unanswered and keep going.
                print(f"Error at idx {idx}: {e}")
            else:
                # The message content can be None; treat that as "no answer".
                pred = extract_choice(output) if output else None

            predictions.append((idx, pred, gold))
            if pred == gold:
                correct_count += 1

        results[n_shot] = {
            "accuracy": correct_count / len(indices),
            "details": predictions
        }

    return results


In [None]:
# Evaluate 100 consecutive dev rows, starting right after the five rows that
# have hand-written rationales, at three different shot counts.
FIRST_TEST_ROW = 5
NUM_TEST_ROWS = 100
test_indices = [FIRST_TEST_ROW + offset for offset in range(NUM_TEST_ROWS)]
n_shot_list = [1, 3, 5]

results = evaluate_accuracy_batch(
    df=dev_df,
    test_indices=test_indices,
    n_shot_list=n_shot_list,
)



Evaluating 1-shot prompting:


100%|██████████| 100/100 [04:41<00:00,  2.81s/it]



Evaluating 3-shot prompting:


100%|██████████| 100/100 [03:57<00:00,  2.38s/it]



Evaluating 5-shot prompting:


100%|██████████| 100/100 [04:41<00:00,  2.81s/it]


In [None]:

# Query the model again for every evaluated item so the full response text and
# its word count can be stored next to the earlier prediction, then export
# everything to CSV.
# NOTE(review): the response saved here comes from a second API call, so it is
# not guaranteed to be the exact text `pred` was parsed from.
rows = []

for n_shot, result in results.items():
    for idx, pred, gold in result['details']:
        correct = int(pred == gold) if pred else 0

        prompt = build_cot_prompt(dev_df, n_shot=n_shot, test_idx=idx)
        try:
            response = client.chat.completions.create(
                # NOTE(review): the original comment here read "three models";
                # presumably this cell is rerun once per model — confirm.
                model="gpt-4.1-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )
            full_output = response.choices[0].message.content
        except Exception as e:
            # One failed request must not discard the rows collected so far.
            print(f"Error at idx {idx} ({n_shot}-shot): {e}")
            full_output = None

        # Leave the length empty when there is no response text at all.
        reasoning_length = None if full_output is None else len(full_output.split())

        rows.append({
            "index": idx,
            "n_shot": n_shot,
            "pred": pred,
            "gold": gold,
            "correct": correct,
            "reasoning_length": reasoning_length,
            "full_response": full_output
        })

df_full = pd.DataFrame(rows)
df_full.to_csv("cot_eval_full_4.1mini.csv", index=False)
