# Final LLM class Project 

Authors: Luisa, Sebastian, Jan-Felix, Nion 


## LLM Model Set-up

In [2]:
import torch
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer
import re

import os
os.chdir("/Users/luisakurth/Downloads/")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

##################################################
#LLM Set Up
##################################################
# this first part is from the course webpage
# see: https://cogsciprag.github.io/LLM-implications/materials/session3
# I don't know whether we are allowed to copy this verbatim
# but it wouldn't be very much work to implement something similar anyways
# of course, if possible, we should adapt it to more recent models

# taken from: https://cogsciprag.github.io/LLM-implications/materials/session3

# load the tokenizer & model for T5 & GPT2
tokenizer_T5 = AutoTokenizer.from_pretrained("google/flan-t5-base")
model_T5 = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base",
                                                 pad_token_id=tokenizer_T5.eos_token_id)

tokenizer_GPT2 = GPT2Tokenizer.from_pretrained("gpt2")
model_GPT2 = GPT2LMHeadModel.from_pretrained("gpt2",
                                             pad_token_id=tokenizer_GPT2.eos_token_id)

# convenience function for nicer output
def pretty_print(s):
    print("Output:\n" + 100 * '-')
    print(s)


def generate(prompt, model="T5"):
    if model == "T5":
        model = model_T5
        tokenizer = tokenizer_T5
    else:
        model = model_GPT2
        tokenizer = tokenizer_GPT2
    # encode context the generation is conditioned on
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    # get output with standard parameters
    sample_output = model.generate(
        input_ids,        # context to continue
        do_sample=True,   # use sampling (not beam search (see below))
        # return maximally 50 words (including the input given)
        max_length=500,
        top_k=0,          # just sample one word
        top_p=1,          # consider all options
        temperature=0.7   # soft-max temperature
    )
    return(tokenizer.decode(sample_output[0], skip_special_tokens=True))




## Experiments Set-up 

In [89]:
###########################################
#Experiment Set-up 
###########################################

def convert_exp1(text):
    """this function converts the experimental item into a prompt
    :text param: experimental item/sentence
    :output: prompt for model
    """
    text = text.replace(
        "XXX", ". Choose the more likely number:").replace("YYYY", "or")
    text = text+"?"
    return text



def process_stimuli(convert_func, exp_stimuli):
    """
    Processes a stimuli file and extracts "thinks" and "announced" prompts in two sepetate datasets.
    :param convert_func: requires function to convert the prompts as needed.
    :param exp_stimuli: requires a string of experimental txt file 
    :return: List of "thinks" and list of "announced" prompts.
    """
    thinks = []
    announced = []
    all_lines = []
    stimuli_exp1_file = open(exp_stimuli, 'r')
    # open file (from: https://osf.io/9eg34/)

    for l in stimuli_exp1_file.readlines():
        all_lines.append(l)

    for i in range(len(all_lines)//3):

        # in the file, there is always one "thinks" line followed by one "announced" line and one blank line
        line1 = all_lines.pop(0)
        # remove first part since its an identifier
        text1 = " ".join(line1.split()[1:])
        prompt1 = convert_func(text1)
        thinks.append(prompt1)

        line2 = all_lines.pop(0)
        text2 = " ".join(line2.split()[1:])
        prompt2 = convert_func(text2)
        announced.append(prompt2)

        all_lines.pop(0)

    return thinks, announced



def process_stimuli_exp4(convert_func, exp_stimuli):
    """
    Processes a stimuli file for experiment 4.
    :param convert_func: requires function to convert the prompts as needed.
    :param exp_stimuli: requires a string of experimental txt file 
    :return: List of "high", list of "mid", and list of "low" prompts.
    """
    high = []
    mid = []
    low = []
    all_lines = []
    stimuli_exp1_file = open(exp_stimuli, 'r')
    # open file (from: https://osf.io/9eg34/)

    for l in stimuli_exp1_file.readlines():
        all_lines.append(l)

    for i in range(len(all_lines)//4):

        # in the file, there is always one "high" line followed by one "mid" line, on "low" line, and one blank line
        line1 = all_lines.pop(0)
        # remove first part since its an identifier
        text1 = " ".join(line1.split()[1:])
        prompt1 = convert_func(text1)
        high.append(prompt1)

        line2 = all_lines.pop(0)
        text2 = " ".join(line2.split()[1:])
        prompt2 = convert_func(text2)
        mid.append(prompt2)

        line3 = all_lines.pop(0)
        text3 = " ".join(line3.split()[1:])
        prompt3 = convert_func(text3)
        low.append(prompt3)

        all_lines.pop(0)

    return high, mid, low



def extract_number(question, response):
    """This function defines whether the lower or higher or both or neither number of the question is present in the model response 
    """
    # this is still very rough and could be modified to reduce the risk of missing out some instances (e.g. by
    # excluding more irrelevant punctuation marks, or including number word representations)
    qs = question.replace("?", "").split()
    low = qs[-3]
    high = qs[-1]
    has_low = False
    has_high = False
    rs = response.replace(".", "").replace(",", "").split()
    for i in rs:
        if i == low:
            has_low = True
        if i == high:
            has_high = True

    if has_low and has_high:
        return "both"
    elif has_low:
        return "low"
    elif has_high:
        return "high"
    else:
        return "neither"



def get_counts(N, condition1prompts, condition2prompts, model="T5"):
    '''Counts the numbers in the model response. 
    :param N: number of participatants i.e. number of repeats for model 
    :param condition1prompts: dataset with e.g. think sentences 
    :param condition2prompts: dataset with e.g. announced sentences
    :param model: LLM model to use 
    :output: condition1_counts and condition2_counts, in the format [counts of both numbers in answer, counts of low number, counts of high, counts of neither]
    '''
    condition1_counts = [0, 0, 0, 0]
    condition2_counts = [0, 0, 0, 0]

    for p in range(N):  # Number of participants in the "think" condition
        for tq in condition1prompts:
            response = generate(tq, model)
            result = extract_number(tq, response)
            if result == "both":
                condition1_counts[0] += 1
            elif result == "low":
                condition1_counts[1] += 1
            elif result == "high":
                condition1_counts[2] += 1
            else:
                condition1_counts[3] += 1

    for p in range(N):  # Number of participants in the "announce" condition
        for aq in condition2prompts:
            response = generate(aq, model)
            result = extract_number(aq, response)
            if result == "both":
                condition2_counts[0] += 1
            elif result == "low":
                condition2_counts[1] += 1
            elif result == "high":
                condition2_counts[2] += 1
            else:
                condition2_counts[3] += 1

    return condition1_counts, condition2_counts


def get_response(N, condition1prompts, condition2prompts, name_cond1, name_cond2, model="T5"):
    '''Reports the prompt, response and response classifiction (into low, high, both, neither) of the model for N trials
    :param N: number of participatants i.e. number of repeats for model 
    :param condition1prompts: dataset with condition 1 sentences as list
    :param condition2prompts: dataset with condition 2 sentences as list 
    :param name_cond1: name of condtion that shows up in the output table 
    :param condition2prompts: name of condtion that shows up in the output table 
    :param model: LLM model to use 
    :output: dataframe condition1output and dataframe condition2output
    '''
    condition1output = pd.DataFrame(columns=["Condition", "Prompt", "Response", "Number"])
    condition2output = pd.DataFrame(columns=["Condition", "Prompt", "Response", "Number"])


    for p in range(N): 
        
        for tq in condition1prompts:
            response = generate(tq, model)
            result = extract_number(tq, response)
            row = {"Condition": name_cond1, "Prompt": tq, "Response": response, "Number": result}
            condition1output = condition1output.append(row, ignore_index=True)

        for tq in condition2prompts:
            response = generate(tq, model)
            result = extract_number(tq, response)
            row = {"Condition": name_cond2, "Prompt": tq, "Response": response, "Number": result}
            condition2output = condition2output.append(row, ignore_index=True)

    return condition1output, condition2output

def count_string_occurrences(dataframe, column_name):
    # Get the counts of each string in the specified column
    string_counts = dataframe[column_name].value_counts()

    # Create a dictionary to store the string counts
    counts = {}

    # Iterate over the unique strings and their counts
    for string, count in string_counts.items():
        counts[string] = count

    return counts



## Experiment 1

### Test results of different prompting strategies

T5 Model

In [70]:
#create dataframe for all counts results: 
results = pd.DataFrame(columns=["Both", "Low", "High", "Neither"])

In [None]:
##################
#baseline results
##################

#get the prompts for the 2 condtions in 2 lists
thinks, announced = process_stimuli(convert_exp1,"stimuli_Exp1.txt")

#get the full responses to look at 
think_response, announce_response = get_response(N=1,condition1prompts=thinks, condition2prompts= announced, name_cond1="baseline_T5_t", name_cond2="baseline_T5_a")
think_resp_counts = count_string_occurrences(think_response, "Number")
announce_resp_counts = count_string_occurrences(announce_response, "Number")

print("The T5 model in the think condition:",think_resp_counts )
print("The T5 model in the announce condition:", announce_resp_counts)


#get just counts for N = 10
think_counts, announce_counts = get_counts(N=1,condition1prompts=thinks, condition2prompts= announced)

print("The T5 model in the think condition:", think_counts)
print("The T5 model in the announce condition:", announce_counts)

# Append the new row to the Counts_Results DataFrame
results.loc["baseline_T5_t"] = think_counts
results.loc["baseline_T5_a"] = announce_counts

In [62]:
##################
#1-shot learning
#################

def convert_oneshot(text):
    """Creates prompt with 1 example before the actual task  
    input: experimental item 
    output: Exmaple + Experimental item Task (Choose the more likely number)
    """
    example1 = "Rachel is a librarian and works with Mark. Mark thinks that Rachel read ___ books last month. Choose the more likely number: 45 or 52. The answer is 45. Rachel is a woman from the US. Rachel is a librarian and works with Mark. Mark announced to me that Rachel read ___ books last month. Choose the more likely number: 45 or 52. The answer is 52. "
    text = text.replace(
        "XXX", ". Choose the more likely number:").replace("YYYY", "or")
    text = example1+text+"?"
    return text

thinks, announced = process_stimuli(convert_oneshot,"stimuli_Exp1.txt")

think_counts, announce_counts = get_counts(N=1, condition1prompts=thinks,  condition2prompts=announced)
print("The T5 model in the think condition with 1 shot learning:", think_counts)
print("The T5 model in the announce condition with 1 shot learning", announce_counts)

# Append the new row to the Results DataFrame
results.loc["1shot_T5_t"] = think_counts
results.loc["1shot_T5_a"] = announce_counts

The T5 model in the think condition with 1 shot learning: [0, 8, 4, 0]
The T5 model in the announce condition with 1 shot learning [0, 7, 5, 0]


In [8]:

##################
#2-shot learning
#################
def convert_twoshot(text):
    """Creates prompt with 2 example before the actual task  
    input: experimental item 
    output: Exmaple + Experimental item Task (Choose the more likely number)
    """
    example1 = "Rachel is a librarian and works with Mark. Mark thinks that Rachel read ___ books last month. Choose the more likely number: 45 or 52. The answer is 45. Rachel is a woman from the US. Rachel is a librarian and works with Mark. Mark announced to me that Rachel read ___ books last month. Choose the more likely number: 45 or 52. The answer is 52."
    example2 = "Alex is a person from the US. Alex works at a cafe with Emma. Emma thinks that Alex drank ___ cups of coffee yesterday. Choose the more likely number: 3 or 5. The answer is 3. Alex is a person from the US. Alex works at a cafe with Emma. Emma announced to me that Alex drank ___ cups of coffee yesterday. Choose the more likely number:  3 or 5. The answer is 5."
    text = text.replace(
        "XXX", ". Choose the more likely number:").replace("YYYY", "or")
    text = example1+example2+text+"?"
    return text

thinks, announced = process_stimuli(convert_twoshot, "stimuli_Exp1.txt")
think_counts, announce_counts = get_counts(N=5, condition1prompts=thinks, condition2prompts=announced)
print("The T5 model in the think condition with 2 shot learning:", think_counts)
print("The T5 model in the announce condition with 2 shot learning:", announce_counts)

# Append the new row to the Results DataFrame
results.loc["2shot_T5_t"] = think_counts
results.loc["2shot_T5_a"] = announce_counts


##################
#3-shot learning
#################
def convert_threeshot(text):
    """Creates prompt with 3 example before the actual task  
    input: experimental item 
    output: Exmaple + Experimental item Task (Choose the more likely number)
    """
    example1 = "Rachel is a librarian and works with Mark. Mark thinks that Rachel read ___ books last month. Choose the more likely number: 45 or 52. The answer is 45. Rachel is a woman from the US. Rachel is a librarian and works with Mark. Mark announced to me that Rachel read ___ books last month. Choose the more likely number: 45 or 52. The answer is 52."
    example2 = "Alex is a person from the US. Alex works at a cafe with Emma. Emma thinks that Alex drank ___ cups of coffee yesterday. Choose the more likely number: 3 or 5. The answer is 3. Alex is a person from the US. Alex works at a cafe with Emma. Emma announced to me that Alex drank ___ cups of coffee yesterday. Choose the more likely number:  3 or 5. The answer is 5."
    example3 = "Michael is a man from the US. Michael lives in a hot climate. He thinks that the temperature reached ___ degrees Celsius yesterday.Choose the more likely number: 34 or 40. The answer is 34. Michael is a man from the US. Michael lives in a hot climate. He announced to me that the temperature reached ___ degrees Celsius yesterday. Choose the more likely number:  34 or 40. The answer is 40."
    text = text.replace(
        "XXX", ". Choose the more likely number:").replace("YYYY", "or")
    text = example1+example2+example3+text+"?"
    return text

thinks, announced = process_stimuli(convert_threeshot, "stimuli_Exp1.txt")

think_counts, announce_counts = get_counts(N=5, condition1prompts=thinks, condition2prompts=announced)
print("The T5 model in the think condition with 3 shot learning:", think_counts)
print("The T5 model in the announce condition with 3 shot learning:", announce_counts)

# Append the new row to the Results DataFrame
results.loc["3shot_T5_t"] = think_counts
results.loc["3shot_T5_a"] = announce_counts



The T5 model in the think condition with 1 shot learning: [0, 39, 19, 2]
The T5 model in the announce condition with 1 shot learning [0, 36, 23, 1]
The T5 model in the think condition with 2 shot learning: [0, 38, 20, 2]
The T5 model in the announce condition with 2 shot learning: [0, 37, 23, 0]
The T5 model in the think condition with 3 shot learning: [0, 37, 20, 3]
The T5 model in the announce condition with 3 shot learning: [0, 36, 24, 0]


In [54]:
##################
#1-shot-extreme learning
#################

def convert_oneshot_extreme(text):
    """Creates prompt with 1 example before the actual task  
    input: experimental item 
    output: Exmaple + Experimental item Task (Choose the more likely number)
    """
    example_e = "Rachel is a librarian and works with Mark. Mark thinks that Rachel read ___ books last month. Choose the more likely number: 11 or 102. The answer is 11. Rachel is a woman from the US. Rachel is a librarian and works with Mark. Mark announced to me that Rachel read ___ books last month. Choose the more likely number: 11 or 102. The answer is 102. "
    text = text.replace(
        "XXX", ". Choose the more likely number:").replace("YYYY", "or")
    text = example_e+text+"?"
    return text

thinks, announced = process_stimuli(convert_oneshot_extreme, "stimuli_Exp1.txt")

think_counts, announce_counts = get_counts(N=1, condition1prompts=thinks, condition2prompts=announced)
print("The T5 model in the think condition with 1 extreme shot learning:", think_counts)
print("The T5 model in the announce condition with 1 extreme shot learning", announce_counts)

# Append the new row to the Results DataFrame
results.loc["1shot_extreme_T5_t"] = think_counts
results.loc["1shot_extreme_T5_a"] = announce_counts

The T5 model in the think condition with 1 extreme shot learning: [0, 8, 3, 1]
The T5 model in the announce condition with 1 extreme shot learning [0, 8, 4, 0]


In [10]:
###############################
#knowledge generation learning
###############################

#https://www.promptingguide.ai/techniques/knowledge

def convert_know(text):
    """Creates prompt with a infomration before the question  
    input: experimental item 
    output: Exmaple + Experimental item Task (Choose the more likely number)
    """
    knowledge = "Knowledge: the lower number is just slighlty above the mean and the larger number is a lot above the mean."
    text = text.replace(
        "XXX", ". Choose the more likely number:").replace("YYYY", "or")
    text = knowledge+text+"?"
    return text

thinks, announced = process_stimuli(convert_know, "stimuli_Exp1.txt")

think_counts, announce_counts = get_counts(N=5, condition1prompts=thinks, condition2prompts=announced)
print("The T5 model in the think condition with knowledge prompting :", think_counts)
print("The T5 model in the announce condition with knowledge prompting:", announce_counts)

# Append the new row to the Results DataFrame
results.loc["know_T5_t"] = think_counts
results.loc["know_T5_a"] = announce_counts

The T5 model in the think condition with knowledge prompting : [0, 28, 31, 1]
The T5 model in the announce condition with knowledge prompting: [0, 33, 26, 1]


In [11]:
print(results)

                    Both  Low  High  Neither
baseline_T5_t          0   34    21        5
baseline_T5_a          0   37    18        5
1shot_T5_t             0   39    19        2
1shot_T5_a             0   36    23        1
2shot_T5_t             0   38    20        2
2shot_T5_a             0   37    23        0
3shot_T5_t             0   37    20        3
3shot_T5_a             0   36    24        0
1shot_extreme_T5_t     0   42    14        4
1shot_extreme_T5_a     0   35    24        1
know_T5_t              0   28    31        1
know_T5_a              0   33    26        1


## Experiment 2

### Test results 

T5 Model

In [35]:
#create dataframe for all results of Exp2: 
results_exp2 = pd.DataFrame(columns=["Both", "Low", "High", "Neither"])

In [48]:
###########################################
#Baseline 
###########################################

def convert_exp2(text):
    """This function converts the experimental item into a prompt.
    :text param: experimental item/sentence
    :output: prompt for model
    """
    text = text.replace("XXX", ". Choose the more likely number:", 1)
    text = text.replace("XXX", "or", 1)
    text = text + "?"
    return text


blue, asked = process_stimuli(convert_exp2, "stimuli_Exp2.txt")
blue_counts, asked_counts = get_counts(N=2, condition1prompts=blue, condition2prompts=asked)

print("The T5 model in the out of blue condition:", blue_counts)
print("The T5 model in the when asked condition:", asked_counts)

# Append the new row to the Results DataFrame
results_exp2.loc["exp2_T5_blue"] = blue_counts
results_exp2.loc["exp2_T5_asked"] = asked_counts

The T5 model in the out of blue condition: [0, 35, 24, 9]
The T5 model in the when asked condition: [0, 45, 17, 6]


In [101]:
##################
#1-shot learning
#################

def convert_exp2_oneshot(text):
    """Creates prompt with 1 example before the actual task  
    input: experimental item 
    output: Exmaple + Experimental item Task (Choose the more likely number)
    """
    example1 = "Rachel is a librarian and works with Mark. Mark said out of the blue that Rachel read ___ books last month. Choose the more likely number: 45 or 52. The answer is 52. Rachel is a woman from the US. Rachel is a librarian and works with Mark. Mark said, when asked about it, that Rachel read ___ books last month. Choose the more likely number: 45 or 52. The answer is 45. "
    text = text.replace(
        "XXX", ". Choose the more likely number:").replace("YYYY", "or")
    text = example1+text
    return text

blue, asked = process_stimuli(convert_exp2_oneshot, "stimuli_Exp2.txt")
blue_counts, asked_counts = get_counts(N=2, condition1prompts=blue, condition2prompts=asked)

print("The T5 model in the out of blue condition:", blue_counts)
print("The T5 model in the when asked condition:", asked_counts)

# Append the new row to the Results DataFrame
results_exp2.loc["exp2_T5_blue_oneshot"] = blue_counts
results_exp2.loc["exp2_T5_aske_onshot"] = asked_counts

The T5 model in the out of blue condition: [0, 0, 21, 47]
The T5 model in the when asked condition: [0, 0, 20, 48]


In [37]:
print(results_exp2)

               Both  Low  High  Neither
exp2_T5_blue      0   38    26        4
exp2_T5_asked     0   38    27        3


## Experiment 3

### Test results 

T5 Model

note: there is a double enter in the stimuli data (after the 24th item set) which needs to be removed, otherwise the follwing will nut work.

In [40]:
#create dataframe for all results of Exp2: 
results_exp3 = pd.DataFrame(columns=["Both", "Low", "High", "Neither"])

In [100]:
###########################################
#Baseline 
###########################################

def convert_exp3(text):
    """This function converts the experimental item into a prompt.
    :text param: experimental item/sentence
    :output: prompt for model
    """
    text = text.replace("XXX", ". Choose the more likely number:", 1)
    text = text.replace("XXX", "or", 1)
    text = text #+ "?"
    return text


everyone, me = process_stimuli(convert_exp3, "stimuli_Exp3.txt")
everyone_counts, me_counts = get_counts(N=1, condition1prompts=everyone[:], condition2prompts=me[:])

print("The T5 model in the everyone condition:", everyone_counts)
print("The T5 model in the me condition:", me_counts)

# Append the new row to the Results DataFrame
results_exp2.loc["exp3_T5_everyone"] = everyone_counts
results_exp2.loc["exp3_T5_me"] = me_counts

The T5 model in the everyone condition: [0, 22, 9, 3]
The T5 model in the me condition: [0, 21, 7, 6]
