In [None]:
## Example: similarity scoring for the Llama (70B) results.
## The same procedure applies to the other models.

In [1]:
import pandas as pd 
# Load the three input tables; every file uses ';' as the field separator.
_READ_OPTS = {'sep': ';'}
goqa = pd.read_csv('data_merged_GOQA_llama_70B.csv', **_READ_OPTS)
gpt = pd.read_csv('data_merged_gpt_llama_70B.csv', **_READ_OPTS)
quiz = pd.read_csv('data_merged_quiz_llama_70B.csv', **_READ_OPTS)

In [2]:
import openai
import os
import pandas as pd
from openai import OpenAI

# Module-level OpenAI client used by gpt4_text_similarity. The API key is read
# from the OPENAI_API_KEY environment variable (None is passed if it is unset).
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def gpt4_text_similarity(text1, text2, model="gpt-4-1106-preview"):
    """
    Rate the similarity of two texts with an OpenAI chat model.

    A fixed five-example few-shot prompt is sent through the module-level
    ``client``; the model is asked to answer with a bare score.

    Parameters:
    text1 (str): First text, inserted into the prompt as "Sentence 1".
    text2 (str): Second text, inserted into the prompt as "Sentence 2".
    model (str): Name of the chat model passed to the API.

    Returns:
    float or None: The score contained in the reply (the prompt asks for
    1 = not similar up to 5 = very similar), or None when the reply is
    missing or cannot be read as a number. The result is also printed.
    """
    system = "On a scale between 1 and 5, how similar are the following two sentences? Respond only with a score between 1 and 5."
    # The prompt text (including its indentation and trailing spaces) is part
    # of the experiment and is kept exactly as is.
    prompt = f"""
                Examples:
                Input: 
                Sentence 1: Not really, too busy for that.
                Sentence 2: Yes, quite fond of academic journals.
                Output: 1.0
                Input: 
                Sentence 1: Universal healthcare, accessible to all residents.
                Sentence 2: NHS provides universal healthcare for all.
                Output: 2.0
                Input: 
                Sentence 1: Based on recommendations and personal interests.
                Sentence 2: By author, genre, recommendations, and reviews.
                Output: 3.0
                Input: 
                Sentence 1: As often as I can.
                Sentence 2: Quite often, I'm always willing.
                Output: 4.0
                Input: 
                Sentence 1: Yes, I love wearing hats!
                Sentence 2: Yes, I quite fancy wearing hats.
                Output: 5.0    
                \nInput:\n\nSentence 1: {text1}\n\n Sentence 2: {text2}. \n Output:"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ],
        max_tokens=20,
    )

    # A response without choices or without message content yields None
    # instead of an uncaught exception.
    try:
        last_message = response.choices[0].message.content
    except (AttributeError, IndexError, KeyError, TypeError):
        last_message = None

    similarity_score = _parse_similarity_reply(last_message)
    print(similarity_score)
    return similarity_score


def _parse_similarity_reply(reply):
    """Turn the model's reply text into a float score, or None if it has none."""
    import re  # local import: this helper is used before the notebook's own `import re` cell

    if not isinstance(reply, str):
        return None
    cleaned = reply.strip()
    try:
        return float(cleaned)
    except ValueError:
        pass
    # Tolerate the two near-miss formats the few-shot prompt invites:
    # an echoed "Output:" label and a trailing full stop.
    match = re.fullmatch(
        r"(?:output\s*:\s*)?([0-9]+(?:\.[0-9]+)?)\s*\.?",
        cleaned,
        flags=re.IGNORECASE,
    )
    return float(match.group(1)) if match else None


In [3]:
def normalize_column(df, column_name, new_column_name, min_value=1, max_value=5):
    """
    Rescale a score column from a fixed [min_value, max_value] scale to 0-1.

    The bounds are the theoretical ends of the rating scale (1 and 5 by
    default), not the minimum/maximum observed in the data.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column to rescale.
    column_name (str): The name of the column to rescale.
    new_column_name (str): The name of the column receiving the rescaled
        values; may equal column_name to overwrite it in the returned copy.
    min_value (float): Scale value that maps to 0. Defaults to 1.
    max_value (float): Scale value that maps to 1. Defaults to 5.

    Returns:
    pandas.DataFrame: A copy of df with the rescaled column added/replaced.
    The input DataFrame is not modified.

    Raises:
    ValueError: If min_value equals max_value, or if the column holds
        values that cannot be interpreted as numbers.
    """
    if max_value == min_value:
        raise ValueError("min_value and max_value must differ")

    # Copy the DataFrame to avoid modifying the original data
    df_normalized = df.copy()

    # Missing scores (None) can leave the column with object dtype, on which
    # arithmetic fails; to_numeric turns them into NaN first.
    scores = pd.to_numeric(df_normalized[column_name])
    df_normalized[new_column_name] = (scores - min_value) / (max_value - min_value)

    return df_normalized

In [4]:
def add_small_value(df, column_name, small_value=0.000001):
    """
    Add a small constant to every value of a column except exact 0.0 and 1.0.

    Missing values (None / NaN) are left untouched.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column. It is
        modified in place.
    column_name (str): The name of the column to modify.
    small_value (float): The constant to add. Defaults to 0.000001.

    Returns:
    pandas.DataFrame: The same DataFrame object, with the column modified.
    """
    def _nudge(value):
        # Missing entries cannot be shifted; 0.0 and 1.0 are kept exact.
        if pd.isna(value) or value in (0.0, 1.0):
            return value
        return value + small_value

    df[column_name] = df[column_name].apply(_nudge)

    return df

In [None]:
# One entry per score column:
# (target column, first text column, second text column, start message, end message).
_GPT_SCORE_JOBS = [
    ('uk_score', 'model_answer_uk', 'answer_uk', "uk_score Start", "uk_score Finished"),
    ('us_score', 'model_answer_us', 'answer_us', "us_score Start", "us_score finished"),
    ('ukGT_usGT_score', 'answer_uk', 'answer_us', "ukGT_usGT_score start", "ukGT_usGT_score finished"),
    ('ukGT_usMA_score', 'answer_uk', 'model_answer_us', "ukGT_usMA_score start", "ukGT_usMA_score finished"),
    ('ukMA_usGT_score', 'answer_us', 'model_answer_uk', "ukMA_usGT_score start", "ukMA_usGT_score finished"),
    ('ukMA_usMA_score', 'model_answer_us', 'model_answer_uk', "ukMA_usMA_score start", "ukMA_usMA_score finished"),
]

for _target, _first, _second, _start_msg, _end_msg in _GPT_SCORE_JOBS:
    print(_start_msg)

    # apply_similarity is rebound on every pass; the default arguments freeze
    # this pass's column pair so the binding stays valid after the loop.
    def apply_similarity(row, _first=_first, _second=_second):
        """Score one row; a failed call is reported and scored as None."""
        try:
            return gpt4_text_similarity(row[_first], row[_second])
        except Exception as e:
            print(f"Error processing row: {e}")
            return None

    # One API call per row of `gpt`.
    gpt[_target] = gpt.apply(apply_similarity, axis=1)
    print(_end_msg)


In [7]:
# Write the scored frame as a ';'-separated CSV (the row index is included as
# the first column, pandas' default).
# NOTE(review): in the cell order shown, this runs before the 0-1 rescaling of
# the gpt scores, so the file holds the raw scores — confirm this is intended.
gpt.to_csv('results_gpt.csv', sep=';')

In [None]:
# One entry per score column:
# (target column, first text column, second text column, start message, end message).
_QUIZ_SCORE_JOBS = [
    ('uk_score', 'model_answer_uk', 'answer_uk', "uk_score Start", "uk_score Finished"),
    ('us_score', 'model_answer_us', 'answer_us', "us_score Start", "us_score finished"),
    ('ukGT_usGT_score', 'answer_uk', 'answer_us', "ukGT_usGT_score start", "ukGT_usGT_score finished"),
    ('ukGT_usMA_score', 'answer_uk', 'model_answer_us', "ukGT_usMA_score start", "ukGT_usMA_score finished"),
    ('ukMA_usGT_score', 'answer_us', 'model_answer_uk', "ukMA_usGT_score start", "ukMA_usGT_score finished"),
    ('ukMA_usMA_score', 'model_answer_us', 'model_answer_uk', "ukMA_usMA_score start", "ukMA_usMA_score finished"),
]

for _target, _first, _second, _start_msg, _end_msg in _QUIZ_SCORE_JOBS:
    print(_start_msg)

    # apply_similarity is rebound on every pass; the default arguments freeze
    # this pass's column pair so the binding stays valid after the loop.
    def apply_similarity(row, _first=_first, _second=_second):
        """Score one row; a failed call is reported and scored as None."""
        try:
            return gpt4_text_similarity(row[_first], row[_second])
        except Exception as e:
            print(f"Error processing row: {e}")
            return None

    # One API call per row of `quiz`.
    quiz[_target] = quiz.apply(apply_similarity, axis=1)
    print(_end_msg)


In [9]:
# Write the scored frame as a ';'-separated CSV (the row index is included as
# the first column, pandas' default).
# NOTE(review): in the cell order shown, this runs before the 0-1 rescaling of
# the quiz scores, so the file holds the raw scores — confirm this is intended.
quiz.to_csv('results_quiz.csv', sep=';')

In [10]:
_QUIZ_SCORE_COLUMNS = [
    'us_score',
    'uk_score',
    'ukGT_usGT_score',
    'ukMA_usGT_score',
    'ukGT_usMA_score',
    'ukMA_usMA_score',
]

# Rescale every score column, writing the result back under the same name.
# Because the columns are overwritten, running this cell a second time would
# rescale the already-rescaled values again.
for _col in _QUIZ_SCORE_COLUMNS:
    quiz = normalize_column(quiz, _col, _col)

# Then shift every value other than 0.0 and 1.0 by the small constant.
for _col in _QUIZ_SCORE_COLUMNS:
    quiz = add_small_value(quiz, _col)

In [11]:
_GPT_SCORE_COLUMNS = [
    'us_score',
    'uk_score',
    'ukGT_usGT_score',
    'ukMA_usGT_score',
    'ukGT_usMA_score',
    'ukMA_usMA_score',
]

# Rescale every score column, writing the result back under the same name.
# Because the columns are overwritten, running this cell a second time would
# rescale the already-rescaled values again.
for _col in _GPT_SCORE_COLUMNS:
    gpt = normalize_column(gpt, _col, _col)

# Then shift every value other than 0.0 and 1.0 by the small constant.
for _col in _GPT_SCORE_COLUMNS:
    gpt = add_small_value(gpt, _col)

In [14]:
import re 
def extract_letter(s):
    """
    Return the text inside the first pair of parentheses in ``s``.

    The match is non-greedy, so "(A) foo (B)" yields "A". Whatever stands
    between the parentheses is returned; it is not checked to be one letter.

    Parameters:
    s: The raw answer text. Non-string values (e.g. NaN or None for a
        missing answer) are accepted.

    Returns:
    str or None: The parenthesised text, or None if ``s`` is not a string
    or contains no "(...)" part.
    """
    # Missing answers reach this function as NaN/None, which re.search rejects.
    if not isinstance(s, str):
        return None
    match = re.search(r'\((.*?)\)', s)
    return match.group(1) if match else None

# Overwrite both model-answer columns with the text found inside the first
# parentheses of each answer (None where there is none).
for _answer_col in ('model_answer_us', 'model_answer_uk'):
    goqa[_answer_col] = goqa[_answer_col].apply(extract_letter)

In [17]:
import ast 
def extract_value_from_dict(df, dict_col, key_col, value_col):
    """
    Look up, row by row, the entry of a dictionary column selected by a key column.

    String cells in ``dict_col`` are first parsed with ``ast.literal_eval``
    and written back, so ``df`` is modified in place.

    Parameters:
    df (pandas.DataFrame): Frame holding the columns; modified in place.
    dict_col (str): Column with dictionaries (or their string representation).
    key_col (str): Column with the key to look up in each row's dictionary.
    value_col (str): Column that receives the looked-up value.

    Returns:
    pandas.DataFrame: The same ``df``. ``value_col`` is None wherever the
    cell is not a dictionary, the key is None, or the key is not present.
    """
    def _parse(cell):
        # Only strings need parsing; anything else is passed through unchanged.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    df[dict_col] = df[dict_col].apply(_parse)

    def _lookup(row):
        mapping, key = row[dict_col], row[key_col]
        if not isinstance(mapping, dict) or key is None:
            return None
        return mapping.get(key)

    df[value_col] = df.apply(_lookup, axis=1)
    return df

# For each model-answer column, fetch the 'options_dict' entry stored under
# the extracted answer key and keep it in a matching '*_option_match' column.
for _key_col, _match_col in (
    ('model_answer_us', 'model_answer_us_option_match'),
    ('model_answer_uk', 'model_answer_uk_option_match'),
):
    goqa = extract_value_from_dict(goqa, 'options_dict', _key_col, _match_col)


In [35]:
# Cast both option-match columns to str.
# NOTE(review): rows without a match (None) presumably become a literal
# string such as 'None' here — confirm that downstream lookups expect this.
for _match_col in ('model_answer_us_option_match', 'model_answer_uk_option_match'):
    goqa[_match_col] = goqa[_match_col].astype(str)

In [47]:
# Score for questions with answer options
def calculate_score_gt(row):
    """
    Score the UK model answer against the US reference answer for one row.

    For 'Likert Scale', 'Numerical Scale' and 'Ordinal Scale' questions with
    more than two options, both answers are located in ``row['options']``,
    their positions are scaled to 0-1 and the score is 1 - |difference|.
    Every other question gets 1 for an exact match and 0 otherwise.

    Parameters:
    row: Mapping or pandas row with the keys 'question type', '#_options',
        'options', 'model_answer_uk_option_match' and 'answer_us'.
        'options' may be a list/tuple or its string representation.

    Returns:
    float, int or None: The score. None is returned for a scale question
    whose options cannot be read as a list of at least two entries, or
    whose answers cannot be found among the options.
    """
    uk_model_answer = row['model_answer_uk_option_match']
    us_reference = row['answer_us']

    is_scale = row['question type'] in ['Likert Scale', 'Numerical Scale', 'Ordinal Scale']
    if not (is_scale and row['#_options'] > 2):
        return int(uk_model_answer == us_reference)

    options = _as_option_list(row['options'])
    # Fewer than two options leaves no span to scale positions against.
    if options is None or len(options) < 2:
        return None

    uk_position = _option_position(options, uk_model_answer)
    us_position = _option_position(options, us_reference)
    # An answer that is not one of the options (e.g. no option was matched)
    # cannot be placed on the scale, so the row is left unscored.
    if uk_position is None or us_position is None:
        return None

    # Normalize the positions of the answers in the options list to a 0-1 range
    span = len(options) - 1
    error = abs(uk_position / span - us_position / span)

    # The score is inversely related to the error
    return 1 - error


def _as_option_list(options):
    """Return options as a list, parsing a string representation; None if not list-like."""
    if isinstance(options, str):
        # Searching the raw string would return character offsets, not positions.
        try:
            options = ast.literal_eval(options)
        except (ValueError, SyntaxError):
            return None
    return list(options) if isinstance(options, (list, tuple)) else None


def _option_position(options, answer):
    """Return the index of answer in options, or None if it is not among them."""
    try:
        return options.index(answer)
    except ValueError:
        pass
    # Fall back to comparing as text, e.g. answer '3' against numeric option 3.
    as_text = [str(option) for option in options]
    try:
        return as_text.index(str(answer))
    except ValueError:
        return None


In [48]:
# Score every goqa row with calculate_score_gt, which compares
# 'model_answer_uk_option_match' with 'answer_us'.
goqa['ukMA_usGT_score'] = goqa.apply(calculate_score_gt, axis=1)

In [49]:
# Shift every goqa score other than 0.0 and 1.0 by the small constant.
# NOTE(review): only 'ukMA_usGT_score' is computed for goqa in the cells shown
# here; the other five columns presumably come from the input CSV or from
# cells not included — confirm they exist on a fresh run.
for _col in (
    'us_score',
    'uk_score',
    'ukGT_usGT_score',
    'ukMA_usGT_score',
    'ukGT_usMA_score',
    'ukMA_usMA_score',
):
    goqa = add_small_value(goqa, _col)

In [50]:
# Write the final goqa frame as a ';'-separated CSV (the row index is included
# as the first column, pandas' default).
goqa.to_csv('results_goqa.csv', sep=';')