In [1]:
import pandas as pd
df = pd.read_csv('data_merged_updated_similarity_zero_shot.csv', sep=';')


In [2]:
goqa = df[(df['source'] == 'GAS') | (df['source'] == 'WVS')]

In [3]:
data = df[(df['source'] == 'GPT-4 generated') |(df['source'] == 'GPT / https://www.beelovedcity.com/england-quiz')
            |(df['source'] == 'GPT / https://www.anadventurousworld.com/usa-trivia-questions/')]


In [12]:
import openai
import os
import pandas as pd
from openai import OpenAI

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def gpt4_text_similarity(text1, text2, model="gpt-4-1106-preview"):
    """
    Measures the similarity between two texts using GPT-4.

    Parameters:
    text1 (str): First text for comparison.
    text2 (str): Second text for comparison.
    model (str): The GPT model to use.

    Returns:
    float: A similarity score between 0 (not similar) and 1 (very similar).
    """

    system = f"""Evaluate the similarity between the two sentences. Assign the pair a score between 1 and 5 as follows:
                1 : The two sentences are completely dissimilar.
                2 : The two sentences are dissimilar, but are on a similar topic.
                3 : The two sentences are roughly equivalent, but some important information differs or is missing. 
                4 : The two sentences are mostly equivalent, but some unimportant details differ.
                5 : The two sentences are completely equivalent.
                Respond only with a score between 1 and 5."""
    prompt = f"\nInput:\n\nSentence 1: {text1}\n\n Sentence 2: {text2}. \n Output:"
    #prompt = f"Rate the similarity between the following two texts on a scale from 0 (completely different) to 1 (identical):\n\nText 1: {text1}\n\nText 2: {text2}"
    response = client.chat.completions.create(
    model=model,
    messages=[
        {
        "role": "system",
        "content": system
        },
        {
        "role": "user",
        "content":  prompt
        }
    ],
    max_tokens=20
    )
    #print(response.choices[0].message.content)
    # Extracting the similarity score from the response
    try:
        last_message = response.choices[0].message.content
        similarity_score = float(last_message.strip())
    except (ValueError, KeyError, IndexError):
        similarity_score = None
    print(similarity_score)
    return similarity_score


In [9]:
def normalize_column(df, column_name, new_column_name):
    """
    Normalize the values in a DataFrame column to the range 0-1 and save them in a new column.
    
    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column to normalize.
    column_name (str): The name of the column to normalize.
    new_column_name (str): The name of the new column for the normalized values.
    
    Returns:
    pandas.DataFrame: A DataFrame with the additional normalized column.
    """
    # Copy the DataFrame to avoid modifying the original data
    df_normalized = df.copy()
    
    # Apply Min-Max normalization
    df_normalized[new_column_name] = (df_normalized[column_name] - 1) / (5 - 1)
    
    return df_normalized

In [10]:
def add_small_value(df, column_name):
    """
    Add 0.00001 to each value in the specified column of a DataFrame 
    if the value is not 0.0 or 1.0.
    
    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to modify.
    
    Returns:
    pandas.DataFrame: A DataFrame with the modified column.
    """
    # Define the lambda function for the condition
    add_value = lambda x: x + 0.000001 if x not in [0.0, 1.0] else x

    # Apply the function to the specified column
    df[column_name] = df[column_name].apply(add_value)
    
    return df

In [13]:
print("uk_score Start")
def apply_similarity(row):
    try:
        return gpt4_text_similarity(row['model_answer_uk'], row['answer_uk'])
    except Exception as e:
        print(f"Error processing row: {e}")
        return None

data['uk_score'] = data.apply(apply_similarity, axis=1)
print("uk_score Finished")

print("us_score Start")
def apply_similarity(row):
    try:
        return gpt4_text_similarity(row['model_answer_us'], row['answer_us'])
    except Exception as e:
        print(f"Error processing row: {e}")
        return None

data['us_score'] = data.apply(apply_similarity, axis=1)

print("us_score finished")

print("ukGT_usGT_score start")
def apply_similarity(row):
    try:
        return gpt4_text_similarity(row['answer_uk'], row['answer_us'])
    except Exception as e:
        print(f"Error processing row: {e}")
        return None

data['ukGT_usGT_score'] = data.apply(apply_similarity, axis=1)
"""
print("ukGT_usGT_score finished")

print("ukGT_usMA_score start")
def apply_similarity(row):
    try:
        return gpt4_text_similarity(row['answer_uk'], row['model_answer_us'])
    except Exception as e:
        print(f"Error processing row: {e}")
        return None

data['ukGT_usMA_score'] = data.apply(apply_similarity, axis=1)
print("ukGT_usMA_score finished")

print("ukMA_usGT_score start")

def apply_similarity(row):
    try:
        return gpt4_text_similarity(row['answer_us'], row['model_answer_uk'])
    except Exception as e:
        print(f"Error processing row: {e}")
        return None

data['ukMA_usGT_score'] = data.apply(apply_similarity, axis=1)
print("ukMA_usGT_score finished")

print("ukMA_usMA_score start")
def apply_similarity(row):
    try:
        return gpt4_text_similarity(row['model_answer_us'], row['model_answer_uk'])
    except Exception as e:
        print(f"Error processing row: {e}")
        return None

data['ukMA_usMA_score'] = data.apply(apply_similarity, axis=1)
print("ukMA_usMA_score finished")
"""

uk_score Start
5.0
5.0
5.0
2.0
5.0
5.0
2.0
2.0
5.0
2.0
3.0
4.0
5.0
4.0
4.0
5.0
5.0
4.0
5.0
4.0
2.0
2.0
2.0
1.0
3.0
5.0
2.0
4.0
1.0
2.0
5.0
5.0
1.0
4.0
5.0
5.0
5.0
1.0
2.0
5.0
3.0
2.0
5.0
2.0
5.0
1.0
2.0
5.0
2.0
2.0
1.0
2.0
2.0
2.0
4.0
1.0
2.0
2.0
1.0
2.0
2.0
3.0
2.0
2.0
2.0
1.0
2.0
2.0
5.0
5.0
3.0
4.0
1.0
2.0
4.0
2.0
4.0
5.0
1.0
4.0
4.0
4.0
2.0
2.0
2.0
5.0
2.0
1.0
5.0
2.0
2.0
5.0
5.0
3.0
2.0
4.0
5.0
2.0
2.0
5.0
1.0
2.0
5.0
1.0
5.0
5.0
3.0
2.0
2.0
2.0
5.0
5.0
2.0
2.0
1.0
1.0
2.0
4.0
2.0
2.0
2.0
3.0
2.0
5.0
5.0
5.0
5.0
1.0
2.0
4.0
5.0
2.0
1.0
5.0
2.0
4.0
5.0
2.0
1.0
1.0
2.0
2.0
2.0
4.0
1.0
1.0
1.0
4.0
3.0
2.0
1.0
2.0
3.0
4.0
3.0
4.0
4.0
2.0
5.0
3.0
2.0
4.0
4.0
4.0
2.0
3.0
4.0
2.0
1.0
2.0
4.0
2.0
2.0
1.0
3.0
4.0
2.0
2.0
2.0
4.0
2.0
5.0
4.0
2.0
2.0
1.0
3.0
1.0
2.0
5.0
2.0
5.0
2.0
3.0
5.0
3.0
1.0
1.0
1.0
4.0
3.0
4.0
1.0
2.0
1.0
2.0
1.0
2.0
2.0
4.0
1.0
1.0
4.0
2.0
2.0
2.0
1.0
3.0
1.0
3.0
3.0
1.0
4.0
4.0
4.0
2.0
1.0
2.0
4.0
1.0
3.0
5.0
1.0
3.0
1.0
3.0
1.0
2.0
2.0
2.0
2.0
1.0
2.0
1.0
1.0
2.0
2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['uk_score'] = data.apply(apply_similarity, axis=1)


2.0
1.0
5.0
2.0
5.0
5.0
3.0
5.0
5.0
3.0
3.0
5.0
2.0
4.0
5.0
5.0
5.0
5.0
2.0
5.0
5.0
3.0
2.0
5.0
2.0
5.0
2.0
5.0
5.0
2.0
5.0
1.0
5.0
2.0
5.0
4.0
2.0
5.0
4.0
5.0
2.0
4.0
5.0
3.0
5.0
2.0
5.0
5.0
3.0
1.0
4.0
2.0
2.0
2.0
5.0
5.0
2.0
2.0
2.0
3.0
3.0
2.0
4.0
2.0
2.0
4.0
2.0
2.0
2.0
1.0
2.0
5.0
1.0
2.0
4.0
2.0
5.0
5.0
2.0
1.0
5.0
1.0
2.0
2.0
1.0
3.0
2.0
2.0
5.0
2.0
3.0
5.0
5.0
2.0
5.0
5.0
2.0
2.0
2.0
2.0
1.0
2.0
1.0
1.0
5.0
2.0
5.0
2.0
2.0
5.0
5.0
5.0
1.0
2.0
2.0
1.0
2.0
1.0
2.0
2.0
2.0
5.0
5.0
1.0
2.0
5.0
5.0
4.0
5.0
1.0
4.0
2.0
1.0
4.0
2.0
2.0
5.0
5.0
2.0
2.0
1.0
4.0
5.0
4.0
1.0
2.0
1.0
5.0
5.0
5.0
4.0
2.0
2.0
1.0
2.0
4.0
1.0
1.0
3.0
3.0
1.0
4.0
3.0
3.0
5.0
3.0
4.0
2.0
1.0
4.0
2.0
2.0
5.0
4.0
4.0
3.0
2.0
2.0
5.0
1.0
2.0
5.0
1.0
3.0
2.0
1.0
5.0
5.0
3.0
2.0
2.0
5.0
2.0
3.0
5.0
2.0
2.0
5.0
1.0
1.0
2.0
5.0
2.0
4.0
4.0
5.0
3.0
4.0
5.0
5.0
5.0
1.0
3.0
1.0
1.0
2.0
1.0
1.0
2.0
1.0
1.0
1.0
4.0
2.0
4.0
1.0
3.0
4.0
4.0
1.0
2.0
5.0
4.0
3.0
5.0
1.0
5.0
3.0
1.0
2.0
5.0
2.0
5.0
1.0
5.0
2.0
5.0
5.0
3.0
2.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['us_score'] = data.apply(apply_similarity, axis=1)


2.0
2.0
5.0
4.0
4.0
5.0
2.0
2.0
1.0
2.0
2.0
2.0
5.0
5.0
4.0
2.0
2.0
4.0
2.0
3.0
2.0
2.0
5.0
1.0
1.0
1.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
3.0
2.0
1.0
2.0
5.0
4.0
4.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
3.0
5.0
2.0
2.0
2.0
3.0
2.0
2.0
1.0
1.0
2.0
1.0
2.0
1.0
2.0
1.0
2.0
1.0
1.0
2.0
3.0
2.0
2.0
2.0
2.0
1.0
2.0
1.0
2.0
2.0
2.0
4.0
2.0
1.0
4.0
2.0
2.0
2.0
1.0
2.0
1.0
1.0
5.0
2.0
2.0
1.0
1.0
2.0
5.0
2.0
5.0
2.0
3.0
2.0
2.0
4.0
1.0
1.0
1.0
2.0
1.0
3.0
2.0
1.0
2.0
1.0
2.0
2.0
4.0
2.0
1.0
4.0
5.0
2.0
2.0
2.0
4.0
3.0
2.0
5.0
1.0
1.0
2.0
1.0
2.0
2.0
1.0
2.0
3.0
2.0
2.0
2.0
4.0
1.0
3.0
1.0
1.0
4.0
2.0
4.0
5.0
2.0
1.0
2.0
3.0
3.0
2.0
4.0
4.0
2.0
2.0
4.0
3.0
3.0
4.0
2.0
3.0
2.0
3.0
2.0
1.0
5.0
2.0
5.0
2.0
2.0
2.0
5.0
4.0
3.0
5.0
1.0
4.0
5.0
3.0
2.0
2.0
5.0
2.0
2.0
5.0
2.0
1.0
2.0
2.0
1.0
1.0
2.0
2.0
4.0
4.0
3.0
1.0
2.0
1.0
2.0
1.0
2.0
3.0
1.0
1.0
3.0
3.0
2.0
3.0
1.0
1.0
1.0
4.0
2.0
1.0
2.0
2.0
1.0
4.0
4.0
3.0
5.0
1.0
4.0
1.0
1.0
1.0
3.0
1.0
1.0
3.0
2.0
5.0
3.0
4.0
3.0
2.0
5.0
3.0
2.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ukGT_usGT_score'] = data.apply(apply_similarity, axis=1)


'\nprint("ukGT_usGT_score finished")\n\nprint("ukGT_usMA_score start")\ndef apply_similarity(row):\n    try:\n        return gpt4_text_similarity(row[\'answer_uk\'], row[\'model_answer_us\'])\n    except Exception as e:\n        print(f"Error processing row: {e}")\n        return None\n\ndata[\'ukGT_usMA_score\'] = data.apply(apply_similarity, axis=1)\nprint("ukGT_usMA_score finished")\n\nprint("ukMA_usGT_score start")\n\ndef apply_similarity(row):\n    try:\n        return gpt4_text_similarity(row[\'answer_us\'], row[\'model_answer_uk\'])\n    except Exception as e:\n        print(f"Error processing row: {e}")\n        return None\n\ndata[\'ukMA_usGT_score\'] = data.apply(apply_similarity, axis=1)\nprint("ukMA_usGT_score finished")\n\nprint("ukMA_usMA_score start")\ndef apply_similarity(row):\n    try:\n        return gpt4_text_similarity(row[\'model_answer_us\'], row[\'model_answer_uk\'])\n    except Exception as e:\n        print(f"Error processing row: {e}")\n        return None\n\

In [23]:
data.columns

Index(['Unnamed: 0', 'question', 'selections', 'options', 'options_formatted',
       'source', 'value_us', 'value_uk', 'answer_us', 'answer_uk', 'category',
       '# of options', 'question type', 'category_group', 'model_answer_us',
       'model_answer_uk', 'model_answer_uk_option_match',
       'model_answer_us_option_match', '#_options', 'options_dict',
       'score_ground_truth_answers', 'value_diff', 'question_type',
       'ukGT_usGT_score', 'ukMA_usGT_score', 'ukGT_usMA_score',
       'ukMA_usMA_score', 'us_score', 'uk_score'],
      dtype='object')

In [14]:
data = normalize_column(data, 'us_score', 'us_score')
data = normalize_column(data, 'uk_score', 'uk_score')
data = normalize_column(data, 'ukGT_usGT_score', 'ukGT_usGT_score')
data = normalize_column(data, 'ukMA_usGT_score', 'ukMA_usGT_score')
data = normalize_column(data, 'ukGT_usMA_score', 'ukGT_usMA_score')
data = normalize_column(data, 'ukMA_usMA_score', 'ukMA_usMA_score')
data = add_small_value(data, 'us_score')
data = add_small_value(data, 'uk_score')
data = add_small_value(data, 'ukGT_usGT_score')
data = add_small_value(data, 'ukMA_usGT_score')
data = add_small_value(data, 'ukGT_usMA_score')
data = add_small_value(data, 'ukMA_usMA_score')


In [15]:
merged_df = pd.concat([goqa, data], ignore_index=True)


In [16]:
inspect = merged_df[merged_df['ukGT_usGT_score'] < 0.9]
print("-----------------")
print("US-Score:", inspect.us_score.mean())
print("UK-Score:", inspect.uk_score.mean())
print("GroundTruth-Score:", inspect.ukGT_usGT_score.mean())
#print("UK-ModelAnswer_vs_US-GroundTruth-Score:", inspect.ukMA_usGT_score.mean())
#print("US-ModelAnswer_vs_UK-GroundTruth-Score:", inspect.ukGT_usMA_score.mean())
#print("UK-ModelAnswer_vs_US-ModelAnswer-Score:", inspect.ukMA_usMA_score.mean())
print("Data size:", len(inspect))
print("-----------------")

-----------------
US-Score: 0.5218134865438784
UK-Score: 0.4876089391022363
GroundTruth-Score: 0.40725766362437116
Data size: 1077
-----------------


In [17]:
merged_df.to_csv('data_merged_updated_similarity_zero_shot_results.csv', sep =';')

In [18]:
def if_significantly_different(result1: list, result2: list, P_VALUE_THRES=0.05):
    from scipy import stats
    import numpy as np
    score, p_value = stats.ttest_ind(result1, np.array(result2), equal_var=False)
    if_sign = p_value <= P_VALUE_THRES
    return if_sign

In [20]:
Us_similarity_scores_per_sample = merged_df['us_score'].tolist()
Uk_similarity_scores_per_sample = merged_df['uk_score'].tolist()
if_significantly_different(Us_similarity_scores_per_sample,Uk_similarity_scores_per_sample )

True