In [2]:
import pandas as pd
df = pd.read_csv('data_merged_gpt_llama_70B.csv', sep=';')

In [43]:
from bert_score import BERTScorer
import pandas as pd
import re
import contractions

scorer = BERTScorer(model_type='microsoft/deberta-xlarge-mnli', num_layers=40)
#scorer = BERTScorer(model_type='microsoft/deberta-v2-xxlarge-mnli', num_layers=22)


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = contractions.fix(text)
    return text


def keyword_matching(short_sentence, long_sentence):
    # Function to remove punctuation
    def remove_punctuation(text):
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)

    short_words = set(remove_punctuation(short_sentence).lower().split())
    long_words = set(remove_punctuation(long_sentence).lower().split())
    return any(word in long_words for word in short_words)

def calculate_similarity(data, language):
    similarity_scores = []

    for index, row in data.iterrows():
        if pd.isna(row['options']):
            if language == 'us': 
                sentence_1 = row['answer_us']
                sentence_2 = row['model_answer_us']
            elif language == 'uk': 
                sentence_1 = row['answer_uk']
                sentence_2 = row['model_answer_uk']
            elif language == 'ground_truths': 
                sentence_1 = row['answer_us']
                sentence_2 = row['answer_uk']
            # Apply keyword matching for short sentences
            if len(sentence_1.split()) <= 2 or len(sentence_2.split()) <= 2:
                if keyword_matching(sentence_1, sentence_2) or keyword_matching(sentence_2, sentence_1):
                    similarity_scores.append(1)
                else:
                    # Apply BERTScorer if keyword matching fails
                    if language == 'us': 
                        sentence_1 = row['answer_us']
                        sentence_2 = row['model_answer_us']
                    elif language == 'uk': 
                        sentence_1 = row['answer_uk']
                        sentence_2 = row['model_answer_uk']
                    elif language == 'ground_truths': 
                        sentence_1 = row['answer_us']
                        sentence_2 = row['answer_uk']
                    P, R, F = scorer.score([sentence_1], [sentence_2])
                    similarity_scores.append(P.item())
            else:
                # Apply BERTScorer for longer sentences
                if language == 'us': 
                    sentence_1 = row['answer_us']
                    sentence_2 = row['model_answer_us']
                elif language == 'uk': 
                    sentence_1 = row['answer_uk']
                    sentence_2 = row['model_answer_uk']
                elif language == 'ground_truths': 
                    sentence_1 = row['answer_us']
                    sentence_2 = row['answer_uk']
                P, R, F = scorer.score([sentence_1], [sentence_2])
                similarity_scores.append(P.item())
        else:
            similarity_scores.append(None)
        

    if language == 'us': 
        data['BERTScore_us_new'] = similarity_scores
    elif language == 'uk':
        data['BERTScore_uk_new'] = similarity_scores
    elif language == 'ground_truths':
        data['BERTScore_ground_truths'] = similarity_scores


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#calculate_similarity(df, language='us')
#calculate_similarity(df, language='uk')
calculate_similarity(df, language='ground_truths')

In [7]:
mean_score_us_new = df['BERTScore_us_new'].mean()
mean_score_uk_new = df['BERTScore_uk_new'].mean()
mean_score_us_uk_new = df['BERTScore_ground_truths'].mean()
print(mean_score_us_new, mean_score_uk_new, mean_score_us_uk_new)

0.6813332730105945 0.657805011016982 0.7397880572932107


In [6]:
df_GOQA = pd.read_csv('data_merged_GOQA_llama_70B.csv', sep=';')

In [7]:
df_GOQA.model_answer_us

0      (D), 9
1         xxx
2      (B), 6
3      (B), 6
4      (B), 6
        ...  
820    (A), 9
821    (F), 6
822    (B), 6
823    (B), 6
824    (B), 6
Name: model_answer_us, Length: 825, dtype: object

In [8]:
import pandas as pd

def find_na_rows(df):
    """
    Find the indices of rows in a DataFrame where either column 'a' or column 'b' contains the string 'n/a'.

    Parameters:
    df (pd.DataFrame): The DataFrame to search.

    Returns:
    list: A list of row indices where 'n/a' is found in either column 'a' or column 'b'.
    """
    # Check for the string 'n/a' in columns 'a' and 'b'
    na_condition = (df['model_answer_us'].astype(str) == 'xxx') | (df['model_answer_uk'].astype(str) == 'xxx')

    # Get the indices where the condition is True
    return df.index[na_condition].tolist()


In [9]:
len(find_na_rows(df_GOQA))

75

In [10]:
df_goqa_no_nan = df_GOQA[(df_GOQA['model_answer_us'] != 'xxx')]
df_goqa_no_nan = df_goqa_no_nan[df_goqa_no_nan['model_answer_uk'] != 'xxx']

In [11]:
df_goqa_no_nan.model_answer_us

0      (D), 9
2      (B), 6
3      (B), 6
4      (B), 6
5      (A), 9
        ...  
819    (B), 6
820    (A), 9
822    (B), 6
823    (B), 6
824    (B), 6
Name: model_answer_us, Length: 750, dtype: object

In [12]:
import ast 
def string_to_dict(string_list):
    # Check if the value is a string and not NaN
    if isinstance(string_list, str) and string_list != 'NaN':
        try:
            # Safely evaluate the string to a list
            actual_list = ast.literal_eval(string_list)

            # Convert to dictionary
            letter_dict = {}
            for i, item in enumerate(actual_list):
                key = chr(65 + i)  # Convert index to letter
                letter_dict[key] = item

            return letter_dict
        except (ValueError, SyntaxError):
            # Handle cases where the string cannot be evaluated to a list
            print(f"Skipping malformed string: {string_list}")
            return None
    else:
        return None


# Apply the function to each row in the DataFrame
df_goqa_no_nan['options_dict'] = df_goqa_no_nan['options'].apply(string_to_dict)

In [13]:
def get_value_from_dict(row):
    if row['options_dict'] is not None and isinstance(row['model_answer_us'], str):
        key = row['model_answer_us'].split(',')[0].strip()[1]  # Extracting the letter between the parentheses
        return row['options_dict'].get(key)
    else:
        return None

# Apply the function to map values
df_goqa_no_nan['model_answer_us_option_match'] = df_goqa_no_nan.apply(get_value_from_dict, axis=1)

# Display the resulting DataFrame
print(df_goqa_no_nan[['model_answer_us_option_match']])

                          model_answer_us_option_match
0    Relying too much on military force to defeat t...
2                                        Other reasons
3                               Sometimes be justified
4                                       Wrong decision
5                                  Keep troops in Iraq
..                                                 ...
819  Let people come as long as there are jobs avai...
820                                     Very important
822                                                 No
823                                           Disagree
824                   Economy growth and creating jobs

[750 rows x 1 columns]


In [14]:
def get_value_from_dict(row):
    if row['options_dict'] is not None and isinstance(row['model_answer_uk'], str):
        key = row['model_answer_uk'].split(',')[0].strip()[1]  # Extracting the letter between the parentheses
        return row['options_dict'].get(key)
    else:
        return None

# Apply the function to map values
df_goqa_no_nan['model_answer_uk_option_match'] = df_goqa_no_nan.apply(get_value_from_dict, axis=1)

# Display the resulting DataFrame
print(df_goqa_no_nan[['model_answer_uk_option_match']])

                          model_answer_uk_option_match
0    Relying too much on military force to defeat t...
2                                        Other reasons
3                               Sometimes be justified
4                                       Wrong decision
5                                  Keep troops in Iraq
..                                                 ...
819  Let people come as long as there are jobs avai...
820                                     Very important
822                                                 No
823                                           Disagree
824                             Protecting environment

[750 rows x 1 columns]


In [15]:
def calculate_score(row, language):
    # Handle cases for different scales and check for None in model_answer
    if row['question type'] in ['Likert Scale', 'Numerical Scale', 'Ordinal Scale'] and row['#_options'] > 2:

        if language == "us": 
            ground_truth = row['answer_us']
            model_answer = row['model_answer_us_option_match']
        elif language == "uk": 
            ground_truth = row['answer_uk']
            model_answer = row['model_answer_uk_option_match']

        # Check if model_answer is None
        if model_answer is None:
            return 0  # Or handle it in another way you see fit

        options = row['options']

        # Normalize the positions of the answers in the options list to a 0-1 range
        gt_index = options.index(ground_truth) / (len(options) - 1)
        model_index = options.index(model_answer) / (len(options) - 1)

        # Calculate the absolute error
        error = abs(gt_index - model_index)

        # Score can be inversely related to the error (1 - error)
        score = 1 - error
        return score
    else: 
        if language == "us":
            return int(row['answer_us'] == row['model_answer_us_option_match'])
        elif language == "uk":
            return int(row['answer_uk'] == row['model_answer_uk_option_match'])



In [18]:
df_goqa_no_nan['score_us'] = df_goqa_no_nan.apply(calculate_score, axis=1, language = "us")
df_goqa_no_nan['score_uk'] = df_goqa_no_nan.apply(calculate_score, axis=1, language = "uk")

In [17]:
columns_to_convert = ['question type', 'answer_us', 'model_answer_us_option_match', 
                      'answer_uk', 'model_answer_uk_option_match', 'options']

# Convert each of these columns to string
for column in columns_to_convert:
    df_goqa_no_nan[column] = df_goqa_no_nan[column].astype(str)

In [19]:
df_goqa_no_nan['score_us'].mean()

0.7746564454415539

In [20]:
df_goqa_no_nan['score_uk'].mean()

0.7999327990240402

In [24]:
df_goqa_no_nan['score_ground_truth_answers'].mean()

0.8595730663910008

In [25]:
df_goqa_no_nan

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,question,selections,options,options_formatted,source,...,score_us,overall_score_uk,similarity_score_uk,similarity_score_us,overall_score_us,similarity_model_answers_uk_us,similarity_ground_truth_answers_uk_us,options_dict,score_ground_truth_answers,value_diff
0,0,0,0,0,0,Which statement comes closer to your own views...,"{'United States': [0.0, 0.0, 0.54, 0.0, 0.0, 0...",['Using overwhelming military force is the bes...,(A) Using overwhelming military force is the b...,GAS,...,0.907388,,,,,,,{'A': 'Using overwhelming military force is th...,0.549428,
2,2,2,2,2,2,Do you think this change in the working condit...,"{'United States': [0.23157894736842105, 0.7684...","['Largely more connected', 'Other reasons']",(A) Largely more connected\n(B) Other reasons,GAS,...,1.000000,,,,,,,"{'A': 'Largely more connected', 'B': 'Other re...",1.000000,
3,3,3,3,3,3,Do you think that using military force against...,"{'United States': [0.22916666666666666, 0.4583...","['Often be justified', 'Sometimes be justified...",(A) Often be justified\n(B) Sometimes be justi...,GAS,...,1.000000,,,,,,,"{'A': 'Often be justified', 'B': 'Sometimes be...",1.000000,
4,4,4,4,4,4,"On the subject of Iraq, did your country make ...","{'United States': [0.5625, 0.4375], 'Britain':...","['Right decision', 'Wrong decision']",(A) Right decision\n(B) Wrong decision,GAS,...,0.000000,,,,,,,"{'A': 'Right decision', 'B': 'Wrong decision'}",0.000000,
5,5,5,5,5,5,Do you think the U.S. should keep military tro...,"{'United States': [0.3978494623655914, 0.60215...","['Keep troops in Iraq', 'Remove its troops']",(A) Keep troops in Iraq\n(B) Remove its troops,GAS,...,0.000000,,,,,,,"{'A': 'Keep troops in Iraq', 'B': 'Remove its ...",1.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,819,819,819,819,819,How about people from other countries coming h...,"{'United States': [0.133, 0.37, 0.442000000000...","['Let anyone come who wants to', 'Let people c...",(A) Let anyone come who wants to\n(B) Let peop...,WVS,...,0.000000,,,,,,,"{'A': 'Let anyone come who wants to', 'B': 'Le...",0.000000,
820,820,820,820,820,820,"For each of the following aspects, indicate ho...","{'United States': [0.394, 0.40299999999999997,...","['Very important', 'Rather important', 'Not ve...",(A) Very important\n(B) Rather important\n(C) ...,WVS,...,0.884615,,,,,,,"{'A': 'Very important', 'B': 'Rather important...",0.884615,
822,822,822,822,822,822,In which of the following things do you believ...,"{'United States': [0.682, 0.284, 0.002, 0.032,...","['Yes', 'No', ""Don't know"", 'No answer', 'Othe...",(A) Yes\n(B) No\n(C) Don't know\n(D) No answer...,WVS,...,0.000000,,,,,,,"{'A': 'Yes', 'B': 'No', 'C': 'Don't know', 'D'...",0.000000,
823,823,823,823,823,823,"Do you agree, disagree or neither agree nor di...","{'United States': [0.452, 0.306, 0.239, 0.0, 0...","['Agree', 'Disagree', 'Neither agree nor disag...",(A) Agree\n(B) Disagree\n(C) Neither agree nor...,WVS,...,0.926829,,,,,,,"{'A': 'Agree', 'B': 'Disagree', 'C': 'Neither ...",0.926829,


In [26]:
df_goqa_no_nan.to_csv('data_merged_GOQA_llama_70B_results.csv', sep=';')

In [22]:
# Score for questions with answer options
def calculate_score_gt(row):
    if row['question type'] in ['Likert Scale', 'Numerical Scale', 'Ordinal Scale'] and row['#_options'] > 2:
 
        ground_truth = row['answer_us']
        model_answer = row['answer_uk']
        options = row['options']

        # Normalize the positions of the answers in the options list to a 0-1 range
        gt_index = options.index(ground_truth) / (len(options) - 1)
        model_index = options.index(model_answer) / (len(options) - 1)

        # Calculate the absolute error
        error = abs(gt_index - model_index)

        # Score can be inversely related to the error (1 - error)
        score = 1 - error
        return score
    else: 
        return int(row['answer_us'] == row['answer_uk'])


In [23]:
df_goqa_no_nan['score_ground_truth_answers'] = df_goqa_no_nan.apply(calculate_score_gt, axis=1)

In [30]:
df_goqa_no_nan['value_us'] = df_goqa_no_nan['value_us'].astype(float)
df_goqa_no_nan['value_uk'] = df_goqa_no_nan['value_uk'].astype(float)

ValueError: could not convert string to float: '0,540001'

In [None]:
import ast 
df_goqa_no_nan['selections'] = df_goqa_no_nan['selections'].apply(ast.literal_eval)

In [34]:
import pandas as pd
import ast  # Import the ast module

# ... (other parts of your script)

# Function to convert values in the list to float and get the max
def get_max_float_value(value_list):
    # Convert all items to float, ignoring non-numeric values
    converted_list = []
    for item in value_list:
        try:
            converted_list.append(float(item))
        except (ValueError, TypeError):
            continue  # Skip non-numeric items

    if converted_list:
        return max(converted_list)
    else:
        return 0.0  # Return 0.0 if the list is empty or conversion failed

# Function to extract max values and ensure they are floats
def extract_max_values(row):
    value_us = get_max_float_value(row['selections'].get('United States', []))
    value_uk = get_max_float_value(row['selections'].get('Britain', row['selections'].get('Great Britain', [])))
    return pd.Series([value_us, value_uk], index=['value_us', 'value_uk'])

# Apply the function to each row
df_goqa_no_nan[['value_us', 'value_uk']] = df_goqa_no_nan.apply(extract_max_values, axis=1)


In [36]:
def calculate_difference(row):
    return 1 - abs(row['value_us'] - row['value_uk'])

# Apply the function to each row and create a new column for the difference
df_goqa_no_nan['value_diff'] = df_goqa_no_nan.apply(calculate_difference, axis=1)

In [51]:
df_goqa_no_nan

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,question,selections,options,options_formatted,source,...,score_us,overall_score_uk,similarity_score_uk,similarity_score_us,overall_score_us,similarity_model_answers_uk_us,similarity_ground_truth_answers_uk_us,options_dict,score_ground_truth_answers,value_diff
0,0,0,0,0,0,Which statement comes closer to your own views...,"{'United States': [0.0, 0.0, 0.54, 0.0, 0.0, 0...",['Using overwhelming military force is the bes...,(A) Using overwhelming military force is the b...,GAS,...,0.907388,,,,,,,{'A': 'Using overwhelming military force is th...,0.549428,0.810254
2,2,2,2,2,2,Do you think this change in the working condit...,"{'United States': [0.23157894736842105, 0.7684...","['Largely more connected', 'Other reasons']",(A) Largely more connected\n(B) Other reasons,GAS,...,1.000000,,,,,,,"{'A': 'Largely more connected', 'B': 'Other re...",1.000000,0.876740
3,3,3,3,3,3,Do you think that using military force against...,"{'United States': [0.22916666666666666, 0.4583...","['Often be justified', 'Sometimes be justified...",(A) Often be justified\n(B) Sometimes be justi...,GAS,...,1.000000,,,,,,,"{'A': 'Often be justified', 'B': 'Sometimes be...",1.000000,0.948129
4,4,4,4,4,4,"On the subject of Iraq, did your country make ...","{'United States': [0.5625, 0.4375], 'Britain':...","['Right decision', 'Wrong decision']",(A) Right decision\n(B) Wrong decision,GAS,...,0.000000,,,,,,,"{'A': 'Right decision', 'B': 'Wrong decision'}",0.000000,0.986413
5,5,5,5,5,5,Do you think the U.S. should keep military tro...,"{'United States': [0.3978494623655914, 0.60215...","['Keep troops in Iraq', 'Remove its troops']",(A) Keep troops in Iraq\n(B) Remove its troops,GAS,...,0.000000,,,,,,,"{'A': 'Keep troops in Iraq', 'B': 'Remove its ...",1.000000,0.966031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,819,819,819,819,819,How about people from other countries coming h...,"{'United States': [0.133, 0.37, 0.442000000000...","['Let anyone come who wants to', 'Let people c...",(A) Let anyone come who wants to\n(B) Let peop...,WVS,...,0.000000,,,,,,,"{'A': 'Let anyone come who wants to', 'B': 'Le...",0.000000,0.868000
820,820,820,820,820,820,"For each of the following aspects, indicate ho...","{'United States': [0.394, 0.40299999999999997,...","['Very important', 'Rather important', 'Not ve...",(A) Very important\n(B) Rather important\n(C) ...,WVS,...,0.884615,,,,,,,"{'A': 'Very important', 'B': 'Rather important...",0.884615,0.994000
822,822,822,822,822,822,In which of the following things do you believ...,"{'United States': [0.682, 0.284, 0.002, 0.032,...","['Yes', 'No', ""Don't know"", 'No answer', 'Othe...",(A) Yes\n(B) No\n(C) Don't know\n(D) No answer...,WVS,...,0.000000,,,,,,,"{'A': 'Yes', 'B': 'No', 'C': 'Don't know', 'D'...",0.000000,0.843000
823,823,823,823,823,823,"Do you agree, disagree or neither agree nor di...","{'United States': [0.452, 0.306, 0.239, 0.0, 0...","['Agree', 'Disagree', 'Neither agree nor disag...",(A) Agree\n(B) Disagree\n(C) Neither agree nor...,WVS,...,0.926829,,,,,,,"{'A': 'Agree', 'B': 'Disagree', 'C': 'Neither ...",0.926829,0.999000


In [37]:
df_final = df_goqa_no_nan[df_goqa_no_nan['value_diff'] < 0.9]

In [54]:
len(df_goqa_no_nan)

750

In [52]:
df_goqa_no_nan.score_uk.mean()

0.7999327990240402

In [53]:
df_goqa_no_nan.score_us.mean()

0.7746564454415539

In [42]:
df_quiz = pd.read_csv('data_merged_quiz_llama_70B.csv', sep=';')

In [44]:
calculate_similarity(df_quiz, language='us')
calculate_similarity(df_quiz, language='uk')
calculate_similarity(df_quiz, language='ground_truths')

In [47]:
df_quiz.BERTScore_us_new.mean()

0.8914371092342636

In [48]:
df_quiz.BERTScore_uk_new.mean()

0.8913049570565085

In [49]:
df_quiz.BERTScore_ground_truths.mean()

0.8630289299395478

In [50]:
len(df_quiz)

103