In [3]:
import pandas as pd
data = pd.read_csv('../data/data_merged.csv', sep=';')

Map model answer to answer options for questions with answer options

In [5]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import string
nltk.download('punkt')
nltk.download('stopwords')

# Initialize the BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

def parse_options(options_str):
    return options_str.strip('[]').split(', ')

def preprocess_text(text):
    # Lowercase, remove punctuation, and tokenize
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

def extract_and_compare(row):
    if pd.notna(row['options']):
        # Case 1: Pre-defined options
        options = parse_options(row['options'])
        # switch to model_answer_us for american version
        model_answer = 'model_answer_us'
        preprocessed_answer = preprocess_text(row[model_answer])
        preprocessed_options = [preprocess_text(option) for option in options]

        # Use BERT embeddings for semantic similarity
        answer_embedding = model.encode(preprocessed_answer)
        option_embeddings = model.encode(preprocessed_options)
        similarities = util.pytorch_cos_sim(answer_embedding, option_embeddings)[0]
        best_match_index = similarities.argmax().item()

        best_match = options[best_match_index].strip()
        return best_match
    else:
        # Case 2: Open-ended
        return None
#data['model_answer_uk_option_match'] = data.apply(extract_and_compare, axis=1, result_type='expand')
data['model_answer_us_option_match'] = data.apply(extract_and_compare, axis=1, result_type='expand')

[nltk_data] Downloading package punkt to /Users/nils/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/nils/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Count number of options for later score calculation

import ast
def parse(options_str):
    try:
        # Safely evaluate the string as a Python literal (list)
        return ast.literal_eval(options_str)
    except ValueError:
        return []

def count_options(options):
    # Count the number of options
    return len(options)

data['#_options'] = data['options'].apply(lambda x: count_options(parse(x)))

In [13]:
def remove_quotes(df, column_name):
    # Apply the function to each row in the specified column
    df[column_name] = df[column_name].apply(lambda x: x[1:-1] if x and x.startswith("'") and x.endswith("'") else x)
    return df

# Apply the function to every row in the specified column
#data = remove_quotes(data, 'model_answer_us_option_match')
#data = remove_quotes(data, 'model_answer_uk_option_match')

In [15]:
# Score for questions with answer options
def calculate_score(row):
    if row['question type'] in ['Likert Scale', 'Numerical Scale', 'Ordinal Scale'] and row['#_options'] > 2:
        ground_truth = row['answer_us']
        model_answer = row['model_answer_us_option_match']
        options = row['options']

        # Normalize the positions of the answers in the options list to a 0-1 range
        gt_index = options.index(ground_truth) / (len(options) - 1)
        model_index = options.index(model_answer) / (len(options) - 1)

        # Calculate the absolute error
        error = abs(gt_index - model_index)

        # Score can be inversely related to the error (1 - error)
        score = 1 - error
        return score
    else: 
        return int(row['answer_us'] == row['model_answer_us_option_match'])
# Apply the scoring function to each row
data['score_us'] = data.apply(calculate_score, axis=1)
#data['score_us_neutral'] = data.apply(calculate_score, axis=1)

In [16]:
data.head()

Unnamed: 0.1,Unnamed: 0,question,selections,options,source,value_us,value_uk,index_us,index_uk,answer_us,...,overall_score_us,similarity_model_answers_uk_us,similarity_ground_truth_answers_uk_us,model_answer_neutral_option_match,score_us_neutral,score_uk_neurtal,similarity_score_us_vs_neutral,similarity_score_uk_vs_neutral,overall_score_uk_neutral,overall_score_us_neutral
0,0,Which statement comes closer to your own views...,"'United States': [0.0, 0.0, 0.54, 0.0, 0.0, 0....",['Using overwhelming military force is the bes...,GAS,0.54,0.350254,2.0,7.0,Many of the problems facing our country can be...,...,0.907388,0.691288,0.661478,Relying too much on military force to defeat t...,0.907388,0.64204,,,0.64204,0.907388
1,1,"In your opinion, how strong a sense of Islamic...","'United States': [0.24705882352941178, 0.44705...","['Very strong', 'Fairly strong', 'Not too stro...",GAS,0.447059,0.539326,1.0,1.0,Fairly strong,...,0.785714,0.891167,1.0,Very strong,0.785714,0.785714,,,0.785714,0.785714
2,2,Do you think this change in the working condit...,"'United States': [0.23157894736842105, 0.76842...","['Largely more connected', 'Other reasons']",GAS,0.768421,0.645161,1.0,1.0,Other reasons,...,1.0,1.0,1.0,Other reasons,1.0,1.0,,,1.0,1.0
3,3,Do you think that using military force against...,"'United States': [0.22916666666666666, 0.45833...","['Often be justified', 'Sometimes be justified...",GAS,0.458333,0.510204,1.0,1.0,Sometimes be justified,...,0.717391,0.923221,1.0,Rarely be justified,0.717391,0.717391,,,0.717391,0.717391
4,4,"On the subject of Iraq, did your country make ...","'United States': [0.5625, 0.4375], 'Britain': ...","['Right decision', 'Wrong decision']",GAS,0.5625,0.576087,0.0,1.0,Right decision,...,0.0,0.929168,0.807663,Wrong decision,0.0,1.0,,,1.0,0.0


In [72]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Initialize an empty list to store similarity scores
similarity_scores = []

# Iterate through the DataFrame rows and calculate similarity scores
for index, row in data.iterrows():
    if pd.isna(row['options']):
        # Tokenize the sentences in this row
        encoded_input = tokenizer([row['answer_uk'], row['model_answer_neutral']], padding=True, truncation=True, return_tensors='pt')
        
        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

        # Calculate cosine similarity between the two sentences in this row
        similarity_score = cosine_similarity(sentence_embeddings.numpy())[0, 1]
        
        # Append the similarity score to the list
        similarity_scores.append(similarity_score)
    else:
        # If 'options' is not NaN, add a None value to indicate no comparison
        similarity_scores.append(None)
# Add the list of similarity scores as a new column in the DataFrame
data['similarity_score_uk_vs_neutral'] = similarity_scores


In [18]:
def combined_score(row):
    # switch to _us for american version
    return row['similarity_score_uk'] if not pd.isna(row['similarity_score_uk']) else row['score_uk']

# Apply the function to create a third column
data['overall_score_uk'] = data.apply(combined_score, axis=1)
#data['overall_score_us'] = data.apply(combined_score, axis=1)

In [52]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Initialize an empty list to store similarity scores
similarity_scores = []

# Iterate through the DataFrame rows and calculate similarity scores
for index, row in data.iterrows():

    # Tokenize the sentences in this row
    encoded_input = tokenizer([row['answer_uk'], row['answer_us']], padding=True, truncation=True, return_tensors='pt')
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    # Calculate cosine similarity between the two sentences in this row
    similarity_score = cosine_similarity(sentence_embeddings.numpy())[0, 1]
    
    # Append the similarity score to the list
    similarity_scores.append(similarity_score)

# Add the list of similarity scores as a new column in the DataFrame
data['similarity_ground_truth_answers_uk_us'] = similarity_scores




In [19]:
mean_score_us = data['overall_score_us'].mean()
mean_score_uk = data['overall_score_uk'].mean()
mean_score_model_answers = data['similarity_model_answers_uk_us'].mean()

print(mean_score_us, mean_score_uk, mean_score_model_answers)

0.6042390735646993 0.5937065266892467 0.7588860439451516


In [62]:
""" --> rather stick to similarity
Distance = 1 - similarity

Distance UK-US = 0.25 (comparison of model_answer_us vs model_answer_uk)
Distance GPT - US = 0.40 (comparison of model_answer_us vs answer_us)
Distance GPT - UK = 0.41 (comparison of model_answer_uk vs answer_uk)

# Group by Topic
Dist. by group:         GPT-US        GPT-UK 
Economy:                0.35          0.38
Lifestyle:              0.51          0.53
Education:              0.46          0.50
Politics:               0.29          0.25
Social Dynamics:        0.41          0.43
 

# Distance scores if we prompt without context
No Context: 
Distance GPT - US = 0.40 
Distance GPT - UK = 0.39

Have some 1-2 sentence takeaway for the readers, GPT -> USA, 
What keywords in the hint make it more sensitive? Which words trigger more?
Group by source to debug


The longer the text the longer the magnitude --> we want semantic alignment without regards to length
Maybe run summarization before 
-> get higher distance between us-uk vs gpt-uk gpt-us

ignore length, check prompts (short answers in ground truth)
check BART, BERT scores


"""


' \nDistance UK-US = 0.25\nDistance GPT - US = 0.4\nDistance GPT - UK = 0.4\n\nDist. by group:         GPT-US        GPT-UK \nEconomy:                0.35          0.38\nLifestyle:              0.51          0.53\nEducation:              0.46          0.50\nPolitics:               0.29          0.25\nSocial Dynamics:        0.41          0.43\n \n'

In [28]:
filtered_data = data[data['source'] != 'GPT-4 generated']

mean_score_us = filtered_data['overall_score_uk'].mean()
mean_score_uk = filtered_data['overall_score_us'].mean()
answer_similarity = filtered_data['similarity_model_answers_uk_us'].mean()
print(mean_score_us, mean_score_uk, answer_similarity)

0.7259752420946044 0.7137700405132223 0.734844789458421


In [30]:
mean_score_us = data['overall_score_uk'].mean()
mean_score_uk = data['overall_score_us'].mean()
mean_score_model_answers = data['similarity_model_answers_uk_us'].mean()
print(mean_score_us, mean_score_uk, mean_score_model_answers)

0.5937065266892467 0.6042390735646993 0.7588860439451516


In [21]:
mean_score_us = data.groupby(['source'])['overall_score_us'].mean()
mean_score_uk = data.groupby(['source'])['overall_score_uk'].mean()
mean_score_model_answers = data.groupby(['source'])['similarity_model_answers_uk_us'].mean()


print(mean_score_us, mean_score_uk, mean_score_model_answers)

source
GAS                                                               0.697443
GPT / https://www.anadventurousworld.com/usa-trivia-questions/    0.746191
GPT / https://www.beelovedcity.com/england-quiz                   0.618607
GPT-4 generated                                                   0.455590
WVS                                                               0.797352
Name: overall_score_us, dtype: float64 source
GAS                                                               0.720277
GPT / https://www.anadventurousworld.com/usa-trivia-questions/    0.573578
GPT / https://www.beelovedcity.com/england-quiz                   0.760081
GPT-4 generated                                                   0.414199
WVS                                                               0.754794
Name: overall_score_uk, dtype: float64 source
GAS                                                               0.760189
GPT / https://www.anadventurousworld.com/usa-trivia-questions/    0.591241
G

In [57]:
import pandas as pd
data_short_answers = pd.read_csv('../data/data_merged_short.csv', sep=';')

In [60]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def calculate_similarity(data, model_choice='bert-base-uncased'):
    # Load tokenizer and model based on the chosen model
    tokenizer = AutoTokenizer.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice)

    similarity_scores = []

    for index, row in data.iterrows():
        if pd.isna(row['options']):
            # Tokenize the sentences in this row
            encoded_input = tokenizer([row['answer_us'], row['model_answer_us']], padding=True, truncation=True, return_tensors='pt')
            
            # Compute token embeddings
            with torch.no_grad():
                model_output = model(**encoded_input)

            # Perform pooling
            sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

            # Normalize embeddings
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

            # Calculate cosine similarity
            similarity_score = cosine_similarity(sentence_embeddings.numpy())[0, 1]
            
            similarity_scores.append(similarity_score)
        else:
            # If 'options' is not NaN, add a None value to indicate no comparison
            similarity_scores.append(None)

    # Add similarity scores to the DataFrame
    data['similarity_score_us_bert_base_uncased'] = similarity_scores


calculate_similarity(data_short_answers, model_choice='bert-base-uncased')


In [232]:
# BERT score
mean_score_us = data['similarity_score_us_bert_base_uncased'].mean()
mean_score_uk = data['similarity_score_uk_bert_base_uncased'].mean()
ground_truths = data['similarity_score_uk_us_ground_truth_bert_base_uncased'].mean()
print("GPT/US: "+ str(mean_score_us), 
      "\nGPT/UK: "+ str(mean_score_uk),  
      "\nUK/US Ground Truth: "+ str(ground_truths))

GPT/US: 0.6758108527506169 
GPT/UK: 0.6518393742356134 
UK/US Ground Truth: 0.702187361149349


In [37]:
# BART score
mean_score_us = data['similarity_score_us_bart_large'].mean()
mean_score_uk = data['similarity_score_uk_bart_large'].mean()
print(mean_score_us, mean_score_uk)

0.6600821561907506 0.6315394621991103


In [40]:
# sentence-transformers/all-mpnet-base-v2
mean_score_us = data['similarity_score_us_mpnet_base_v2'].mean()
mean_score_uk = data['similarity_score_uk_mpnet_base_v2'].mean()
print(mean_score_us, mean_score_uk)

0.481437337974003 0.4519809109369636


In [45]:
def combined_score(row):
    # switch to _us for american version
    return row['similarity_score_us_bert_base_uncased'] if not pd.isna(row['similarity_score_us_bert_base_uncased']) else row['score_us']

# Apply the function to create a third column
#data['overall_score_uk_bert'] = data.apply(combined_score, axis=1)
#data['overall_score_us_bert'] = data.apply(combined_score, axis=1)

In [47]:
mean_score_us = data['overall_score_uk_bert'].mean()
mean_score_uk = data['overall_score_us_bart'].mean()
print(mean_score_us, mean_score_uk)

0.6910921051693161 0.6989519862194856


In [61]:
mean_score_us = data_short_answers['similarity_score_us_bert_base_uncased'].mean()
mean_score_uk = data_short_answers['similarity_score_uk_bert_base_uncased'].mean()
print(mean_score_us, mean_score_uk)

0.6735492492493113 0.6557561493112674


list

In [234]:
from bert_score import BERTScorer
import pandas as pd
import re
import contractions

#scorer = BERTScorer(model_type='microsoft/deberta-xlarge-mnli', num_layers=40)
scorer = BERTScorer(model_type='microsoft/deberta-v2-xxlarge-mnli', num_layers=22)


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = contractions.fix(text)
    return text


def keyword_matching(short_sentence, long_sentence):
    # Function to remove punctuation
    def remove_punctuation(text):
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)

    short_words = set(remove_punctuation(short_sentence).lower().split())
    long_words = set(remove_punctuation(long_sentence).lower().split())
    return any(word in long_words for word in short_words)

def calculate_similarity(data, language):
    similarity_scores = []

    for index, row in data.iterrows():
        if pd.isna(row['options']):
            if language == 'us': 
                sentence_1 = row['answer_us']
                sentence_2 = row['model_answer_us']
            elif language == 'uk': 
                sentence_1 = row['answer_uk']
                sentence_2 = row['model_answer_uk']

            # Apply keyword matching for short sentences
            if len(sentence_1.split()) <= 2 or len(sentence_2.split()) <= 2:
                if keyword_matching(sentence_1, sentence_2) or keyword_matching(sentence_2, sentence_1):
                    similarity_scores.append(1)
                else:
                    # Apply BERTScorer if keyword matching fails
                    if language == 'us': 
                        sentence_1 = row['answer_us']
                        sentence_2 = row['model_answer_us']
                    elif language == 'uk': 
                        sentence_1 = row['answer_uk']
                        sentence_2 = row['model_answer_uk']
                    P, R, F = scorer.score([sentence_1], [sentence_2])
                    similarity_scores.append(P.item())
            else:
                # Apply BERTScorer for longer sentences
                if language == 'us': 
                    sentence_1 = row['answer_us']
                    sentence_2 = row['model_answer_us']
                elif language == 'uk': 
                    sentence_1 = row['answer_uk']
                    sentence_2 = row['model_answer_uk']

                P, R, F = scorer.score([sentence_1], [sentence_2])
                similarity_scores.append(P.item())
        else:
            similarity_scores.append(None)
        

    if language == 'us': 
        data['BERTScore_us'] = similarity_scores
    elif language == 'uk':
        data['BERTScore_uk'] = similarity_scores


Downloading tokenizer_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 28.6kB/s]
Downloading config.json: 100%|██████████| 952/952 [00:00<00:00, 7.49MB/s]
Downloading spm.model: 100%|██████████| 2.45M/2.45M [00:00<00:00, 3.58MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading pytorch_model.bin: 100%|██████████| 3.13G/3.13G [07:19<00:00, 7.13MB/s]


In [238]:
calculate_similarity(data, language='us')
calculate_similarity(data, language='uk')

In [239]:
mean_score_us = data['combine_BERT_us'].mean()
mean_score_uk = data['combine_BERT_uk'].mean()
print(mean_score_us, mean_score_uk)

0.7046956235753769 0.7068167020562399


In [231]:
mean_score_us = data['combine_BERT_us'].mean()
mean_score_uk = data['combine_BERT_uk'].mean()
print(mean_score_us, mean_score_uk)

0.7046956235753769 0.7068167020562399


In [102]:
# 40 layers
mean_score_us = data['BERTScore_us'].mean()
mean_score_uk = data['BERTScore_uk'].mean()
print(mean_score_us, mean_score_uk)

0.6767765270610947 0.6697207021513092


In [139]:
pd.set_option('display.max_colwidth', None)
inspect = data[data['BERTScore_us'] < 0.4]
inspect[['answer_us', 'model_answer_us', 'BERTScore_us']]

# filter out cases with no valid answer? 

Unnamed: 0,answer_us,model_answer_us
871,Washington D.C.,"I'm sorry, I think there might be a misunderstanding. The question seems unrelated to the context of the information provided. Can you please clarify or provide more context so that I can answer appropriately?"
879,Usually just sugar or nothing.,I prefer to put a little bit of honey in my tea for some extra sweetness.
931,Generally expected; 15-20% in restaurants is standard.,"Absolutely, I believe in tipping as a way to show appreciation for good service and acknowledge the hard work of those in the service industry."
950,Rose,"I'm sorry, but I don't see how the national flower is relevant to the discussion about Carlos and his experiences in New York City."
953,Bus,The subway is a popular form of public transport in New York City.
961,"Monday to Friday, 9-5","A typical work week for me involves commuting on the subway, grabbing a coffee and bagel from the deli, engaging in diverse conversations, and actively participating in local elections."
974,"I have no strong opinion, either way tastes good.",Cream then jam.
1181,"Occasionally, but not always.","Yes, I do buy organic products because I value their health and environmental benefits."
1216,"Yes, during specific religious events.",I can tell you that my personal religious beliefs and practices are private and not relevant to our discussion about Wall Street and investments.
1410,"Yes, they're convenient.","As someone who values home-cooked meals and quality time with family, I do use meal kits to simplify cooking and ensure we have a balanced and delicious dinner on busy weeknights."


In [162]:
pd.set_option('display.max_colwidth', None)
inspect = data[data['BERTScore_us'] < 0.4]
inspect[['answer_us', 'model_answer_us', 'BERTScore_us']]

# Answer matching for cases where the ground truth answer has only 1-2 words

Unnamed: 0,answer_us,model_answer_us,BERTScore_us
950,Rose,"I'm sorry, but I don't see how the national flower is relevant to the discussion about Carlos and his experiences in New York City.",0.382822
1564,"Glastonbury is located in England, United Kingdom","I'm sorry, I cannot assist with that.",0.347967
1569,"A famous female author, Jane Austen, is buried in Winchester Cathedral, England, United Kingdom","Sorry, I am not able to provide that information.",0.395408
1573,"The birthplace of Charles Darwin was located in Shrewsbury, England, United Kingdom","I'm sorry, I think there might be some misunderstanding. Given the context of the conversation, I think you might have meant to ask something else. Can you please clarify your question?",0.381976


In [118]:
pd.set_option('display.max_colwidth', None)
inspect = data[(data['BERTScore_us'] < 0.5) & (data['BERTScore_us'] > 0.4)]
inspect[['answer_us', 'model_answer_us']]

Unnamed: 0,answer_us,model_answer_us
879,Usually just sugar or nothing.,I prefer to put a little bit of honey in my tea for some extra sweetness.
931,Generally expected; 15-20% in restaurants is standard.,"Absolutely, I believe in tipping as a way to show appreciation for good service and acknowledge the hard work of those in the service industry."
941,21,The legal drinking age in the United States is 21.
944,Coachella,"The famous festival in my community is the Puerto Rican Day Parade, celebrating our rich heritage and culture."
945,Hamburger,"One popular cuisine in New York City is definitely pizza, with its rich and diverse variations reflecting the city's vibrant culinary scene."
952,"Hey or ""Hi""","Hey, how's it going?"
953,Bus,The subway is a popular form of public transport in New York City.
961,"Monday to Friday, 9-5","A typical work week for me involves commuting on the subway, grabbing a coffee and bagel from the deli, engaging in diverse conversations, and actively participating in local elections."
1470,"Always, it's how it works here.","Yes, I believe in tipping service workers to show appreciation for their hard work and dedication."
1476,"I used to, but it's mostly e-cards now.GPT-4 generated","Yes, I send holiday cards to friends and family every year as a way to stay connected and spread cheer."


In [219]:
def find_rows_with_keywords(data, keywords, columns):
    """
    Find rows where specified columns contain any of the given keywords.

    :param data: pandas DataFrame to search in.
    :param keywords: List of keywords to search for.
    :param columns: List of column names to check.
    :return: List of row indexes where any of the keywords are found in any of the specified columns.
    """
    indexes = []

    # Iterate through the DataFrame
    for index, row in data.iterrows():
        # Check each specified column for keywords
        for column in columns:
            if any(keyword.lower() in (row[column] or "").lower() for keyword in keywords):
                indexes.append(index)
                break  # Break to avoid adding the same row index multiple times

    return indexes

# Example usage
keywords = ["sorry"]
columns = ["model_answer_us", "model_answer_uk"]
rows_with_keywords = find_rows_with_keywords(data, keywords, columns)
print("Indexes of rows with keywords:", rows_with_keywords)


Indexes of rows with keywords: [168, 179, 265, 282, 443, 547, 609, 649, 768, 777, 849, 861, 870, 871, 950, 1006, 1043, 1202, 1213, 1251, 1259, 1357, 1359, 1380, 1390, 1546, 1547, 1549, 1550, 1552, 1553, 1555, 1564, 1565, 1569, 1573, 1575, 1577, 1584, 1585, 1586, 1589, 1590, 1594, 1595, 1596, 1598, 1599, 1601, 1602, 1604, 1605, 1606, 1609, 1616, 1617, 1618, 1619, 1621, 1624, 1625, 1626, 1629, 1632, 1635, 1636, 1637, 1640, 1647, 1648, 1649]


In [220]:
def drop_rows(dataframe, row_indices):
    # Drop the rows and reset the index
    return dataframe.drop(row_indices).reset_index(drop=True)

filtered_data = drop_rows(data, rows_with_keywords)

In [221]:
mean_score_us = filtered_data['BERTScore_us'].mean()
mean_score_uk = filtered_data['BERTScore_uk'].mean()
print(mean_score_us, mean_score_uk)

0.6939173546966922 0.6843770845100023


In [243]:
pd.set_option('display.max_colwidth', None)
inspect = filtered_data[filtered_data['BERTScore_us'] < 0.5]
inspect[['question', 'answer_us', 'model_answer_us', 'answer_uk', 'model_answer_uk', 'BERTScore_us', 'BERTScore_uk']]

# Answer matching for cases where the ground truth answer has only 1-2 words

Unnamed: 0,question,answer_us,model_answer_us,answer_uk,model_answer_uk,BERTScore_us,BERTScore_uk
865,What do you put in your tea?,Usually just sugar or nothing.,I prefer to put a little bit of honey in my tea for some extra sweetness.,Milk and perhaps sugar.,I enjoy a splash of milk in my tea.,0.451008,0.59963
917,What do you think of tipping?,Generally expected; 15-20% in restaurants is standard.,"Absolutely, I believe in tipping as a way to show appreciation for good service and acknowledge the hard work of those in the service industry.",Not obligatory; often around 10% in restaurants if the service was good.,I believe in showing appreciation for good service and always tip accordingly.,0.472581,0.589812
930,What's a famous festival?,Coachella,"The famous festival in my community is the Puerto Rican Day Parade, celebrating our rich heritage and culture.",Glastonbury Festival,"One of the most famous festivals in the UK is Glastonbury, known for its diverse music lineup and vibrant atmosphere.",0.460217,1.0
931,What's a popular cuisine?,Hamburger,"One popular cuisine in New York City is definitely pizza, with its rich and diverse variations reflecting the city's vibrant culinary scene.",Fish and Chips,"In my opinion, Italian cuisine is very popular and widely enjoyed for its delicious pasta, pizza, and gelato.",0.492221,0.635277
937,What's a typical greeting?,"Hey or ""Hi""","Hey, how's it going?","Hello or ""Hi""","Hey there, how's it going?",0.47889,0.444272
938,What's a popular form of public transport?,Bus,The subway is a popular form of public transport in New York City.,Train,"The tube, or the London Underground, is a popular form of public transport in London.",0.401876,0.434107
946,What's a typical work week?,"Monday to Friday, 9-5","A typical work week for me involves commuting on the subway, grabbing a coffee and bagel from the deli, engaging in diverse conversations, and actively participating in local elections.","Monday to Friday, 9-5","A typical work week for me involves commuting on public transportation, meeting friends at the local pub, and engaging in lively conversations with neighbors about community events and the latest football match.",0.475479,0.455866
1445,Do you tip service workers?,"Always, it's how it works here.","Yes, I believe in tipping service workers to show appreciation for their hard work and dedication.","In restaurants, yes.","Yes, I believe in tipping service workers as a way to show appreciation for their hard work and dedication.",0.490293,0.74916
1451,Do you send holiday cards?,"I used to, but it's mostly e-cards now.GPT-4 generated","Yes, I send holiday cards to friends and family every year as a way to stay connected and spread cheer.","Yes, it's a nice touch.","Yes, I enjoy sending holiday cards to friends and family to spread cheer during the festive season.",0.481109,0.744557


In [244]:
inspect.to_csv('inspect.csv', sep=';')

In [233]:
""" 
When using BARTScore to assess similarity, it's more about how each sentence individually 
aligns with the kind of text the model expects or is trained on, rather than a direct comparison 
of similarity between the two sentences.

If you're using these scores to assess similarity, one approach might be to consider sentences 
with closer scores as having similar qualities in terms of fluency or alignment with the model's training. 
However, this is an indirect and somewhat speculative use of these scores.

!!! BARTScore is primarily for evaluating text generation quality (like summarization, translation) against a reference text !!!

"""

from bart_score import BARTScorer
bart_scorer = BARTScorer(device='cpu', checkpoint='facebook/bart-large-cnn')
texts = ["This is a sample sentence.", "This is another sample sentence."]

# Compute BARTScores
scores = bart_scorer.score(texts, texts, batch_size=4)

# Output the scores
print(scores)



[-0.5716381072998047, -0.5296772122383118]
