In [65]:
import pandas as pd
data = pd.read_csv('../data/data_merged.csv', sep=';')

Map model answer to answer options for questions with answer options

In [66]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import string
nltk.download('punkt')
nltk.download('stopwords')

# Initialize the BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

def parse_options(options_str):
    return options_str.strip('[]').split(', ')

def preprocess_text(text):
    # Lowercase, remove punctuation, and tokenize
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

def extract_and_compare(row):
    if pd.notna(row['options']):
        # Case 1: Pre-defined options
        options = parse_options(row['options'])
        # switch to model_answer_us for american version
        model_answer = 'model_answer_neutral'
        preprocessed_answer = preprocess_text(row[model_answer])
        preprocessed_options = [preprocess_text(option) for option in options]

        # Use BERT embeddings for semantic similarity
        answer_embedding = model.encode(preprocessed_answer)
        option_embeddings = model.encode(preprocessed_options)
        similarities = util.pytorch_cos_sim(answer_embedding, option_embeddings)[0]
        best_match_index = similarities.argmax().item()

        best_match = options[best_match_index].strip()
        return best_match
    else:
        # Case 2: Open-ended
        return None
#data['model_answer_neutral_option_match'] = data.apply(extract_and_compare, axis=1, result_type='expand')
#data['model_answer_us_option_match'] = data.apply(extract_and_compare, axis=1, result_type='expand')

[nltk_data] Downloading package punkt to /Users/nils/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/nils/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Count number of options for later score calculation

import ast
def parse(options_str):
    try:
        # Safely evaluate the string as a Python literal (list)
        return ast.literal_eval(options_str)
    except ValueError:
        return []

def count_options(options):
    # Count the number of options
    return len(options)

data['#_options'] = data['options'].apply(lambda x: count_options(parse(x)))

In [67]:
def strip_first_last_char(s):
    if s is None:
        return None
    return s[1:-1] if len(s) > 1 else ''

# Apply the function to every row in the specified column
data['model_answer_neutral_option_match'] = data['model_answer_neutral_option_match'].apply(strip_first_last_char)
#data['model_answer_us_option_match'] = data['model_answer_us_option_match'].apply(strip_first_last_char)

In [70]:
# Score for questions with answer options
def calculate_score(row):
    if row['question type'] in ['Likert Scale', 'Numerical Scale', 'Ordinal Scale'] and row['#_options'] > 2:
        ground_truth = row['answer_uk']
        model_answer = row['model_answer_neutral_option_match']
        options = row['options']

        # Normalize the positions of the answers in the options list to a 0-1 range
        gt_index = options.index(ground_truth) / (len(options) - 1)
        model_index = options.index(model_answer) / (len(options) - 1)

        # Calculate the absolute error
        error = abs(gt_index - model_index)

        # Score can be inversely related to the error (1 - error)
        score = 1 - error
        return score
    else: 
        return int(row['answer_uk'] == row['model_answer_neutral_option_match'])
# Apply the scoring function to each row
data['score_uk_neurtal'] = data.apply(calculate_score, axis=1)
#data['score_us_neutral'] = data.apply(calculate_score, axis=1)

In [12]:
data.head()

Unnamed: 0.1,Unnamed: 0,question,selections,options,source,value_us,value_uk,index_us,index_uk,answer_us,...,# of options,question type,category_group,model_answer_us,model_answer_uk,model_answer_uk_option_match,model_answer_us_option_match,#_options,score_uk,score_us
0,0,Which statement comes closer to your own views...,"'United States': [0.0, 0.0, 0.54, 0.0, 0.0, 0....",['Using overwhelming military force is the bes...,GAS,0.54,0.350254,2.0,7.0,Many of the problems facing our country can be...,...,9.0,Likert Scale,Politics and Governance,I believe that relying too much on military fo...,As a person who values cooperation and believe...,Relying too much on military force to defeat t...,Relying too much on military force to defeat t...,9,0.64204,0.907388
1,1,"In your opinion, how strong a sense of Islamic...","'United States': [0.24705882352941178, 0.44705...","['Very strong', 'Fairly strong', 'Not too stro...",GAS,0.447059,0.539326,1.0,1.0,Fairly strong,...,4.0,Likert Scale,"Economy, Health, and Environment",I believe that the sense of Islamic identity a...,I believe that many Muslims in our country hav...,Very strong,Very strong,4,0.785714,0.785714
2,2,Do you think this change in the working condit...,"'United States': [0.23157894736842105, 0.76842...","['Largely more connected', 'Other reasons']",GAS,0.768421,0.645161,1.0,1.0,Other reasons,...,2.0,Likert Scale,"Economy, Health, and Environment",Other reasons.,Other reasons.,Other reasons,Other reasons,2,1.0,1.0
3,3,Do you think that using military force against...,"'United States': [0.22916666666666666, 0.45833...","['Often be justified', 'Sometimes be justified...",GAS,0.458333,0.510204,1.0,1.0,Sometimes be justified,...,4.0,Likert Scale,Politics and Governance,I believe that using military force against co...,I believe that military force against countrie...,Rarely be justified,Rarely be justified,4,0.717391,0.717391
4,4,"On the subject of Iraq, did your country make ...","'United States': [0.5625, 0.4375], 'Britain': ...","['Right decision', 'Wrong decision']",GAS,0.5625,0.576087,0.0,1.0,Right decision,...,2.0,Binary Choice,Politics and Governance,"As a real person with my own opinions, I belie...",As a real person with my own opinions and valu...,Wrong decision,Wrong decision,2,1.0,0.0


In [72]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Initialize an empty list to store similarity scores
similarity_scores = []

# Iterate through the DataFrame rows and calculate similarity scores
for index, row in data.iterrows():
    if pd.isna(row['options']):
        # Tokenize the sentences in this row
        encoded_input = tokenizer([row['answer_uk'], row['model_answer_neutral']], padding=True, truncation=True, return_tensors='pt')
        
        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

        # Calculate cosine similarity between the two sentences in this row
        similarity_score = cosine_similarity(sentence_embeddings.numpy())[0, 1]
        
        # Append the similarity score to the list
        similarity_scores.append(similarity_score)
    else:
        # If 'options' is not NaN, add a None value to indicate no comparison
        similarity_scores.append(None)
# Add the list of similarity scores as a new column in the DataFrame
data['similarity_score_uk_vs_neutral'] = similarity_scores


In [75]:
def combined_score(row):
    # switch to _us for american version
    return row['similarity_score_us_vs_neutral'] if not pd.isna(row['similarity_score_us_vs_neutral']) else row['score_us_neutral']

# Apply the function to create a third column
#data['overall_score_uk_neutral'] = data.apply(combined_score, axis=1)
data['overall_score_us_neutral'] = data.apply(combined_score, axis=1)

In [52]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Initialize an empty list to store similarity scores
similarity_scores = []

# Iterate through the DataFrame rows and calculate similarity scores
for index, row in data.iterrows():

    # Tokenize the sentences in this row
    encoded_input = tokenizer([row['answer_uk'], row['answer_us']], padding=True, truncation=True, return_tensors='pt')
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    # Calculate cosine similarity between the two sentences in this row
    similarity_score = cosine_similarity(sentence_embeddings.numpy())[0, 1]
    
    # Append the similarity score to the list
    similarity_scores.append(similarity_score)

# Add the list of similarity scores as a new column in the DataFrame
data['similarity_ground_truth_answers_uk_us'] = similarity_scores




In [58]:
mean_score_us = data['overall_score_us'].mean()
mean_score_uk = data['overall_score_uk'].mean()
mean_score_model_answers = data['similarity_model_answers_uk_us'].mean()

print(mean_score_us, mean_score_uk, mean_score_model_answers)

0.6047914521413038 0.5933085949019319 0.758886


In [62]:
""" 
Distance = 1 - similarity

Distance UK-US = 0.25 (comparison of model_answer_us vs model_answer_uk)
Distance GPT - US = 0.40 (comparison of model_answer_us vs answer_us)
Distance GPT - UK = 0.41 (comparison of model_answer_uk vs answer_uk)

# Group by Topic
Dist. by group:         GPT-US        GPT-UK 
Economy:                0.35          0.38
Lifestyle:              0.51          0.53
Education:              0.46          0.50
Politics:               0.29          0.25
Social Dynamics:        0.41          0.43
 

# Distance scores if we prompt without context
No Context: 
Distance GPT - US = 0.40 
Distance GPT - UK = 0.39
"""

' \nDistance UK-US = 0.25\nDistance GPT - US = 0.4\nDistance GPT - UK = 0.4\n\nDist. by group:         GPT-US        GPT-UK \nEconomy:                0.35          0.38\nLifestyle:              0.51          0.53\nEducation:              0.46          0.50\nPolitics:               0.29          0.25\nSocial Dynamics:        0.41          0.43\n \n'

In [77]:
data.to_csv('../data/data_merged.csv', sep=';')

In [76]:
mean_score_us = data['overall_score_uk_neutral'].mean()
mean_score_uk = data['overall_score_us_neutral'].mean()
print(mean_score_us, mean_score_uk)

0.6005834146855726 0.6185914563774071
