In [None]:
################################################################
#### completion model behavioral experiment
#### NOTE: this behavioral experiment was not implemented in the CogSci 2024 conference submission.
################################################################

from transformers import pipeline

# Load the sentence completion model.
# It is bound to a dedicated name because a later cell rebinds the generic
# global `model` to a BERT encoder; make_api_call_for_model() must keep using
# the GPT-2 pipeline even after that cell has run.
completion_model = pipeline('text-generation', model='gpt2')
model = completion_model  # backward-compatible alias for code that still reads `model`

def make_api_call_for_model(phrase):
    """
    Extend a phrase by a single generated token using the completion pipeline.

    Args:
    phrase (str): The prompt to complete.

    Returns:
    str: The 'generated_text' field of the first returned sequence.
    """
    # Request exactly one new token. The generation length limit is counted in
    # tokens (prompt included), so deriving it from a whitespace word count is
    # wrong whenever a word is split into several subword tokens.
    return completion_model(phrase, max_new_tokens=1, num_return_sequences=1)[0]['generated_text']

def get_probability_distribution(model_response):
    """
    Turn a completed phrase into a one-entry word distribution.

    The last whitespace-separated word of the response is treated as the
    predicted word and given probability 1.0.

    Args:
    model_response (str): The text returned by the completion model.

    Returns:
    list: A list holding one (word, 1.0) tuple, or an empty list when the
    response contains no words.
    """
    words = model_response.split()
    # An empty or whitespace-only response has no last word to report.
    if not words:
        return []
    return [(words[-1], 1.0)]

def sum_aggregated_probabilities(adjectives_list, apply_weighting=True):
    """
    Add up the probabilities of a list of matched adjectives.

    Args:
    adjectives_list (list): Entries indexed as entry[1] = probability and,
        when weighting is applied, entry[2] = score.
    apply_weighting (bool): If True, each probability is multiplied by
        abs(4 - score) ** 2 before summing; if False, the probabilities are
        summed as they are.

    Returns:
    The (optionally weighted) sum; 0 for an empty list.
    """
    # NOTE(review): 4 looks like the midpoint of the rating scale - confirm.
    if not apply_weighting:
        return sum(entry[1] for entry in adjectives_list)
    return sum((abs(4 - entry[2]) ** 2) * entry[1] for entry in adjectives_list)

def complete_phrase(phrase, masculine_adjectives, feminine_adjectives, apply_weighting=True):
    """
    Complete a phrase with the model and score the prediction against two adjective lists.

    Args:
    phrase (str): The prompt passed to make_api_call_for_model.
    masculine_adjectives (list): Entries indexed as entry[0] = adjective, entry[1] = score.
    feminine_adjectives (list): Same layout as masculine_adjectives.
    apply_weighting (bool): Forwarded to sum_aggregated_probabilities.

    Returns:
    dict: 'masculine_aggregated_probabilities', 'feminine_aggregated_probabilities'
    and 'aggregated_probabilities_difference' (masculine minus feminine).
    """
    predictions = get_probability_distribution(make_api_call_for_model(phrase))
    lexicons = (('masculine', masculine_adjectives), ('feminine', feminine_adjectives))

    # For each gender, keep (adjective, predicted probability, score) for every
    # predicted word that exactly equals an adjective in that gender's list.
    matches = {}
    for gender, lexicon in lexicons:
        matches[gender] = [
            (entry[0], prediction[1], entry[1])
            for prediction in predictions
            for entry in lexicon
            if entry[0] == prediction[0]
        ]

    data = {
        f'{gender}_aggregated_probabilities': sum_aggregated_probabilities(matches[gender], apply_weighting)
        for gender in ('masculine', 'feminine')
    }
    data['aggregated_probabilities_difference'] = (
        data['masculine_aggregated_probabilities'] - data['feminine_aggregated_probabilities']
    )
    return data

# Example usage: one-entry adjective lists of (adjective, score) pairs;
# the score is what the weighted aggregation reads.
masculine_adjectives = [('strong', 5)]
feminine_adjectives = [('beautiful', 5)]

# Aggregate with the score-based weighting switched on
data_weighted = complete_phrase('The bridge is', masculine_adjectives, feminine_adjectives, apply_weighting=True)
print(data_weighted)

# Aggregate the raw probabilities, without weighting
data_unweighted = complete_phrase('The bridge is', masculine_adjectives, feminine_adjectives, apply_weighting=False)
print(data_unweighted)

In [None]:
################################################################
#### contextual word embeddings
################################################################

import sys
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load the multilingual BERT tokenizer and encoder that the embedding functions below read
# as module-level globals.
# NOTE(review): this rebinds the module-level name `model`, which the completion-model cell
# above also assigns (the GPT-2 pipeline) - confirm the two cells are not meant to be interleaved.
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased")
print("Tokenizer and model loaded successfully.")

def cosine_similarity(v1, v2):
    """
    Return the cosine of the angle between two arrays.

    Both inputs are collapsed to one dimension first, so they only need to
    hold the same number of elements, not share a shape.
    """
    a = v1.reshape(-1)
    b = v2.reshape(-1)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

def get_word_embedding(sentence, word, layer='last', subword_aggregation='last'):
    """
    Generate a contextual embedding for a specific word within a sentence,
    with options to use the embedding from the first or last layer of the BERT model,
    and to use the embedding of the first token, last token, or an aggregate of all the word's tokens.

    The word is located by matching its full subword sequence against the tokens of the
    encoded sentence (special tokens included), so the positions found line up with the
    rows of the model output. The first occurrence is used.

    Args:
    sentence (str): The sentence from which to extract the embedding.
    word (str): The word to extract the embedding for.
    layer (str): The layer of the BERT model to extract the embedding from - "first" or "last".
        "first" reads outputs.hidden_states[0].
        NOTE(review): in Hugging Face models that entry is documented as the embedding-layer
        output rather than the first transformer block - confirm this is the intended layer.
    subword_aggregation (str): The method of aggregating subword token embeddings - "first", "last", or "mean".

    Returns:
    numpy.ndarray: The requested word embedding.

    Raises:
    ValueError: If layer or subword_aggregation is invalid, if the word produces no tokens,
        or if the word's tokens do not occur in the sentence.
    """
    # Validate the options up front so a typo fails before the forward pass.
    if layer not in ('first', 'last'):
        raise ValueError("Invalid layer. Choose 'first' or 'last'.")
    if subword_aggregation not in ('first', 'last', 'mean'):
        raise ValueError("Invalid subword_aggregation. Choose 'first', 'last', or 'mean'.")

    # Encode the sentence; inference only, so no autograd graph is needed.
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    if layer == 'first':
        embeddings = outputs.hidden_states[0]
    else:
        embeddings = outputs.last_hidden_state

    word_tokens = tokenizer.tokenize(word)
    if not word_tokens:
        raise ValueError(f"Word {word!r} produced no tokens.")

    # Recover the tokens from the encoded ids rather than re-tokenizing the sentence:
    # tokenizer.tokenize() omits the special tokens that the encoded input carries, which
    # would shift every position by one relative to the rows of `embeddings`.
    sentence_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

    # Match the whole subword sequence, not just its first piece, so an earlier token that
    # merely shares the first piece is not mistaken for the word.
    span = len(word_tokens)
    start_position = next(
        (
            i
            for i in range(len(sentence_tokens) - span + 1)
            if sentence_tokens[i:i + span] == word_tokens
        ),
        None,
    )
    if start_position is None:
        raise ValueError(f"Word {word!r} not found in sentence {sentence!r}.")
    end_position = start_position + span - 1

    # Extract the appropriate embedding based on the subword_aggregation method
    if subword_aggregation == 'first':
        word_embedding = embeddings[0, start_position]
    elif subword_aggregation == 'last':
        word_embedding = embeddings[0, end_position]
    else:
        word_embedding = embeddings[0, start_position:end_position + 1].mean(dim=0)

    return word_embedding.detach().numpy()

def get_word_embedding_without_context(word, layer='last', subword_aggregation='last'):
    """
    Generate a word embedding using BERT without any context,
    with options to use the embedding from the first or last layer of the BERT model,
    and to use the embedding of the first token, last token, or an aggregate of all the word's tokens.

    The word is encoded on its own, so the word's tokens are everything between the
    leading [CLS] and the trailing [SEP].

    Args:
    word (str): The word to extract the embedding for.
    layer (str): The layer of the BERT model to extract the embedding from - "first" or "last".
        "first" reads outputs.hidden_states[0].
    subword_aggregation (str): The method of aggregating subword token embeddings - "first", "last", or "mean".

    Returns:
    numpy.ndarray: The requested word embedding.

    Raises:
    ValueError: If layer or subword_aggregation is invalid, or if the word produces no tokens.
    """
    # Validate the options up front so a typo fails before the forward pass.
    if layer not in ('first', 'last'):
        raise ValueError("Invalid layer. Choose 'first' or 'last'.")
    if subword_aggregation not in ('first', 'last', 'mean'):
        raise ValueError("Invalid subword_aggregation. Choose 'first', 'last', or 'mean'.")

    # Encode the word; inference only, so no autograd graph is needed.
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    if layer == 'first':
        embeddings = outputs.hidden_states[0]
    else:
        embeddings = outputs.last_hidden_state

    # With only [CLS] and [SEP] present, the indexing below would return a special-token
    # embedding (or NaN for 'mean') instead of a word embedding.
    if embeddings.shape[1] <= 2:
        raise ValueError(f"Word {word!r} produced no tokens.")

    if subword_aggregation == 'first':
        word_embedding = embeddings[0, 1]  # Use the embedding of the first token (excluding [CLS])
    elif subword_aggregation == 'last':
        word_embedding = embeddings[0, -2]  # Use the embedding of the last token (excluding [SEP])
    else:
        word_embedding = embeddings[0, 1:-1].mean(dim=0)  # Use the mean of all token embeddings (excluding [CLS] and [SEP])

    return word_embedding.detach().numpy()

def calculate_similarity(reference, target, language, is_control, grammatical_gender=None, layer='last', subword_aggregation='last', use_context=True):
    """
    Compute the cosine similarity between the embeddings of a reference word and a target word.

    With use_context=True and a truthy grammatical_gender, the reference is embedded inside
    "<article> <reference> <copula>" and the target inside the same sentence followed by the
    target. Spanish ('es') uses "la" for 'feminine' and "el" otherwise; German ('de') uses
    "die" for 'feminine', "der" for 'masculine' and "das" otherwise; any other language uses
    "the ... is". With use_context=True and no grammatical_gender, each word is embedded in a
    sentence consisting of the word alone. With use_context=False, the context-free embedding
    function is used.

    Args:
    reference (str): The reference word (the noun in experimental rows).
    target (str): The target word (the adjective in experimental rows).
    language (str): Language code selecting the sentence template.
    is_control (bool): Only affects the tag in the progress message.
    grammatical_gender: Gender label of the reference, or a falsy value for none.
    layer (str): Forwarded to the embedding function.
    subword_aggregation (str): Forwarded to the embedding function.
    use_context (bool): Whether to embed the words inside a sentence.

    Returns:
    The cosine similarity of the two embeddings.
    """
    if not use_context:
        reference_embedding = get_word_embedding_without_context(reference, layer, subword_aggregation)
        target_embedding = get_word_embedding_without_context(target, layer, subword_aggregation)
    else:
        if not grammatical_gender:
            # No gender information: the "sentence" is just the word itself.
            reference_sentence = f"{reference}"
            target_sentence = f"{target}"
        else:
            if language == 'es':
                article = "la" if grammatical_gender == 'feminine' else "el"
                copula = "es"
            elif language == 'de':
                if grammatical_gender == 'feminine':
                    article = "die"
                else:
                    article = "der" if grammatical_gender == 'masculine' else "das"
                copula = "ist"
            else:
                article = "the"
                copula = "is"
            reference_sentence = f"{article} {reference} {copula}"
            target_sentence = f"{reference_sentence} {target}"

        reference_embedding = get_word_embedding(reference_sentence, reference, layer, subword_aggregation)
        target_embedding = get_word_embedding(target_sentence, target, layer, subword_aggregation)

    similarity = cosine_similarity(reference_embedding, target_embedding)
    if is_control:
        experiment = "CNTRL"
    else:
        experiment = "EXPRMNTL"
    print(f"[{experiment}] Similarity calculated for {reference} and {target} in {language}")
    return similarity

def process_language(language, is_control=False, layer='last', subword_aggregation='last', use_context=True):
    """
    Recompute the 'COSINE SIMILARITY' column of one language's CSV and write the result to a new file.

    Reads the control or experimental embeddings CSV for the language, replaces
    'COSINE SIMILARITY' in every row with the value from calculate_similarity, and saves
    the frame under ../data/contextual-embeddings/ with the settings encoded in the file
    name. The output directory is created if missing.

    Args:
    language (str): Language code used in the file names and passed to calculate_similarity.
    is_control (bool): Selects the control files (columns 'REFERENCE WORD' / 'TARGET WORD')
        instead of the experimental ones (columns 'NOUN' / 'ADJECTIVE' /
        'GRAMMATICAL GENDER OF NOUN').
    layer (str): Forwarded to calculate_similarity.
    subword_aggregation (str): Forwarded to calculate_similarity.
    use_context (bool): Forwarded to calculate_similarity.
    """
    condition = 'control' if is_control else 'experimental'
    print(f"Processing {condition} data for language: {language}")

    input_file = f"../data/embeddings/{condition}/{language}_embeddings_{condition}.csv"
    output_file = (
        f"../data/contextual-embeddings/{condition}/{language}_contextual-embeddings_{condition}"
        f"_layer-{layer}_subword-{subword_aggregation}_context-{use_context}.csv"
    )

    print(f"Input file: {input_file}")
    print(f"Output file: {output_file}")

    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    print("Reading input CSV file...")
    df = pd.read_csv(input_file)
    print(f"Input CSV file read successfully. Shape: {df.shape}")

    def similarity_for_row(row):
        # Control rows carry no grammatical gender; experimental rows pass it along.
        if is_control:
            return calculate_similarity(
                row['REFERENCE WORD'], row['TARGET WORD'], language, is_control,
                layer=layer, subword_aggregation=subword_aggregation, use_context=use_context,
            )
        return calculate_similarity(
            row['NOUN'], row['ADJECTIVE'], language, is_control, row['GRAMMATICAL GENDER OF NOUN'],
            layer=layer, subword_aggregation=subword_aggregation, use_context=use_context,
        )

    # Overwrite the 'COSINE SIMILARITY' column row by row
    print("Calculating contextual similarity for each row...")
    df['COSINE SIMILARITY'] = df.apply(similarity_for_row, axis=1)
    print("Contextual similarity calculation completed.")

    # Write the updated frame out without the index column
    print(f"Saving updated DataFrame to output CSV file: {output_file}")
    df.to_csv(output_file, index=False)
    print("Output CSV file saved successfully.")

    print(f"Processing completed for {condition} data for language: {language}\n")

# Option grids for the sweep below.
languages = ['en', 'es', 'de']
layer_options = ['first', 'last']
subword_aggregation_options = ['first', 'last', 'mean']
use_context_options = [True, False]

# Run process_language for every combination of language, layer, subword aggregation and
# context setting, once on the control data and once on the experimental data. Each call
# re-reads its input CSV and writes its own output file.
for language in languages:
    for layer in layer_options:
        for subword_aggregation in subword_aggregation_options:
            for use_context in use_context_options:
                process_language(language, is_control=True, layer=layer, subword_aggregation=subword_aggregation, use_context=use_context)
                process_language(language, is_control=False, layer=layer, subword_aggregation=subword_aggregation, use_context=use_context)