In [8]:
import sys, os
import pandas as pd
import torch
import math
import progressbar

def load_abc():
    from pathlib import Path 
    inpath_male = os.path.join(Path(__file__).parents[1],"data","abc_male_sents.txt")
    inpath_fem = os.path.join(Path(__file__).parents[1],"data","abc_fem_sents.txt")
    male_sents = load_abc(inpath_male)
    fem_sents = load_abc(inpath_fem)
    return male_sents, fem_sents

def load_sents(filename):
    """
    A function that loads all the reflexive sentences from the ABC dataset.
    """
    reflexive_sents = []
    with open(filename, "r") as f:
        lines = f.readlines()

        restart = 0
        for line in lines:
            if "--------------" in line: pass #is this needed?
            elif "---" in line:
                restart = 0
            else:
                if restart == 0:
                    reflexive_sents.append(line.strip())
                    restart = 1
    return reflexive_sents

def tokenize_sentence(sentence, tokenizer, start_token, sep_token): # add here if not a bert model
    """
    A function that tokenizes the reflexive sentences.
    """
    sentence = start_token+sentence+sep_token 
    tokenize_input = tokenizer.tokenize(sentence)
    return tokenize_input

def get_pron_index(sent, pronoun_list):
    """
    A function that takes the tokenized reflexive sent and
    locates index of pronoun
    replaces pronoun with male/female pronoun 
    returns 2 augmented sentences 
    """
    # get index of pronoun
    no_pron = True
    for i, token in enumerate(sent):
        if token in pronoun_list:
            pron_index = i
            no_pron = False
            break
        else:pass
    if no_pron==True: return "no pronouns to replace"
    else: return pron_index 

def get_augmented_sents(tokenize_input, idx): # augment here if not bert model
    """
    Take original sentence and replace pronoun with antirreflexive pronouns (hans/hendes) 
    """
    tokenize_mask_male = tokenize_input.copy()
    tokenize_mask_female = tokenize_input.copy()
    tokenize_mask_male[int(idx)] = "hans"
    tokenize_mask_female[int(idx)] = "hendes"
    return tokenize_mask_male, tokenize_mask_female

def create_tensors(truth, male, fem, tokenizer):
    """
    Make augmented sentences into tensors.
    """
    tensor_truth = torch.tensor([tokenizer.convert_tokens_to_ids(truth)])
    tensor_male = torch.tensor([tokenizer.convert_tokens_to_ids(male)])
    tensor_fem = torch.tensor([tokenizer.convert_tokens_to_ids(fem)])    
    return tensor_truth, tensor_male, tensor_fem

def get_predictions(tensor_truth, tensor_male, tensor_fem, model):
    """
    A function that takes the 3 sentences and returns model predictions for each of them.
    Used to compute loss.   
    """
    with torch.no_grad():
        pred_truth = model(tensor_truth)[0]
    with torch.no_grad():
        pred_male = model(tensor_male)[0]
    with torch.no_grad():
        pred_fem = model(tensor_fem)[0]
    return pred_truth, pred_male, pred_fem

def compute_loss(loss_fct, pred_truth, pred_male, pred_fem, tensor_truth):
    """
    Take model predictions and compute loss for all 3 sentences 
    by comparing to pred_truth, which is the prediction of the sentence with the correct (reflexive) pronoun.
    """
    loss_male = loss_fct(pred_male.squeeze(),tensor_truth.squeeze()).data
    loss_fem = loss_fct(pred_fem.squeeze(),tensor_truth.squeeze()).data
    loss_ref = loss_fct(pred_truth.squeeze(),tensor_truth.squeeze()).data
    loss_list = [loss_male, loss_fem, loss_ref]
    return loss_list

def score_sent(sent, loss_fct, tokenizer, model, pron_list, start_token , sep_token):
    """
    Tak a sentence with a relflexive pronoun, 
    replace pronoun with antireflexives (male/female pronoun)
    and compute loss and perplexity for all 3 sentences.
    Args:
        sent (str): A sentence with reflexive pronouns.
        loss_fct (torch.nn.CrossEntropyLoss): A loss function.
        tokenizer (BertTokenizer): A tokenizer.
        model (BertForMaskedLM): A language model.
        pron_list (list): A list of pronouns.
    Returns:
        Loss and perplexity values for sentence with reflexive, male and female pronoun. 
    """
    tokenized_refl = tokenize_sentence(sent, tokenizer, start_token, sep_token)
    index = get_pron_index(tokenized_refl, pron_list)
    tokenized_male, tokenized_fem = get_augmented_sents(tokenized_refl, index)
    tensor_truth, tensor_male, tensor_fem = create_tensors(tokenized_refl, tokenized_male, tokenized_fem, tokenizer)
    pred_truth, pred_male, pred_fem = get_predictions(tensor_truth, tensor_male, tensor_fem, model)
    loss_values = compute_loss(loss_fct, pred_truth, pred_male, pred_fem, tensor_truth)
    return "male: "+ str(loss_values[0].item())+" "+ str(math.exp(loss_values[0]))+ " female: "+ str(loss_values[1].item())+ " " +\
            str(math.exp(loss_values[1])) + " refl: "+ str(loss_values[2].item())+ " " + str(math.exp(loss_values[2]))

def run_abc(outpath, reflexive_sents, loss_fct, tokenizer, model, pron_list, start_token , sep_token):
    """
    Run the ABC dataset and write results to file.
    Args:
        outpath (str): Path to output file.
        reflexive_sents (list): A list of sentences with reflexive pronouns.
        loss_fct (torch.nn.CrossEntropyLoss): A loss function.
        tokenizer (BertTokenizer): A tokenizer.
        model (BertForMaskedLM): A language model.
        pron_list (list): A list of pronouns.
    """
    # intitiate progress bar
    bar = progressbar.ProgressBar(maxval=len(reflexive_sents)).start()

    # loop over sentences to compute loss and perplexity
    with open(outpath, "w") as f:
        #for idx, member in enumerate(members)
        for idx, sent in enumerate(reflexive_sents):
            scores = score_sent(sent, loss_fct, tokenizer, model, pron_list, start_token, sep_token)
            bar.update(idx)
            f.write(sent +" "+ scores +"\n")


def eval_abc(filename, condition):
    """
    Function which subtracts mean perplexity of male sentences from female sentences.
    
    Args:
        filename (str): Path to file with results from ABC dataset.
    Returns:
        Mean difference in perplexity scores.
    """
    # load txt file into pandas dataframe
    df = pd.read_csv(filename, sep='\t', header=None, names=['all'])

    # extract perpexity loss scores from all collumn
    df['perplexity_male'] = df['all'].str.split(' ').str[-7]  
    df['perplexity_female'] = df['all'].str.split(' ').str[-4] 
    df['perplexity_refl'] = df['all'].str.split(' ').str[-1]

    # make into floats
    cols = df.drop(['all'], axis=1).columns
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

    # calculate difference
    #df['dif'] = df['perplexity_female'] - df['perplexity_male']

    # return mean dif 
    #print(condition + f" Mean difference in perplexity scores (female - male): {df['dif'].mean()}")
    return df

In [5]:
import os
pronouns_list = ["sin", "sit", "sine", "▁sin", "▁sit", "▁sine"]


In [9]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch 
model_name = "NbAiLab/nb-bert-large"

start_token = "[CLS] "
sep_token = " [SEP]"

model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()

# define loss function
loss_fct = torch.nn.CrossEntropyLoss()

Some weights of the model checkpoint at NbAiLab/nb-bert-large were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
male_sents = load_abc(inpath_male)
fem_sents = load_abc(inpath_fem)

NameError: name '__file__' is not defined