In [4]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForMaskedLM

### BERT

In [2]:
# Load the BERT model and tokenizer
model_path = 'bert-base-uncased'
pt_tokenizer = BertTokenizer.from_pretrained(model_path)
pt_model = BertForMaskedLM.from_pretrained(model_path)

# Load fine tuned bert model
model_path = 'fine-tuned-bert'
ft_tokenizer = BertTokenizer.from_pretrained(model_path)
ft_model = BertForMaskedLM.from_pretrained(model_path)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
# Define a list of gender-related words to compare
words = ['man', 'woman', 'he', 'she', 'boy', 'girl']

In [4]:
# Get the embeddings for each word for both models
embeddings_pt = {}
for word in words:
    tokenized_text = pt_tokenizer.tokenize(word)
    indexed_tokens = pt_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        outputs = pt_model(tokens_tensor)
        last_hidden_states = outputs[0]
    embeddings_pt[word] = np.mean(last_hidden_states.numpy()[0], axis=0)

embeddings_ft = {}
for word in words:
    tokenized_text = ft_tokenizer.tokenize(word)
    indexed_tokens = ft_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        outputs = ft_model(tokens_tensor)
        last_hidden_states = outputs[0]
    embeddings_ft[word] = np.mean(last_hidden_states.numpy()[0], axis=0)

In [5]:
# Calculate the cosine similarity between pairs of embeddings for both models
similarity_scores_pt = {}
for i, word1 in enumerate(words):
    for j, word2 in enumerate(words):
        if i >= j:
            continue
        similarity = np.dot(embeddings_pt[word1], embeddings_pt[word2]) / (np.linalg.norm(embeddings_pt[word1]) * np.linalg.norm(embeddings_pt[word2]))
        similarity_scores_pt[f'{word1} vs. {word2}'] = similarity

similarity_scores_ft = {}
for i, word1 in enumerate(words):
    for j, word2 in enumerate(words):
        if i >= j:
            continue
        similarity = np.dot(embeddings_ft[word1], embeddings_ft[word2]) / (np.linalg.norm(embeddings_ft[word1]) * np.linalg.norm(embeddings_ft[word2]))
        similarity_scores_ft[f'{word1} vs. {word2}'] = similarity


# Print the similarity scores for both models
print("Similarity scores for Pre trained Bert:")
for pair, similarity in similarity_scores_pt.items():
    print(f'{pair}: {similarity:.2f}')

print("Similarity scores for Fine tuned Bert:")
for pair, similarity in similarity_scores_ft.items():
    print(f'{pair}: {similarity:.2f}')

Similarity scores for Pre trained Bert:
man vs. woman: 0.89
man vs. he: 0.89
man vs. she: 0.82
man vs. boy: 0.90
man vs. girl: 0.88
woman vs. he: 0.87
woman vs. she: 0.84
woman vs. boy: 0.91
woman vs. girl: 0.92
he vs. she: 0.91
he vs. boy: 0.87
he vs. girl: 0.87
she vs. boy: 0.85
she vs. girl: 0.84
boy vs. girl: 0.96
Similarity scores for Fine tuned Bert:
man vs. woman: 0.94
man vs. he: 0.91
man vs. she: 0.91
man vs. boy: 0.93
man vs. girl: 0.93
woman vs. he: 0.90
woman vs. she: 0.90
woman vs. boy: 0.95
woman vs. girl: 0.95
he vs. she: 0.93
he vs. boy: 0.92
he vs. girl: 0.90
she vs. boy: 0.92
she vs. girl: 0.90
boy vs. girl: 0.94


### If the similarity scores are closer to one, then it indicates the model is less biased. Whereas if the scores are closer to zero, it indicates the model is more biased. 

In [8]:
# Compare the similarity scores between models
print("Difference in similarity scores between models:")
for pair in similarity_scores_pt.keys():
    print(f'{pair}: {similarity_scores_ft[pair] - similarity_scores_pt[pair]:.2f}')

Difference in similarity scores between models:
man vs. woman: 0.05
man vs. he: 0.02
man vs. she: 0.09
man vs. boy: 0.03
man vs. girl: 0.05
woman vs. he: 0.03
woman vs. she: 0.06
woman vs. boy: 0.04
woman vs. girl: 0.02
he vs. she: 0.02
he vs. boy: 0.04
he vs. girl: 0.03
she vs. boy: 0.07
she vs. girl: 0.06
boy vs. girl: -0.02


### GPT2

In [10]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load the BERT model and tokenizer
model_path = 'gpt2'
pt_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
pt_model = GPT2LMHeadModel.from_pretrained(model_path)

# Load fine tuned bert model
model_path = 'fine-tuned-gpt2'
ft_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
ft_model = GPT2LMHeadModel.from_pretrained(model_path)

In [11]:
# Get the embeddings for each word for both models
embeddings_pt = {}
for word in words:
    tokenized_text = pt_tokenizer.tokenize(word)
    indexed_tokens = pt_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        outputs = pt_model(tokens_tensor)
        last_hidden_states = outputs[0]
    embeddings_pt[word] = np.mean(last_hidden_states.numpy()[0], axis=0)

embeddings_ft = {}
for word in words:
    tokenized_text = ft_tokenizer.tokenize(word)
    indexed_tokens = ft_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        outputs = ft_model(tokens_tensor)
        last_hidden_states = outputs[0]
    embeddings_ft[word] = np.mean(last_hidden_states.numpy()[0], axis=0)

In [12]:
# Calculate the cosine similarity between pairs of embeddings for both models
similarity_scores_pt = {}
for i, word1 in enumerate(words):
    for j, word2 in enumerate(words):
        if i >= j:
            continue
        similarity = np.dot(embeddings_pt[word1], embeddings_pt[word2]) / (np.linalg.norm(embeddings_pt[word1]) * np.linalg.norm(embeddings_pt[word2]))
        similarity_scores_pt[f'{word1} vs. {word2}'] = similarity

similarity_scores_ft = {}
for i, word1 in enumerate(words):
    for j, word2 in enumerate(words):
        if i >= j:
            continue
        similarity = np.dot(embeddings_ft[word1], embeddings_ft[word2]) / (np.linalg.norm(embeddings_ft[word1]) * np.linalg.norm(embeddings_ft[word2]))
        similarity_scores_ft[f'{word1} vs. {word2}'] = similarity

In [13]:
# Print the similarity scores for both models
print("Similarity scores for Model 1:")
for pair, similarity in similarity_scores_pt.items():
    print(f'{pair}: {similarity:.2f}')

print("Similarity scores for Model 2:")
for pair, similarity in similarity_scores_ft.items():
    print(f'{pair}: {similarity:.2f}')

Similarity scores for Model 1:
man vs. woman: 1.00
man vs. he: 1.00
man vs. she: 1.00
man vs. boy: 1.00
man vs. girl: 1.00
woman vs. he: 1.00
woman vs. she: 1.00
woman vs. boy: 1.00
woman vs. girl: 1.00
he vs. she: 1.00
he vs. boy: 1.00
he vs. girl: 1.00
she vs. boy: 1.00
she vs. girl: 1.00
boy vs. girl: 1.00
Similarity scores for Model 2:
man vs. woman: 1.00
man vs. he: 1.00
man vs. she: 1.00
man vs. boy: 1.00
man vs. girl: 1.00
woman vs. he: 1.00
woman vs. she: 1.00
woman vs. boy: 1.00
woman vs. girl: 1.00
he vs. she: 1.00
he vs. boy: 1.00
he vs. girl: 1.00
she vs. boy: 1.00
she vs. girl: 1.00
boy vs. girl: 1.00


In [14]:
# Compare the similarity scores between models
print("Difference in similarity scores between models:")
for pair in similarity_scores_pt.keys():
    print(f'{pair}: {similarity_scores_ft[pair] - similarity_scores_pt[pair]:.2f}')

Difference in similarity scores between models:
man vs. woman: 0.00
man vs. he: 0.00
man vs. she: 0.00
man vs. boy: 0.00
man vs. girl: 0.00
woman vs. he: 0.00
woman vs. she: 0.00
woman vs. boy: 0.00
woman vs. girl: 0.00
he vs. she: 0.00
he vs. boy: 0.00
he vs. girl: 0.00
she vs. boy: 0.00
she vs. girl: 0.00
boy vs. girl: 0.00


### BART

In [15]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the BERT model and tokenizer

model_name = 'facebook/bart-large'
pt_model = BartForConditionalGeneration.from_pretrained(model_name, forced_bos_token_id=0)
pt_tokenizer = BartTokenizer.from_pretrained(model_name)


model_name = "fine-tuned-bart"
ft_model = BartForConditionalGeneration.from_pretrained(model_name, forced_bos_token_id=0)
ft_tokenizer = BartTokenizer.from_pretrained(model_name)

In [17]:
# Get the embeddings for each word for both models
embeddings_pt = {}
for word in words:
    tokenized_text = pt_tokenizer.encode(word, add_special_tokens=False)
    indexed_tokens = torch.tensor(tokenized_text).unsqueeze(0)
    with torch.no_grad():
        outputs = pt_model(indexed_tokens)
        last_hidden_states = outputs[0]
    embeddings_pt[word] = np.mean(last_hidden_states.numpy()[0], axis=0)

embeddings_ft = {}
for word in words:
    tokenized_text = ft_tokenizer.encode(word, add_special_tokens=False)
    indexed_tokens = torch.tensor(tokenized_text).unsqueeze(0)
    with torch.no_grad():
        outputs = ft_model(indexed_tokens)
        last_hidden_states = outputs[0]
    embeddings_ft[word] = np.mean(last_hidden_states.numpy()[0], axis=0)

In [18]:
# Calculate the cosine similarity between pairs of embeddings for both models
similarity_scores_pt = {}
for i, word1 in enumerate(words):
    for j, word2 in enumerate(words):
        if i >= j:
            continue
        similarity = np.dot(embeddings_pt[word1], embeddings_pt[word2]) / (np.linalg.norm(embeddings_pt[word1]) * np.linalg.norm(embeddings_pt[word2]))
        similarity_scores_pt[f'{word1} vs. {word2}'] = similarity

similarity_scores_ft = {}
for i, word1 in enumerate(words):
    for j, word2 in enumerate(words):
        if i >= j:
            continue
        similarity = np.dot(embeddings_ft[word1], embeddings_ft[word2]) / (np.linalg.norm(embeddings_ft[word1]) * np.linalg.norm(embeddings_ft[word2]))
        similarity_scores_ft[f'{word1} vs. {word2}'] = similarity

In [19]:
# Print the similarity scores for both models
print("Similarity scores for Pre-trained BART:")
for pair, similarity in similarity_scores_pt.items():
    print(f'{pair}: {similarity:.2f}')

print("Similarity scores for Fine-tuned BART:")
for pair, similarity in similarity_scores_ft.items():
    print(f'{pair}: {similarity:.2f}')

Similarity scores for Pre-trained BART:
man vs. woman: 0.90
man vs. he: 0.94
man vs. she: 0.89
man vs. boy: 0.87
man vs. girl: 0.90
woman vs. he: 0.84
woman vs. she: 0.96
woman vs. boy: 0.97
woman vs. girl: 0.98
he vs. she: 0.84
he vs. boy: 0.82
he vs. girl: 0.85
she vs. boy: 0.97
she vs. girl: 0.97
boy vs. girl: 0.97
Similarity scores for Fine-tuned BART:
man vs. woman: 0.99
man vs. he: 0.86
man vs. she: 0.98
man vs. boy: 0.99
man vs. girl: 0.99
woman vs. he: 0.85
woman vs. she: 0.99
woman vs. boy: 0.99
woman vs. girl: 1.00
he vs. she: 0.84
he vs. boy: 0.85
he vs. girl: 0.86
she vs. boy: 1.00
she vs. girl: 0.99
boy vs. girl: 1.00


In [20]:
# Compare the similarity scores between models
print("Difference in similarity scores between models:")
for pair in similarity_scores_pt.keys():
    print(f'{pair}: {similarity_scores_ft[pair] - similarity_scores_pt[pair]:.2f}')

Difference in similarity scores between models:
man vs. woman: 0.09
man vs. he: -0.08
man vs. she: 0.09
man vs. boy: 0.11
man vs. girl: 0.09
woman vs. he: 0.01
woman vs. she: 0.04
woman vs. boy: 0.03
woman vs. girl: 0.02
he vs. she: 0.00
he vs. boy: 0.03
he vs. girl: 0.01
she vs. boy: 0.02
she vs. girl: 0.02
boy vs. girl: 0.02


### T5

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and model for pre-trained and fine-tuned T5-base
pt_tokenizer = T5Tokenizer.from_pretrained('t5-base')
pt_model = T5ForConditionalGeneration.from_pretrained('t5-base', output_hidden_states=True)

ft_tokenizer = T5Tokenizer.from_pretrained('fine-tuned-t5-base')
ft_model = T5ForConditionalGeneration.from_pretrained('fine-tuned-t5-base', output_hidden_states=True)

In [2]:
# Define a list of gender-related words to compare
words = ['man', 'woman', 'he', 'she', 'boy', 'girl']

In [9]:
# Get the embeddings for each word for both models
embeddings_pt = {}

# Loop through the words and calculate the embeddings for each model
for word in words:
    
    input_text = f"encode: {word}"
    input_ids = pt_tokenizer.encode(input_text, return_tensors='pt')
    with torch.no_grad():
        outputs = pt_model(input_ids=input_ids, decoder_input_ids=input_ids)
        last_hidden_states = outputs.decoder_hidden_states[-1]
        embeddings_pt[word] = np.mean(last_hidden_states.numpy()[0], axis=0)

# Calculate the embeddings for fine-tuned T5-base
embeddings_ft = {}
for word in words:
    input_text = f"encode: {word}"
    input_ids = ft_tokenizer.encode(input_text, return_tensors='pt')
    with torch.no_grad():
        outputs = ft_model(input_ids=input_ids, decoder_input_ids=input_ids)
        last_hidden_states = outputs.decoder_hidden_states[-1]
        embeddings_ft[word] = np.mean(last_hidden_states.numpy()[0], axis=0)


In [11]:
# Calculate the cosine similarity between pairs of embeddings for both models
similarity_scores_pt = {}
for i, word1 in enumerate(words):
    for j, word2 in enumerate(words):
        if i >= j:
            continue
        similarity = np.dot(embeddings_pt[word1], embeddings_pt[word2]) / (np.linalg.norm(embeddings_pt[word1]) * np.linalg.norm(embeddings_pt[word2]))
        similarity_scores_pt[f'{word1} vs. {word2}'] = similarity

similarity_scores_ft = {}
for i, word1 in enumerate(words):
    for j, word2 in enumerate(words):
        if i >= j:
            continue
        similarity = np.dot(embeddings_ft[word1], embeddings_ft[word2]) / (np.linalg.norm(embeddings_ft[word1]) * np.linalg.norm(embeddings_ft[word2]))
        similarity_scores_ft[f'{word1} vs. {word2}'] = similarity

In [12]:
# Print the similarity scores for both models
print("Similarity scores for Pre-trained T5:")
for pair, similarity in similarity_scores_pt.items():
    print(f'{pair}: {similarity:.2f}')

print("Similarity scores for Fine-tuned T5:")
for pair, similarity in similarity_scores_ft.items():
    print(f'{pair}: {similarity:.2f}')

Similarity scores for Pre-trained T5:
man vs. woman: 0.99
man vs. he: 0.99
man vs. she: 0.99
man vs. boy: 1.00
man vs. girl: 0.99
woman vs. he: 0.98
woman vs. she: 0.99
woman vs. boy: 1.00
woman vs. girl: 1.00
he vs. she: 0.99
he vs. boy: 0.98
he vs. girl: 0.98
she vs. boy: 0.99
she vs. girl: 0.99
boy vs. girl: 1.00
Similarity scores for Fine-tuned T5:
man vs. woman: 0.99
man vs. he: 0.98
man vs. she: 0.99
man vs. boy: 1.00
man vs. girl: 0.99
woman vs. he: 0.99
woman vs. she: 0.99
woman vs. boy: 0.99
woman vs. girl: 1.00
he vs. she: 0.99
he vs. boy: 0.98
he vs. girl: 0.98
she vs. boy: 0.99
she vs. girl: 0.99
boy vs. girl: 1.00


In [13]:
# Compare the similarity scores between models
print("Difference in similarity scores between models:")
for pair in similarity_scores_pt.keys():
    print(f'{pair}: {similarity_scores_ft[pair] - similarity_scores_pt[pair]:.2f}')

Difference in similarity scores between models:
man vs. woman: -0.00
man vs. he: -0.00
man vs. she: -0.00
man vs. boy: -0.00
man vs. girl: -0.00
woman vs. he: 0.01
woman vs. she: -0.00
woman vs. boy: -0.00
woman vs. girl: 0.00
he vs. she: 0.00
he vs. boy: 0.00
he vs. girl: 0.01
she vs. boy: -0.00
she vs. girl: 0.00
boy vs. girl: -0.00
