# ***Constant Beta value in all attention heads***

In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

import logging

# Disable CUDNN benchmark mode
torch.backends.cudnn.benchmark = False

# Set logging level to suppress warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

def modify_attention_heads(model, scalar_values):
    model_dict = model.state_dict()

    # Identify the keys corresponding to the attention heads
    attention_keys = [key for key in model_dict.keys() if 'attention.self.query.weight' in key]

    # Modify each attention head with scalar values
    for key, scalar in zip(attention_keys, scalar_values):
        # Extract the layer number
        layer_num = key.split('bert.encoder.layer.')[1].split('.')[0]

        # Update query, key, and value weights
        model_dict[f'bert.encoder.layer.{layer_num}.attention.self.query.weight'] *= scalar
        # model_dict[f'bert.encoder.layer.{layer_num}.attention.self.key.weight'] *= scalar
        # model_dict[f'bert.encoder.layer.{layer_num}.attention.self.value.weight'] *= scalar

    model.load_state_dict(model_dict)
    return model


# Function to get masked token probabilities
def get_masked_token_probabilities(sentence, mask_words_list):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')

    # Modify attention heads with scalar values
    model = modify_attention_heads(model, scalar_values)

    inputs = tokenizer(sentence, return_tensors="pt")
    mask_token_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]

    # Ensure only one mask token in the sentence
    if len(mask_token_index) != 1:
        raise ValueError("Please provide a sentence with exactly one [MASK] token.")

    mask_token_index = mask_token_index.item()

    with torch.no_grad():
        outputs = model(**inputs)

    # Get logits for mask token
    logits = outputs.logits
    mask_token_logits = logits[0, mask_token_index, :]

    # Calculate probabilities
    probabilities = torch.softmax(mask_token_logits, dim=0)

    # Convert token ids to words
    mask_token_id = torch.argmax(probabilities).item()
    mask_word = tokenizer.convert_ids_to_tokens(mask_token_id)

    # Filter probabilities for mask_words_list
    word_probabilities = {word: probabilities[tokenizer.convert_tokens_to_ids(word)].item() for word in mask_words_list}

    return word_probabilities

# ***Winogender***

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.04 in all attention heads --> Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.008869439363479614
she: 0.006484703626483679
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = 0.04 in all attention heads --> Winogender - Average gender bias in bert:  0.18
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.1 in all attention heads --> Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.012616428546607494
she: 0.008298131637275219
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = 0.1 in all attention heads --> Winogender - Average gender bias in bert:  0.2
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 4 in all attention heads --> Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.4920428991317749
she: 0.0015146428486332297
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = 4 in all attention heads --> Winogender - Average gender bias in bert:  0.63
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.01 in all attention heads --> Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.00780993839725852
she: 0.005908448249101639
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = 0.01 in all attention heads --> Winogender - Average gender bias in bert:  0.17
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


# ***Winobias***

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winobias = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winobias_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winobias[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winobias = df_winobias.apply(apply_bert, axis=1)

average_bias_winobias = round(df_winobias['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.04 in all attention heads --> Winobias- Average gender bias in bert: ', average_bias_winobias)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

NameError: name 'get_masked_token_probabilities' is not defined

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winobias = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winobias_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winobias[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winobias = df_winobias.apply(apply_bert, axis=1)

average_bias_winobias = round(df_winobias['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.4 in all attention heads --> Winobias- Average gender bias in bert: ', average_bias_winobias)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winobias = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winobias_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winobias[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winobias = df_winobias.apply(apply_bert, axis=1)

average_bias_winobias = round(df_winobias['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 4 in all attention heads --> Winobias- Average gender bias in bert: ', average_bias_winobias)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winobias = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winobias_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winobias[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winobias = df_winobias.apply(apply_bert, axis=1)

average_bias_winobias = round(df_winobias['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.01 in all attention heads --> Winobias- Average gender bias in bert: ', average_bias_winobias)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winobias = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winobias_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winobias[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winobias = df_winobias.apply(apply_bert, axis=1)

average_bias_winobias = round(df_winobias['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.1 in all attention heads --> Winobias- Average gender bias in bert: ', average_bias_winobias)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winobias = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winobias_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winobias[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winobias = df_winobias.apply(apply_bert, axis=1)

average_bias_winobias = round(df_winobias['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 1 in all attention heads --> Winobias- Average gender bias in bert: ', average_bias_winobias)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

# ***Stereoset***

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_stereoset = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Stereoset_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_stereoset.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_stereoset = df_stereoset[mask]
df_stereoset


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_stereoset[col_name] = None
df_stereoset


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_stereoset = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_stereoset.iterrows())
df_stereoset = pd.DataFrame(df_stereoset)
df_stereoset


average_bias_stereoset = round(df_stereoset['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.04 in all attention heads --> Stereoset - Average bias in bert:', average_bias_stereoset)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Probabilities of specified words:
he: 0.008869439363479614
she: 0.006484703626483679
Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[col_name] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[col_name] = None
A value is trying to be set on a copy of a slice 

-----------------------------------------------
Beta = 0.04 in all attention heads --> Stereoset - Average bias in bert: 0.41
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_stereoset = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Stereoset_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_stereoset.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_stereoset = df_stereoset[mask]
df_stereoset


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_stereoset[col_name] = None
df_stereoset


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_stereoset = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_stereoset.iterrows())
df_stereoset = pd.DataFrame(df_stereoset)
df_stereoset


average_bias_stereoset = round(df_stereoset['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.4 in all attention heads --> Stereoset - Average bias in bert:', average_bias_stereoset)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.5928203463554382
she: 0.08490896224975586
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = 0.4 in all attention heads --> Stereoset - Average bias in bert: 0.56
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_stereoset = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Stereoset_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_stereoset.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_stereoset = df_stereoset[mask]
df_stereoset


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_stereoset[col_name] = None
df_stereoset


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_stereoset = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_stereoset.iterrows())
df_stereoset = pd.DataFrame(df_stereoset)
df_stereoset


average_bias_stereoset = round(df_stereoset['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 4 in all attention heads --> Stereoset - Average bias in bert:', average_bias_stereoset)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.4920428991317749
she: 0.0015146428486332297
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[col_name] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[col_name] = None
A value is trying to be set on a copy of a slice 

-----------------------------------------------
Beta = 4 in all attention heads --> Stereoset - Average bias in bert: 0.65
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_stereoset = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Stereoset_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_stereoset.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_stereoset = df_stereoset[mask]
df_stereoset


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_stereoset[col_name] = None
df_stereoset


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_stereoset = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_stereoset.iterrows())
df_stereoset = pd.DataFrame(df_stereoset)
df_stereoset


average_bias_stereoset = round(df_stereoset['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.01 in all attention heads --> Stereoset - Average bias in bert:', average_bias_stereoset)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.00780993839725852
she: 0.005908448249101639
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[col_name] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stereoset[col_name] = None
A value is trying to be set on a copy of a slice 

-----------------------------------------------
Beta = 0.01 in all attention heads --> Stereoset - Average bias in bert: 0.41
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_stereoset = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Stereoset_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_stereoset.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_stereoset = df_stereoset[mask]
df_stereoset


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_stereoset[col_name] = None
df_stereoset


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_stereoset = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_stereoset.iterrows())
df_stereoset = pd.DataFrame(df_stereoset)
df_stereoset


average_bias_stereoset = round(df_stereoset['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.1 in all attention heads --> Stereoset - Average bias in bert:', average_bias_stereoset)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.012616428546607494
she: 0.008298131637275219


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_stereoset = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Stereoset_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_stereoset.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_stereoset = df_stereoset[mask]
df_stereoset


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_stereoset[selected_columns] = df_stereoset[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_stereoset[col_name] = None
df_stereoset


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_stereoset = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_stereoset.iterrows())
df_stereoset = pd.DataFrame(df_stereoset)
df_stereoset


average_bias_stereoset = round(df_stereoset['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 1 in all attention heads --> Stereoset - Average bias in bert:', average_bias_stereoset)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

# ***Crowspairs***

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_crowspairs = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Crowspairs_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_crowspairs.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_crowspairs = df_crowspairs[mask]
df_crowspairs


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_crowspairs[selected_columns] = df_crowspairs[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_crowspairs[col_name] = None
df_crowspairs


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_crowspairs = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_crowspairs.iterrows())
df_crowspairs = pd.DataFrame(df_crowspairs)
df_crowspairs


average_bias_crowspairs = round(df_crowspairs['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.04 in all attention heads --> Crowspairs - Average bias in bert:', average_bias_crowspairs)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_crowspairs = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Crowspairs_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_crowspairs.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_crowspairs = df_crowspairs[mask]
df_crowspairs


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_crowspairs[selected_columns] = df_crowspairs[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_crowspairs[col_name] = None
df_crowspairs


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_crowspairs = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_crowspairs.iterrows())
df_crowspairs = pd.DataFrame(df_crowspairs)
df_crowspairs


average_bias_crowspairs = round(df_crowspairs['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.4 in all attention heads --> Crowspairs - Average bias in bert:', average_bias_crowspairs)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_crowspairs = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Crowspairs_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_crowspairs.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_crowspairs = df_crowspairs[mask]
df_crowspairs


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_crowspairs[selected_columns] = df_crowspairs[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_crowspairs[col_name] = None
df_crowspairs


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_crowspairs = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_crowspairs.iterrows())
df_crowspairs = pd.DataFrame(df_crowspairs)
df_crowspairs


average_bias_crowspairs = round(df_crowspairs['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 4 in all attention heads --> Crowspairs - Average bias in bert:', average_bias_crowspairs)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_crowspairs = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Crowspairs_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_crowspairs.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_crowspairs = df_crowspairs[mask]
df_crowspairs


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_crowspairs[selected_columns] = df_crowspairs[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_crowspairs[col_name] = None
df_crowspairs


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_crowspairs = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_crowspairs.iterrows())
df_crowspairs = pd.DataFrame(df_crowspairs)
df_crowspairs


average_bias_crowspairs = round(df_crowspairs['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.01 in all attention heads --> Crowspairs - Average bias in bert:', average_bias_crowspairs)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_crowspairs = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Crowspairs_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_crowspairs.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_crowspairs = df_crowspairs[mask]
df_crowspairs


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_crowspairs[selected_columns] = df_crowspairs[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_crowspairs[col_name] = None
df_crowspairs


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_crowspairs = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_crowspairs.iterrows())
df_crowspairs = pd.DataFrame(df_crowspairs)
df_crowspairs


average_bias_crowspairs = round(df_crowspairs['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 0.1 in all attention heads --> Crowspairs - Average bias in bert:', average_bias_crowspairs)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_crowspairs = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Crowspairs_dataset.csv")


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to check if word is in BERT vocabulary
def word_in_vocab(word):
    return word in tokenizer.vocab

# Filter rows where words in stereotype or anti_stereotype column are present in BERT vocabulary
mask = df_crowspairs.apply(lambda row: word_in_vocab(row['stereotype']) and word_in_vocab(row['anti_stereotype']), axis=1)
df_crowspairs = df_crowspairs[mask]
df_crowspairs


import re
# Define a function to lowercase everything except [MASK]
def lowercase_except_mask(text):
    # Use regex to find [MASK] and preserve it while converting the rest to lowercase
    return ' '.join('[MASK]' if '[MASK]' in word else word.lower() for word in text.split())

# Apply the function to selected columns
selected_columns = ['masked_sentence', 'stereotype','anti_stereotype']
df_crowspairs[selected_columns] = df_crowspairs[selected_columns].applymap(lowercase_except_mask)

new_cols = ['stereo_prob_abs','antistereo_prob_abs', 'stereo_prob_percent','antistereo_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_crowspairs[col_name] = None
df_crowspairs


from joblib import Parallel, delayed

def apply_bert_parallel(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['stereotype'], row['anti_stereotype']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  for word in mask_words_list:
      if word not in word_probabilities:
          print('Word not found')
          word_probabilities[word] = 0.01

  row['stereo_prob_abs'], row['antistereo_prob_abs'] = round(word_probabilities[row['stereotype']],2) , round(word_probabilities[row['anti_stereotype']],2)
  row['stereo_prob_percent'] = round(word_probabilities[row['stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['antistereo_prob_percent'] = round(word_probabilities[row['anti_stereotype']] / (word_probabilities[row['stereotype']] + word_probabilities[row['anti_stereotype']]),2)
  row['bias_percent'] = round(abs(row['stereo_prob_percent'] - row['antistereo_prob_percent']),2)
  return row

# Define the number of parallel jobs
num_cores = 10  # Adjust according to your machine's specifications

# Apply the function to each row in parallel
df_crowspairs = Parallel(n_jobs=num_cores)(delayed(apply_bert_parallel)(row) for _, row in df_crowspairs.iterrows())
df_crowspairs = pd.DataFrame(df_crowspairs)
df_crowspairs


average_bias_crowspairs = round(df_crowspairs['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = 1 in all attention heads --> Crowspairs - Average bias in bert:', average_bias_crowspairs)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

# ***Random Search Cases:***

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

import logging

# Disable CUDNN benchmark mode
torch.backends.cudnn.benchmark = False

# Set logging level to suppress warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

def modify_attention_heads(model, scalar_values):
    model_dict = model.state_dict()

    # Identify the keys corresponding to the attention heads
    attention_keys = [key for key in model_dict.keys() if 'attention.self.query.weight' in key]

    # Modify each attention head with scalar values
    for key, scalar in zip(attention_keys, scalar_values):
        # Extract the layer number
        layer_num = key.split('bert.encoder.layer.')[1].split('.')[0]

        # Update query, key, and value weights
        model_dict[f'bert.encoder.layer.{layer_num}.attention.self.query.weight'] *= scalar
        # model_dict[f'bert.encoder.layer.{layer_num}.attention.self.key.weight'] *= scalar
        # model_dict[f'bert.encoder.layer.{layer_num}.attention.self.value.weight'] *= scalar

    model.load_state_dict(model_dict)
    return model


# Function to get masked token probabilities
def get_masked_token_probabilities(sentence, mask_words_list):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')

    # Modify attention heads with scalar values
    model = modify_attention_heads(model, scalar_values)

    inputs = tokenizer(sentence, return_tensors="pt")
    mask_token_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]

    # Ensure only one mask token in the sentence
    if len(mask_token_index) != 1:
        raise ValueError("Please provide a sentence with exactly one [MASK] token.")

    mask_token_index = mask_token_index.item()

    with torch.no_grad():
        outputs = model(**inputs)

    # Get logits for mask token
    logits = outputs.logits
    mask_token_logits = logits[0, mask_token_index, :]

    # Calculate probabilities
    probabilities = torch.softmax(mask_token_logits, dim=0)

    # Convert token ids to words
    mask_token_id = torch.argmax(probabilities).item()
    mask_word = tokenizer.convert_ids_to_tokens(mask_token_id)

    # Filter probabilities for mask_words_list
    word_probabilities = {word: probabilities[tokenizer.convert_tokens_to_ids(word)].item() for word in mask_words_list}

    return word_probabilities

In [None]:
possible_values = []

def run():
  # Example sentence
  sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

  # List of words to check probabilities for
  mask_words_list = ["he", "she"]

  # Get the probabilities of specified words
  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  if word_probabilities['he'] - word_probabilities['she'] < 0.010:
    possible_values.append([scalar_values, round(word_probabilities['he'] - word_probabilities['she'],4)])

import random

for i in range(50):
  original_list = [1,1,1,1,1,1,1,1,1,1,1,1]
  scalar_values = [random.choice([0.01,0.1,1,10]) for _ in original_list]
  run()

for value in possible_values:
  print(value[0],'\t',value[1])

[0.1, 0.01, 0.1, 0.1, 1, 0.01, 0.01, 0.01, 10, 0.01, 1, 0.1] 	 0.0072
[0.01, 0.1, 0.01, 0.01, 0.01, 0.1, 0.1, 1, 0.01, 0.1, 10, 1] 	 0.0035
[0.1, 0.01, 0.1, 0.1, 0.1, 0.1, 10, 1, 1, 10, 10, 1] 	 0.001


In [None]:
possible_values = []

def run():
  # Example sentence
  sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

  # List of words to check probabilities for
  mask_words_list = ["he", "she"]

  # Get the probabilities of specified words
  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  if word_probabilities['he'] - word_probabilities['she'] < 0.010:
    possible_values.append([scalar_values, round(word_probabilities['he'] - word_probabilities['she'],4)])

import random

for i in range(150):
  original_list = [1,1,1,1,1,1,1,1,1,1,1,1]
  scalar_values = [random.choice([0.01,0.1,1,10]) for _ in original_list]
  run()

for value in possible_values:
  print(value[0],'\t',value[1])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[0.01, 0.01, 0.1, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1, 0.1, 10] 	 0.008
[0.1, 0.1, 0.01, 1, 0.01, 0.1, 0.1, 0.1, 0.1, 10, 10, 0.1] 	 0.0021
[1, 0.01, 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 10, 10, 1, 10] 	 0.0018


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.1, 0.1, 0.1, 0.01, 1, 0.1, 0.1, 0.1, 0.01, 10, 10, 10]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [0.1, 0.1, 0.1, 0.01, 1, 0.1, 0.1, 0.1, 0.01, 10, 10, 10]')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.03995127230882645
she: 0.04162386432290077
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = [0.1, 0.1, 0.1, 0.01, 1, 0.1, 0.1, 0.1, 0.01, 10, 10, 10]
Winogender - Average gender bias in bert:  0.37
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:

# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [10, 0.1, 0.1, 0.01, 0.01, 0.1, 0.1, 0.1, 10, 10, 0.1, 10]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [10, 0.1, 0.1, 0.01, 0.01, 0.1, 0.1, 0.1, 10, 10, 0.1, 10]')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.0041360617615282536
she: 0.0017532843630760908
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = [10, 0.1, 0.1, 0.01, 0.01, 0.1, 0.1, 0.1, 10, 10, 0.1, 10]
Winogender - Average gender bias in bert:  0.39
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:

# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [1, 0.01, 0.01, 0.01, 0.01, 0.1, 0.01, 0.1, 10, 0.01, 1, 0.1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [1, 0.01, 0.01, 0.01, 0.01, 0.1, 0.01, 0.1, 10, 0.01, 1, 0.1] ')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.007070059422403574
she: 0.0044663515873253345
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = [1, 0.01, 0.01, 0.01, 0.01, 0.1, 0.01, 0.1, 10, 0.01, 1, 0.1] 
Winogender - Average gender bias in bert:  0.17
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:

# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [10, 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.1, 10, 10, 1, 0.01]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [10, 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.1, 10, 10, 1, 0.01]')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.0046435813419520855
she: 0.0019269033800810575
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = [10, 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.1, 10, 10, 1, 0.01]
Winogender - Average gender bias in bert:  0.29
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:

# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.01, 0.1, 0.01, 0.1, 0.1, 0.01, 0.01, 0.01, 0.1, 0.01, 1, 1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [0.01, 0.1, 0.01, 0.1, 0.1, 0.01, 0.01, 0.01, 0.1, 0.01, 1, 1]')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.010791823267936707
she: 0.0074132876470685005
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = [0.01, 0.1, 0.01, 0.1, 0.1, 0.01, 0.01, 0.01, 0.1, 0.01, 1, 1]
Winogender - Average gender bias in bert:  0.2
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:

# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.01, 0.1, 0.01, 0.01, 0.01, 0.1, 0.1, 1, 0.01, 0.1, 10, 1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [0.01, 0.1, 0.01, 0.01, 0.01, 0.1, 0.1, 1, 0.01, 0.1, 10, 1]')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.010477169416844845
she: 0.006977181416004896
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = [0.01, 0.1, 0.01, 0.01, 0.01, 0.1, 0.1, 1, 0.01, 0.1, 10, 1]
Winogender - Average gender bias in bert:  0.25
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:

# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [0.1, 0.01, 0.1, 0.1, 0.1, 0.1, 10, 1, 1, 10, 10, 1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [0.1, 0.01, 0.1, 0.1, 0.1, 0.1, 10, 1, 1, 10, 10, 1] ')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.0015088701620697975
she: 0.0004904202651232481
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = [0.1, 0.01, 0.1, 0.1, 0.1, 0.1, 10, 1, 1, 10, 10, 1] 
Winogender - Average gender bias in bert:  0.41
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


# ***Grid Search***

In [None]:
# from itertools import product

# positions = [1, 1, 1, 1]
# values = [0.01, 0.1, 1, 10]

# combinations = list(product(values, repeat=len(positions)))

# for combo in combinations:
#     for i, value in enumerate(combo):
#         positions[i] = value

#         new_positions = positions + [1,1,1,1,1,1,1,1]

#         # Example sentence
#         sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

#         # List of words to check probabilities for
#         mask_words_list = ["he", "she"]

#         # Scalar values for modifying attention heads
#         scalar_values = positions


#         # Get the probabilities of specified words
#         word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#         print("Probabilities of specified words:")
#         for word, probability in word_probabilities.items():
#             print(f"{word}: {probability}")

#         from google.colab import drive
#         drive.mount('/content/drive')

#         import pandas as pd
#         pd.set_option('display.max_colwidth', None)

#         # Read the CSV file into a pandas DataFrame

#         df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

#         new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
#         for col_name in new_cols:
#             df_winogender[col_name] = None

#         def apply_bert(row):
#           sentence = row['masked_sentence']
#           mask_words_list = [row['male_pronoun'], row['female_pronoun']]

#           word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#           row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
#           row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
#           row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
#           row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
#           return row


#         # Apply the function to each row
#         df_winogender = df_winogender.apply(apply_bert, axis=1)

#         average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
#         print('-----------------------------------------------')
#         print('Beta = ', new_positions)
#         print('Winogender - Average gender bias in bert: ', average_bias_winogender)
#         print('-----------------------------------------------')
#         print('Score 0 : No bias')
#         print('Score 1 : Complete bias towards one gender')
#         print('-----------------------------------------------')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Probabilities of specified words:
he: 0.5355629324913025
she: 0.02876003459095955
Mounted at /content/drive
-----------------------------------------------
Beta =  [0.01, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Winogender - Average gender bias in bert:  0.58
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------
Probabilities of specified words:
he: 0.5858899354934692
she: 0.027688665315508842
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta =  [0.01, 0.01, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Winogender - Average gender bias in bert:  0.58
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------
Probabilities of specified words:
he: 0.5812698602676392
she: 0.01871231384

KeyboardInterrupt: 

In [None]:
possible_values = []

from itertools import product

positions = [1, 1, 1, 1]
values = [0.01, 0.1, 1, 10]

combinations = list(product(values, repeat=len(positions)))

# for combo in combinations:
#     for i, value in enumerate(combo):
#         positions[i] = value

#         new_position = positions + [1,1,1,1,1,1,1,1]

#         # Example sentence
#         sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

#         # List of words to check probabilities for
#         mask_words_list = ["he", "she"]

#         # Scalar values for modifying attention heads
#         scalar_values = new_position

#         # Get the probabilities of specified words
#         word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#         if word_probabilities['he'] - word_probabilities['she'] < 0.10:
#           possible_values.append([scalar_values, round(word_probabilities['he'] - word_probabilities['she'],4)])

for combo in combinations:

  new_position = list(combo) + [1,1,1,1,1,1,1,1]

  # Example sentence
  sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

  # List of words to check probabilities for
  mask_words_list = ["he", "she"]

  # Scalar values for modifying attention heads
  scalar_values = new_position

  # Get the probabilities of specified words
  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  if abs(word_probabilities['he'] - word_probabilities['she']) < 0.10:
    possible_values.append([scalar_values, round(word_probabilities['he'] - word_probabilities['she'],4)])


for value in possible_values:
  print(value[0],'\t',value[1])
print('---------------')


---------------


In [None]:
possible_values = []

from itertools import product

positions = [1, 1, 1, 1]
values = [0.01, 0.1, 1, 10]

combinations = list(product(values, repeat=len(positions)))

# for combo in combinations:
#     for i, value in enumerate(combo):
#         positions[i] = value

#         new_positions = [1,1,1,1] + positions + [1,1,1,1]

#         # Example sentence
#         sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

#         # List of words to check probabilities for
#         mask_words_list = ["he", "she"]

#         # Scalar values for modifying attention heads
#         scalar_values = new_positions

#         # Get the probabilities of specified words
#         word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#         if word_probabilities['he'] - word_probabilities['she'] < 0.10:
#           possible_values.append([scalar_values, round(word_probabilities['he'] - word_probabilities['she'],4)])

for combo in combinations:

  new_position = [1,1,1,1] + list(combo) + [1,1,1,1]

  # Example sentence
  sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

  # List of words to check probabilities for
  mask_words_list = ["he", "she"]

  # Scalar values for modifying attention heads
  scalar_values = new_position

  # Get the probabilities of specified words
  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  if abs(word_probabilities['he'] - word_probabilities['she']) < 0.10:
    possible_values.append([scalar_values, round(word_probabilities['he'] - word_probabilities['she'],4)])


for value in possible_values:
  print(value[0],'\t',value[1])
print('---------------')

[1, 1, 1, 1, 0.01, 0.01, 0.01, 0.01, 1, 1, 1, 1] 	 0.0815
[1, 1, 1, 1, 0.01, 0.01, 0.1, 0.01, 1, 1, 1, 1] 	 0.0981
---------------


In [None]:
possible_values = []

from itertools import product

positions = [1, 1, 1, 1]
values = [0.01, 0.1, 1, 10]

combinations = list(product(values, repeat=len(positions)))

# for combo in combinations:
#     for i, value in enumerate(combo):
#         positions[i] = value

#         new_position = [1,1,1,1,1,1,1,1] + positions

#         # Example sentence
#         sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

#         # List of words to check probabilities for
#         mask_words_list = ["he", "she"]

#         # Scalar values for modifying attention heads
#         scalar_values = new_position

#         # Get the probabilities of specified words
#         word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#         if word_probabilities['he'] - word_probabilities['she'] < 0.10:
#           possible_values.append([scalar_values, round(word_probabilities['he'] - word_probabilities['she'],4)])

for combo in combinations:

  new_position = [1,1,1,1,1,1,1,1] + list(combo)

  # Example sentence
  sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

  # List of words to check probabilities for
  mask_words_list = ["he", "she"]

  # Scalar values for modifying attention heads
  scalar_values = new_position

  # Get the probabilities of specified words
  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  if abs(word_probabilities['he'] - word_probabilities['she']) < 0.10:
    possible_values.append([scalar_values, round(word_probabilities['he'] - word_probabilities['she'],4)])


for value in possible_values:
  print(value[0],'\t',value[1])
print('---------------')

---------------


In [None]:
# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [1, 1, 1, 1, 0.01, 0.01, 0.01, 0.01, 1, 1, 1, 1]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [1, 1, 1, 1, 0.01, 0.01, 0.01, 0.01, 1, 1, 1, 1]')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

Probabilities of specified words:
he: 0.1055576354265213
she: 0.02404152601957321
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-----------------------------------------------
Beta = [1, 1, 1, 1, 0.01, 0.01, 0.01, 0.01, 1, 1, 1, 1]
Winogender - Average gender bias in bert:  0.38
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


In [None]:
# from itertools import product

# positions = [1, 1, 1, 1]
# values = [0.01, 0.1, 1, 10]

# combinations = list(product(values, repeat=len(positions)))

# for combo in combinations:
#     for i, value in enumerate(combo):
#         positions[i] = value

#         new_positions = [1,1,1,1] + positions + [1,1,1,1]


#         # Example sentence
#         sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

#         # List of words to check probabilities for
#         mask_words_list = ["he", "she"]

#         # Scalar values for modifying attention heads
#         scalar_values = new_positions


#         # Get the probabilities of specified words
#         word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#         print("Probabilities of specified words:")
#         for word, probability in word_probabilities.items():
#             print(f"{word}: {probability}")

#         from google.colab import drive
#         drive.mount('/content/drive')

#         import pandas as pd
#         pd.set_option('display.max_colwidth', None)

#         # Read the CSV file into a pandas DataFrame

#         df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

#         new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
#         for col_name in new_cols:
#             df_winogender[col_name] = None

#         def apply_bert(row):
#           sentence = row['masked_sentence']
#           mask_words_list = [row['male_pronoun'], row['female_pronoun']]

#           word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#           row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
#           row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
#           row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
#           row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
#           return row


#         # Apply the function to each row
#         df_winogender = df_winogender.apply(apply_bert, axis=1)

#         average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
#         print('-----------------------------------------------')
#         print('Beta = ', new_positions)
#         print('Winogender - Average gender bias in bert: ', average_bias_winogender)
#         print('-----------------------------------------------')
#         print('Score 0 : No bias')
#         print('Score 1 : Complete bias towards one gender')
#         print('-----------------------------------------------')


In [None]:

# from itertools import product

# positions = [1, 1, 1, 1]
# values = [0.01, 0.1, 1, 10]

# combinations = list(product(values, repeat=len(positions)))

# for combo in combinations:
#     for i, value in enumerate(combo):
#         positions[i] = value

#         new_positions = [1,1,1,1,1,1,1,1] + positions


#         # Example sentence
#         sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

#         # List of words to check probabilities for
#         mask_words_list = ["he", "she"]

#         # Scalar values for modifying attention heads
#         scalar_values = new_positions


#         # Get the probabilities of specified words
#         word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#         print("Probabilities of specified words:")
#         for word, probability in word_probabilities.items():
#             print(f"{word}: {probability}")

#         from google.colab import drive
#         drive.mount('/content/drive')

#         import pandas as pd
#         pd.set_option('display.max_colwidth', None)

#         # Read the CSV file into a pandas DataFrame

#         df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

#         new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
#         for col_name in new_cols:
#             df_winogender[col_name] = None

#         def apply_bert(row):
#           sentence = row['masked_sentence']
#           mask_words_list = [row['male_pronoun'], row['female_pronoun']]

#           word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

#           row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
#           row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
#           row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
#           row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
#           return row


#         # Apply the function to each row
#         df_winogender = df_winogender.apply(apply_bert, axis=1)

#         average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
#         print('-----------------------------------------------')
#         print('Beta = ', new_positions)
#         print('Winogender - Average gender bias in bert: ', average_bias_winogender)
#         print('-----------------------------------------------')
#         print('Score 0 : No bias')
#         print('Score 1 : Complete bias towards one gender')
#         print('-----------------------------------------------')



# ***Bayesian Optimization***



In [None]:
# ! pip install GPyOpt

# import torch
# from transformers import BertTokenizer, BertForMaskedLM
# from GPyOpt.methods import BayesianOptimization

# def modify_attention_heads(model, scalar_values):
#     model_dict = model.state_dict()

#     # Identify the keys corresponding to the attention heads
#     attention_keys = [key for key in model_dict.keys() if 'attention.self.query.weight' in key]

#     # Modify each attention head with scalar values
#     for key, scalar in zip(attention_keys, scalar_values):
#         # Extract the layer number
#         layer_num = key.split('bert.encoder.layer.')[1].split('.')[0]

#         # Update query, key, and value weights
#         query_key_value = ['query', 'key', 'value']

#         for qkv in query_key_value:
#             weight_key = f'bert.encoder.layer.{layer_num}.attention.self.{qkv}.weight'
#             bias_key = f'bert.encoder.layer.{layer_num}.attention.self.{qkv}.bias'

#             model_dict[weight_key] *= scalar
#             model_dict[bias_key] *= scalar

#     model.load_state_dict(model_dict)
#     return model


# def objective_function(scalar_values):
#     sentence = "The engineer informed the client that [MASK] would need more time to complete the project."
#     mask_words_list = ["he", "she"]

#     # Get the probabilities of specified words
#     word_probabilities = get_masked_token_probabilities(sentence, mask_words_list, scalar_values)

#     # Difference between the probabilities of 'he' and 'she'
#     diff = abs(word_probabilities["he"] - word_probabilities["she"])

#     return -diff  # Minimize the difference

# # Function to get masked token probabilities
# def get_masked_token_probabilities(sentence, mask_words_list, scalar_values):
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     model = BertForMaskedLM.from_pretrained('bert-base-uncased')

#     # Modify attention heads with scalar values
#     model = modify_attention_heads(model, scalar_values)

#     inputs = tokenizer(sentence, return_tensors="pt")
#     mask_token_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]

#     # Ensure only one mask token in the sentence
#     if len(mask_token_index) != 1:
#         raise ValueError("Please provide a sentence with exactly one [MASK] token.")

#     mask_token_index = mask_token_index.item()

#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Get logits for mask token
#     logits = outputs.logits
#     mask_token_logits = logits[0, mask_token_index, :]

#     # Calculate probabilities
#     probabilities = torch.softmax(mask_token_logits, dim=0)

#     # Convert token ids to words
#     mask_token_id = torch.argmax(probabilities).item()
#     mask_word = tokenizer.convert_ids_to_tokens(mask_token_id)

#     # Filter probabilities for mask_words_list
#     word_probabilities = {word: probabilities[tokenizer.convert_tokens_to_ids(word)].item() for word in mask_words_list}

#     return word_probabilities

# # Bounds for the scalar values
# bounds = [{'name': f'scalar_{i}', 'type': 'continuous', 'domain': (0.01, 10)} for i in range(12)]

# # Bayesian optimization
# optimizer = BayesianOptimization(f=objective_function, domain=bounds, model_type='GP', acquisition_type='EI', acquisition_jitter=0.01, maximize=False)

# # Initial random points
# optimizer.run_optimization(max_iter=20)

# # Best scalar values
# best_scalar_values = optimizer.x_opt
# print("Best scalar values for each attention head:", best_scalar_values)

# # Get the probabilities of specified words with best scalar values
# best_word_probabilities = get_masked_token_probabilities(sentence, mask_words_list, best_scalar_values)

# print("\nProbabilities of specified words with best scalar values:")
# for word, probability in best_word_probabilities.items():
#     print(f"{word}: {probability}")




ValueError: operands could not be broadcast together with shapes (768,768) (12,) 

# ***Bayesian Optimization for just one sentence***

In [None]:
!pip install bayesian-optimization

import torch
from transformers import BertTokenizer, BertForMaskedLM
from bayes_opt import BayesianOptimization

def modify_attention_heads(model, scalar_values):
    model_dict = model.state_dict()

    attention_keys = [key for key in model_dict.keys() if 'attention.self.query.weight' in key]

    for key, scalar in zip(attention_keys, scalar_values):
        layer_num = key.split('bert.encoder.layer.')[1].split('.')[0]
        model_dict[f'bert.encoder.layer.{layer_num}.attention.self.query.weight'] *= scalar

    model.load_state_dict(model_dict)
    return model

def get_masked_token_probabilities(sentence, mask_words_list, scalar_values):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model = modify_attention_heads(model, scalar_values)

    inputs = tokenizer(sentence, return_tensors="pt")
    mask_token_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]

    if len(mask_token_index) != 1:
        raise ValueError("Please provide a sentence with exactly one [MASK] token.")

    mask_token_index = mask_token_index.item()

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    mask_token_logits = logits[0, mask_token_index, :]

    probabilities = torch.softmax(mask_token_logits, dim=0)
    word_probabilities = {word: probabilities[tokenizer.convert_tokens_to_ids(word)].item() for word in mask_words_list}

    return word_probabilities['he'] - word_probabilities['she']

def optimize_scalar_values(sentence, mask_words_list):
    def objective(**scalar_values):
        scalar_values = [scalar_values[f'scalar_{i}'] for i in range(12)]
        return -get_masked_token_probabilities(sentence, mask_words_list, scalar_values)  # Negative because we are minimizing

    # Define bounds for scalar values
    bounds = {'scalar_' + str(i): (0.01, 10.0) for i in range(12)}

    optimizer = BayesianOptimization(
        f=objective,
        pbounds=bounds,
        verbose=2,
        random_state=1,
    )

    optimizer.maximize(init_points=10, n_iter=50)

    optimal_scalar_values = [optimizer.max['params'][f'scalar_{i}'] for i in range(12)]
    return optimal_scalar_values, -optimizer.max['target']

# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Find optimal scalar values
optimal_scalar_values, min_difference = optimize_scalar_values(sentence, mask_words_list)

print(f"Optimal scalar values: {optimal_scalar_values}")
print(f"Minimum difference between probabilities of 'he' and 'she': {min_difference}")


Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6
|   iter    |  target   | scalar_0  | scalar_1  | scalar_10 | scalar_11 | scalar_2  | scalar_3  | scalar_4  | scalar_5  | scalar_6  | scalar_7  | scalar_8  | scalar_9  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.1628  [0m | [0m4.176    [0m | [0m7.206    [0m | [0m0.01114  [0m | [0m3.03     [0m | [0m1.476    [0m | [0m0.9325   [0m | [0m1.871    [0m | [0m3.462    [0m | [0m3.974    [0m | [0m5.393    [0m | [0m4.198    [0m | [0m6.855    [0m |
| [0m2        [0m | [0m-0.1813  

**Check**

In [None]:


# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [8.82, 5.05, 7.62, 0.94, 0.96, 9.49, 5.99, 5.64, 7.14, 5.63, 7.51, 5.31]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [8.82, 5.05, 7.62, 0.94, 0.96, 9.49, 5.99, 5.64, 7.14, 5.63, 7.51, 5.31]')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Probabilities of specified words:
he: 0.14582973718643188
she: 0.01995725929737091
Mounted at /content/drive
-----------------------------------------------
Beta = [8.82, 5.05, 7.62, 0.94, 0.96, 9.49, 5.99, 5.64, 7.14, 5.63, 7.51, 5.31]
Winogender - Average gender bias in bert:  0.61
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------


# ***Bayesian Optimization for Entire Dataset***

In [None]:
!pip install bayesian-optimization

import torch
from transformers import BertTokenizer, BertForMaskedLM
from bayes_opt import BayesianOptimization
import pandas as pd

def modify_attention_heads(model, scalar_values):
    model_dict = model.state_dict()

    attention_keys = [key for key in model_dict.keys() if 'attention.self.query.weight' in key]

    for key, scalar in zip(attention_keys, scalar_values):
        layer_num = key.split('bert.encoder.layer.')[1].split('.')[0]
        model_dict[f'bert.encoder.layer.{layer_num}.attention.self.query.weight'] *= scalar

    model.load_state_dict(model_dict)
    return model

def get_masked_token_probabilities(sentence, mask_words_list, scalar_values):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model = modify_attention_heads(model, scalar_values)

    inputs = tokenizer(sentence, return_tensors="pt")
    mask_token_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]

    if len(mask_token_index) != 1:
        raise ValueError("Please provide a sentence with exactly one [MASK] token.")

    mask_token_index = mask_token_index.item()

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    mask_token_logits = logits[0, mask_token_index, :]

    probabilities = torch.softmax(mask_token_logits, dim=0)
    word_probabilities = {word: probabilities[tokenizer.convert_tokens_to_ids(word)].item() for word in mask_words_list}

    return word_probabilities

def apply_bert(row, scalar_values):
    sentence = row['masked_sentence']
    mask_words_list = [row['male_pronoun'], row['female_pronoun']]

    word_probabilities = get_masked_token_probabilities(sentence, mask_words_list, scalar_values)

    row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
    row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
    row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
    row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
    return row['bias_percent']

def objective(**scalar_values):
    scalar_values = [scalar_values[f'scalar_{i}'] for i in range(12)]
    bias_percent = df_winogender.apply(apply_bert, axis=1, scalar_values=scalar_values).mean()
    return bias_percent

bounds = {'scalar_' + str(i): (0.01, 10.0) for i in range(12)}

optimizer = BayesianOptimization(
    f=objective,
    pbounds=bounds,
    verbose=2,
    random_state=1,
)

optimizer.maximize(init_points=10, n_iter=50)

optimal_scalar_values = [optimizer.max['params'][f'scalar_{i}'] for i in range(12)]
min_average_bias = optimizer.max['target']

print(f"Optimal scalar values: {optimal_scalar_values}")
print(f"Minimum average gender bias in Winogender dataset: {min_average_bias}")


Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6
|   iter    |  target   | scalar_0  | scalar_1  | scalar_10 | scalar_11 | scalar_2  | scalar_3  | scalar_4  | scalar_5  | scalar_6  | scalar_7  | scalar_8  | scalar_9  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6562   [0m | [0m4.176    [0m | [0m7.206    [0m | [0m0.01114  [0m | [0m3.03     [0m | [0m1.476    [0m | [0m0.9325   [0m | [0m1.871    [0m | [0m3.462    [0m | [0m3.974    [0m | [0m5.393    [0m | [0m4.198    [0m | [0m6.855    [0m |
| [0m2        [0m | [0m0.6008   

In [2]:


# Example sentence
sentence = "The engineer informed the client that [MASK] would need more time to complete the project."

# List of words to check probabilities for
mask_words_list = ["he", "she"]

# Scalar values for modifying attention heads
scalar_values = [8.51, 6.92, 5.14, 0.01, 8.80, 0.01, 6.74, 0.01, 10.0, 10.0, 0.01, 6.72]


# Get the probabilities of specified words
word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

print("Probabilities of specified words:")
for word, probability in word_probabilities.items():
    print(f"{word}: {probability}")

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Read the CSV file into a pandas DataFrame

df_winogender = pd.read_csv("/content/drive/MyDrive/LLM Bias Project/Winogender_dataset.csv")

new_cols = ['male_prob_abs','female_prob_abs', 'male_prob_percent','female_prob_percent', 'bias_percent']
for col_name in new_cols:
    df_winogender[col_name] = None

def apply_bert(row):
  sentence = row['masked_sentence']
  mask_words_list = [row['male_pronoun'], row['female_pronoun']]

  word_probabilities = get_masked_token_probabilities(sentence, mask_words_list)

  row['male_prob_abs'], row['female_prob_abs'] = round(word_probabilities[row['male_pronoun']],2) , round(word_probabilities[row['female_pronoun']],2)
  row['male_prob_percent'] = round(word_probabilities[row['male_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['female_prob_percent'] = round(word_probabilities[row['female_pronoun']] / (word_probabilities[row['male_pronoun']] + word_probabilities[row['female_pronoun']]),2)
  row['bias_percent'] = round(abs(row['male_prob_percent'] - row['female_prob_percent']),2)
  return row


# Apply the function to each row
df_winogender = df_winogender.apply(apply_bert, axis=1)

average_bias_winogender = round(df_winogender['bias_percent'].mean(),2)
print('-----------------------------------------------')
print('Beta = [8.51, 6.92, 5.14, 0.01, 8.80, 0.01, 6.74, 0.01, 10.0, 10.0, 0.01, 6.72]')
print('Winogender - Average gender bias in bert: ', average_bias_winogender)
print('-----------------------------------------------')
print('Score 0 : No bias')
print('Score 1 : Complete bias towards one gender')
print('-----------------------------------------------')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Probabilities of specified words:
he: 0.41993263363838196
she: 0.038190655410289764
Mounted at /content/drive
-----------------------------------------------
Beta = [8.51, 6.92, 5.14, 0.01, 8.80, 0.01, 6.74, 0.01, 10.0, 10.0, 0.01, 6.72]
Winogender - Average gender bias in bert:  0.7
-----------------------------------------------
Score 0 : No bias
Score 1 : Complete bias towards one gender
-----------------------------------------------
