### Loading the dataset and plotting the different column names

In [61]:
import pandas as pd
import re

# Load the CSV file
data = pd.read_csv('/Users/motisoffer/Downloads/FinalPairsDataset(1).csv')
print(len(data))
print(data.columns)

df = data.copy()


4339
Index(['Sentence1', 'Sentence2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


### dropping unwanted columns

In [62]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [63]:
df.columns

Index(['Sentence1', 'Sentence2'], dtype='object')

### Dropping all nulls - and saving in a copy

In [64]:
df = df.copy().dropna()

### Labeling the data
To label the data, with the help of LLMs like GPT4o, we constructed all names and all the family-gendered terms from our dataset that appear in the Sentence 1 column, like: aunt, father, Sean, Sister, Lily and create 2 dictionaries from them: one for family terms and the other for names.​<br>

Then, we searche for one of the words from these dictionaries in the first sentences, and we find one in a sample, we labels it as not biased=0. Otherwise, it is labeled as biased=0 <br>
For instance, the pair:<br>
"My mother is a great person. She is always so nice to everyone",<br>
gets the label 0, because we can imediately in​fer that "mother" is a female term.<br>


In [65]:
# Define the list of gender-specific family terms and explicit job titles
gender_terms = [
    "mom", "mother", "dad", "father", "son", "daughter", "sister", "brother", "aunt", "uncle", "niece", "nephew", "grandma",
    "grandmother", "grandpa", "grandfather", "grandmom", "granddad", "grandad", "granny", "gramps", "great-grandmother",
    "great-grandma", "great-grandfather", "great-grandpa", "stepmom", "stepdad", "stepmother", "stepfather",
    "mother-in-law", "father-in-law", "daughter-in-law", "son-in-law", "sister-in-law", "brother-in-law", "godmother",
    "godfather", "godson", "goddaughter", "actor", "actress", "waiter", "waitress", "steward", "stewardess", 
    "barman", "barmaid", "mr", "ms", "miss","mrs", "mister", "husband", "wife", "great-aunt", "great-uncle", "stepbrother", "stepsister",
    "greatuncle", "greataunt", "bride", "groom", "man", "woman", "men", "women", "girl", "boy"
]

names = [
    "Aaron", "Abigail", "Adam", "Adams", "Adira", "Adrian", "Adriana", "Aiden", "Aisha", "Alan", "Aleena", "Alex",
    "Alexander", "Alice", "Alina", "Alia", "Alma", "Alonzo", "Alvarez", "Amalia", "Amanda", "Amara", "Amari", "Amelia",
    "Amir", "Amira", "Amina", "Amy", "Ana", "Anderson", "Andre", "Andrew", "Anita", "Anna", "Anika", "Ann", "Anthony",
    "Antonia", "Antonio", "Anya", "April", "Arel", "Ariel", "Arthur", "Asher", "Aurora", "Ava", "Ayaan", "Basil",
    "Beatrice", "Bell", "Bella", "Ben", "Benjamin", "Bennett", "Binyamin", "Bjorn", "Boaz", "Brenda", "Brianna", "Briana", "Brooke",
    "Brooks", "Brown", "Bruno", "Caleb", "Callahan", "Camila", "Camille", "Captain", "Carla", "Carlos", "Carmen", "Carina",
    "Carter", "Cass", "Celeste", "Celia", "Chang", "Charles", "Charlotte", "Chava", "Chen", "Cheryl", "Chester", "Chika",
    "Chloe", "Chris", "Christine", "Christopher", "Claire", "Clara", "Clark", "Cleo", "Clyde", "Cole", "Colin", "Collins",
    "Connell", "Connor", "Cora", "Dalia", "Dalit", "Dana", "Daniel", "Daniela", "Daniella", "Dario", "Daril",
    "David", "Davies", "Davis", "Dax", "Dean", "Delilah", "Demetrius", "Dennis", "Desmond", "Devin", "Diana", "Diego",
    "Dina", "Dominic", "Dorian", "Dorothea", "Dubois", "Dylan", "Ed", "Edward", "Elan", "Eleanor", "Elias", "Elijah",
    "Elise","Ellis", "Eliza", "Elizabeth", "Ella", "Elodie", "Eloise", "Emilia", "Emil", "Emily", "Emma", "Enzo", "Erez", "Esme",
    "Estela", "Ethan", "Ethel", "Eunice", "Eva", "Evans", "Evelyn", "Ezra", "Farah", "Farid", "Fatima", "Felicity",
    "Felipe", "Felix", "Fernando", "Fiona", "Flora", "Flynn", "Foster", "Franco", "Freya", "Frida", "Frieda", "Gabriella",
    "Gabriel", "Garcia", "Genevieve", "George", "Gideon", "Giselle", "Gomez", "Grace", "Granddad", "Grandma", "Grandpa",
    "Greg", "Gregory", "Gupta", "Hal", "Halim", "Hana", "Hannah", "Harper", "Harriet", "Harrison", "Hassan",
    "Hayes", "Hazel", "Helen", "Henry", "Holloway", "Horace", "Hunter", "Hugo", "Ibtisam", "Ibrahim", "Imani", "Ines",
    "Inez", "Ingrid", "Iris", "Isaac", "Isaiah", "Isabelle", "Isabella", "Ismael", "Isolde", "Ivy", "Jack", "Jackson",
    "Jacob", "Jada", "Jake", "Jamal", "James", "Jamie", "Janine", "Jason", "Jasper", "Javier", "Jayden", "Jean",
    "Jefferson", "Jenna", "Jennifer", "Jensen", "Jessica", "Joaquin", "Jocelyn", "John", "Johnson", "Jonah", "Jonas",
    "Jones", "Jordan", "Jose", "Joseph", "Joyce", "Jude", "Julia", "Julian", "Julien", "Juliet", "June", "Kai",
    "Kaia", "Karen", "Kayla", "Keiko", "Keon", "Kevin", "Khalil", "Kiara", "Kim", "King", "Kyle", "Laila", "Laleh",
    "Laura", "Layla", "Leah", "Lee", "Leila", "Lena", "Leo", "Leona", "Levi", "Lewis", "Liam", "Lila", "Lilia", "Lily",
    "Lin", "Lina", "Linda", "Lior", "Liora", "Lisa", "Livia", "Logan", "Lorena", "Loretta", "Louis", "Luc", "Lucas",
    "Lucia", "Lucille", "Lucy", "Ludmila", "Luis", "Luke", "Luna", "Luma", "Lyra", "Maddy", "Maor", "Malcolm", "Malik",
    "Malika", "Marco","Maren", "Margaret", "Maria", "Mariana", "Marie", "Marina", "Mario", "Mark", "Marvin","Marko", "Mason", "Mateo",
    "Max", "Maximilian", "Maxwell", "Maya", "Meera", "Mei", "Mendez", "Mia", "Micah", "Michael", "Michelle", "Miguel",
    "Miles", "Miller", "Millie", "Milo", "Mimi", "Miriam", "Miri", "Mitchell","Mike", "Moira", "Morris", "Morse", "Nadav",
    "Naomi", "Nava", "Nathan", "Natalie", "Nazir", "Neil", "Nelson", "Nia", "Nicanor", "Nico", "Nicole", "Nikolai", "Nina",
    "Noa", "Noah", "Nolan", "Noor", "Nora", "Noura", "Nur", "Nyla", "Olga", "Oliver", "Olivia", "Omar", "Omri", "Orion",
    "Orlando", "Orly", "Oscar", "Otis", "Owen", "Paloma", "Paolo", "Parker", "Patel", "Patricia", "Patterson", "Paul",
    "Pedro", "Percy", "Perez", "Peter", "Peterson", "Petra", "Phillips", "Piper", "Priya", "Rachel", "Rafael", "Rajani",
    "Rami", "Ramiro", "Ramona", "Ramirez", "Raul", "Raya", "Reed", "Reece", "Renata", "Reuben", "Reza", "Reynolds",
    "Richard", "Riley", "Rina", "Robert", "Roberts", "Roberto", "Rodriguez", "Ron", "Rose", "Rosie", "Rossi", "Rowan",
    "Ruben","Rubik","Ruby", "Rufus", "Ruth", "Ryan", "Sadie", "Safiya", "Salma", "Sam", "Samantha", "Samuel", "Sanchez",
    "Santiago", "Santos", "Saoirse", "Sarah", "Scarlett", "Scott", "Selene", "Serena", "Sergio", "Seth", "Shalom",
    "Sharma", "Sheldon", "Shira", "Silas", "Sima", "Simon", "Simone", "Singh", "Sienna", "Skye", "Smith", "Sofia", "Sophia",
    "Solomon", "Sophie", "Soraya", "Sorin","Steve", "Susan", "Swan", "Talia", "Tanya", "Tanvi", "Tasha", "Taylor",
    "Tessa", "Thalia", "Thea", "Theo", "Thiago", "Thomas", "Thompson", "Tilda", "Tim", "Tobias", "Tom", "Tomas", "Tony",
    "Tova", "Trevor", "Turner", "Tyrell", "Uncle", "Valentina", "Valeria", "Valerie", "Vance", "Vera", "Victor",
    "Victoria", "Violetta", "Violet", "Vivian", "Wallace", "Walter", "Watson", "Wesley", "White", "William", "Williams",
    "Willow", "Wilson", "Xavier", "Xander", "Yara", "Yasmin", "Yasmine", "Yosef", "Yvonne", "Yusuf", "Zach", "Zahara",
    "Zahavi", "Zahid", "Zahra", "Zaria", "Zara", "Zeke", "Zelda", "Zoe", "Zoya", "Zuri"
]

# Check for capitalized names (not the first word)
def contains_name(sentence):
    words = re.findall(r'\b\w+\b', sentence)
    return any(word in names for word in words)

# Detect gendered job terms using patterns
def contains_gendered_term(text):
    return bool(re.search(r'\b\w*(man|woman|boy|girl)\b', str(text).lower()))

# Determine if sentence is not biased
def is_not_biased(sentence):
    sentence_lower = sentence.lower()
    words = re.findall(r'\b\w+\b', sentence)
    return (
        any(term in words for term in gender_terms)
        or contains_name(sentence)
        or contains_gendered_term(sentence)
    )

# Apply labeling
df['bias_label'] = df['Sentence1'].apply(lambda s: 0 if is_not_biased(s) else 1)

# Show a sample
print(df.head())

                                           Sentence1  \
0  My cousin opened a tiny repair shop near the t...   
1  From the back of the classroom, the teacher wa...   
2  Few knew that my grandma had once worked as a ...   
3  The firefighter, soaked in sweat, emerged with...   
4  There's a chef in our neighborhood who never r...   

                                       Sentence2  bias_label  
0      He's been fixing watches since childhood.           1  
1                      Her patience was endless.           1  
2  She never talked about her broadcasting days.           0  
3                 His bravery stunned the crowd.           1  
4                  People admire his creativity.           1  


In [66]:
df.head()

Unnamed: 0,Sentence1,Sentence2,bias_label
0,My cousin opened a tiny repair shop near the t...,He's been fixing watches since childhood.,1
1,"From the back of the classroom, the teacher wa...",Her patience was endless.,1
2,Few knew that my grandma had once worked as a ...,She never talked about her broadcasting days.,0
3,"The firefighter, soaked in sweat, emerged with...",His bravery stunned the crowd.,1
4,There's a chef in our neighborhood who never r...,People admire his creativity.,1


### plotting distribution of labels:

In [68]:
label_counts = df['bias_label'].value_counts().sort_index()
print(f"Not biased (0): {label_counts.get(0, 0)}")
print(f"Biased (1): {label_counts.get(1, 0)}")

Not biased (0): 2268
Biased (1): 1899


In [69]:
print(f"Num rows: {len(df)}")

Num rows: 4167


# A Problem of Unisex names
To label our dataset correctly, we need to get rid of unisex names, so we would be able to label all sentence pairs with names in the first sentence to be not biased.<br>
To solve the problem of unisex names, we went through every name in the names dictionary we created, and constructed all Unisex names.<br>

We also constructed 2 more dictionaries, that contain "boys names" and "girls names" that we found as not unisex names.<br>
Then, we wrote this code that searches for these Unisex names in the Sentence1 column, and if finds them, replaces them with explicit gender names (this code considered sensitive switching and switched boys names with a name from boys_replacement_names dictionary, and girls names with a name from girls_replacement_names - using the gender term from sentence2).

In [74]:
import pandas as pd
import re
import random

# List of names to check in Sentence1 (deduplicated)
names_to_check = list(set([
    'Alex', 'Ariel', 'Brooke', 'Carter', 'Charlie', 'Chris', 'Dana', 'Dylan', 'Harper', 'Hunter', 
    'Jamie', 'Jordan', 'Kai', 'Logan', 'Riley', 'Rowan', 'Taylor', 'Anderson', 'Arel', 'Ayaan', 
    'Basil', 'Callahan', 'Chava', 'Chen', 'Dorian', 'Evans', 'Gabriel', 'Hal', 'Harrison', 'Hayes', 
    'Holloway', 'Isolde', 'Jayden', 'Jensen', 'Jose', 'Jude', 'June', 'Julian', 'Julien', 'Joyce', 
    'Lior', 'Maren', 'Morse', 'Orion', 'Parker', 'Percy', 'Rajani', 'Reece', 'Reed', 'Ruby', 'Ryan', 
    'Sam', 'Sanchez', 'Santos', 'Saoirse', 'Silas', 'Simon', 'Simone', 'Singh', 'Tanvi', 
    'Tony', 'Tyrell', 'Vance', 'Wallace', 'Wesley', 'White', 'Zeke'
]))

# Replacement names for boys
boys_replacement_names = [
    'Abel', 'Archer', 'August', 'Axel', 'Barrett', 'Beau', 'Beckett', 'Bentley', 'Brady', 'Brandon',
    'Brayden', 'Bryce', 'Caden', 'Camden', 'Carson', 'Chase', 'Christian', 'Cody', 'Colton', 'Cooper',
    'Damian', 'Derek', 'Donovan', 'Drew', 'Easton', 'Elliott', 'Emmett', 'Finn', 'Gavin', 'Grayson',
    'Greyson', 'Holden', 'Hudson', 'Jace', 'Jaxon', 'Jeremiah', 'Josiah', 'Kieran', 'Knox', 'Landon',
    'Lennox', 'Maddox', 'Quentin', 'Reid', 'Rhett', 'Sawyer', 'Tucker', 'Vincent', 'Walker', 'Wyatt'
]

# Replacement names for girls
girls_replacement_names = [
    'Addison', 'Alexis', 'Alyssa', 'Ashley', 'Audrey', 'Brooklyn', 'Caroline', 'Cecilia', 'Daisy',
    'Destiny', 'Eden', 'Elaina', 'Ember', 'Faith', 'Hailey', 'Harlow', 'Haven', 'Josephine',
    'Katelyn', 'Katherine', 'Kaylee', 'Laurel', 'Leilani', 'Libby', 'Lillian', 'Lottie', 'Madeline',
    'Madison', 'Makayla', 'Marissa', 'Mckenna', 'Mila', 'Nadia', 'Noelle', 'Paige', 'Penelope', 'Quinn',
    'Savannah', 'Saylor', 'Scarlett', 'Skylar', 'Stella', 'Summer', 'Sydney', 'Trinity', 'Vanessa', 'Zoey'
]

# Gender-specific terms
girl_terms = ['her', 'hers', 'she', "herself", "she's", "she'd", "she'll"]
boy_terms = ['his', 'him', 'he', "he's","himself", "he'd", "he'll"]

# Load the dataset
df = pd.read_csv('labeled_dataset1.csv')

# Counter for replacements
replacement_count = 0

# Function to replace names in a sentence
def replace_name(sentence, name, gender):
    if gender == 'girl':
        new_name = random.choice(girls_replacement_names)
    elif gender == 'boy':
        new_name = random.choice(boys_replacement_names)
    else:
        return sentence, False
    # Use regex for case-insensitive whole-word replacement
    pattern = r'\b' + re.escape(name) + r'\b'
    new_sentence = re.sub(pattern, new_name, sentence, flags=re.IGNORECASE)
    return new_sentence, new_sentence != sentence

# Process each row
for idx, row in df.iterrows():
    sentence1 = str(row['Sentence1'])
    sentence2 = str(row['Sentence2']).lower()
    
    # Determine gender based on Sentence2
    gender = None
    for term in girl_terms:
        if term in sentence2:
            gender = 'girl'
            break
    if not gender:
        for term in boy_terms:
            if term in sentence2:
                gender = 'boy'
                break
    
    # Skip if no gender-specific term is found
    if not gender:
        continue
    
    # Check for names in Sentence1
    for name in names_to_check:
        if re.search(r'\b' + re.escape(name) + r'\b', sentence1, re.IGNORECASE):
            # Replace the name
            new_sentence1, replaced = replace_name(sentence1, name, gender)
            if replaced:
                df.at[idx, 'Sentence1'] = new_sentence1
                replacement_count += 1
                break  # Replace only the first matching name in each sentence

# Save the modified dataset
df.to_csv('Final_sentences_dataset.csv', index=False)

print(f"Total replacements made: {replacement_count}")
print(f"Modified dataset saved to 'replaced_dataset.csv'")

Total replacements made: 223
Modified dataset saved to 'replaced_dataset.csv'


In [78]:
label_counts = df1['bias_label'].value_counts().sort_index()
print(f"Not biased (0): {label_counts.get(0, 0)}")
print(f"Biased (1): {label_counts.get(1, 0)}")

Not biased (0): 1760
Biased (1): 1659
