In [4]:
import csv
import string

import pandas as pd

In [3]:
# Marker words used to guess the gender a text refers to. Matching is done on
# lower-cased tokens, so every entry here must be lower case.
# 'he' is included so the male list mirrors 'she' in the female list; without it
# the most common male pronoun would never be counted.
male_words = ['he', 'him', 'his', 'himself', 'man', 'men', 'father', 'brother', 'son', 'husband', 'boy', 'male']
female_words = ['she', 'her', 'hers', 'herself', 'woman', 'women', 'mother', 'sister', 'daughter', 'wife', 'girl', 'female']

In [9]:
# Label every row of careers.csv with the gender whose marker words occur more
# often in that row, and write the rows (plus the new column) to
# careers_with_gender.csv.
with open('careers.csv', mode='r', newline='') as input_file, \
        open('careers_with_gender.csv', mode='w', newline='') as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)

    # Copy the header row to the output file with an additional "Gender" column.
    header = next(reader)
    header.append('Gender')
    writer.writerow(header)

    # Sets give constant-time membership tests inside the per-word loop.
    male_lookup = set(male_words)
    female_lookup = set(female_words)

    # Loop through each data row in the input file.
    for row in reader:
        male_count = 0
        female_count = 0
        # Join all the columns into a single string so every field is searched.
        text = ' '.join(row)
        for word in text.split():
            # Strip surrounding punctuation before comparing; otherwise tokens
            # such as "him," or "(she)" would never match a marker word.
            token = word.strip(string.punctuation).lower()
            if token in male_lookup:
                male_count += 1
            elif token in female_lookup:
                female_count += 1
        # Progress/debug output: print each non-zero count for this row.
        if male_count > 0:
            print(male_count)
        if female_count > 0:
            print(female_count)
        # The majority gender wins; ties (including 0 vs 0) leave the cell empty.
        if male_count > female_count:
            row.append('Male')
        elif female_count > male_count:
            row.append('Female')
        else:
            row.append('')
        writer.writerow(row)


1
3
4
3
1
3
4
1
1
2
2
1
3
4


In [10]:
# Read back the gender-annotated CSV written by the labelling step.
df = pd.read_csv(filepath_or_buffer="careers_with_gender.csv")

In [11]:
# Restrict the frame to the columns listed below, in this order.
columns_to_keep = [
    'Category',
    'Job Title',
    'Job Description',
    'Job URL',
    'Gender',
]
df = df.loc[:, columns_to_keep]

In [12]:
# Write the trimmed frame back to careers_with_gender.csv (overwriting the file
# read above), without the index column.
df.to_csv(path_or_buf='careers_with_gender.csv', index=False)

### Comparison with gendered prompts

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load careers_single.csv; note this rebinds `df`, replacing the frame loaded earlier.
df = pd.read_csv(filepath_or_buffer='careers_single.csv')

In [6]:
# Instantiate the sentence-transformer model from its model identifier.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

In [7]:
# Load the gendered stereotype prompts: one entry per line of the file, with
# leading/trailing whitespace (including the newline) removed.
with open('gendered_prompts.txt', 'r') as prompt_file:
    stereotype_sentences = list(map(str.strip, prompt_file))

In [8]:
# Encode the dataset's 'text' column and the stereotype prompts with the same model.
dataset_sentences = df['text'].tolist()
dataset_embeddings = model.encode(dataset_sentences)
stereotype_embeddings = model.encode(stereotype_sentences)

In [9]:
# Pairwise cosine similarity: rows follow the dataset sentences, columns follow
# the stereotype sentences.
similarity_scores = cosine_similarity(X=dataset_embeddings, Y=stereotype_embeddings)

In [10]:
# For every gendered prompt, list the k dataset sentences with the highest similarity.
k = 3  # top-k most similar sentences
for prompt_idx, prompt in enumerate(stereotype_sentences):
    print(f"Most similar sentences for gendered prompt '{prompt}':")
    column = similarity_scores[:, prompt_idx]
    # argsort is ascending: take the last k positions, then reverse for best-first order.
    best_rows = column.argsort()[-k:][::-1]
    for row_idx in best_rows:
        sentence = df.iloc[row_idx]['text']
        score = similarity_scores[row_idx, prompt_idx]
        print(f"\t- '{sentence}' (similarity score: {score:.2f})")
    print()

Most similar sentences for gendered prompt 'He is a doctor, she is a nurse.':
	- 'physician associates support doctors to diagnose and manage patients in hospitals and gp surgeries' (similarity score: 0.82)
	- 'practice nurses work in gp surgeries to assess screen treat and educate patients and help doctors give medical care' (similarity score: 0.79)
	- 'you could take on a specialist role like clinical technician working with healthcare professionals and patients on wards' (similarity score: 0.78)

Most similar sentences for gendered prompt 'Men are better at math and science than women.':
	- 'employers may be more interested in your practical ability than formal qualifications although itll help if you have gcses in subjects like english art and design maths and design and technology' (similarity score: 0.71)
	- 'employers may be more interested in your practical ability than formal qualifications although itll help if you have gcses in subjects like english art and design maths and 