In [27]:
import json
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
import spacy
nlp = spacy.load("en_core_web_sm")
import random
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\celin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# Location of the textual sarcasm dataset (a JSON object keyed by utterance id)
file_path = "data/sarcasm_data.json"

# JSON text is UTF-8 by specification. Passing the encoding explicitly makes the
# file decode the same way on every platform instead of relying on the locale
# default (e.g. cp1252 on Windows), which would garble non-ASCII characters.
with open(file_path, encoding="utf-8") as f:
    data = json.load(f)

In [29]:
# Peek at a single record to see which fields every entry carries
first_record = next(iter(data.items()), None)
if first_record is not None:
    print(*first_record)

1_60 {'utterance': "It's just a privilege to watch your mind at work.", 'speaker': 'SHELDON', 'context': ['I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.', "My apologies. What's your plan?"], 'context_speakers': ['LEONARD', 'SHELDON'], 'show': 'BBT', 'sarcasm': True}


For our project, it is essential to be able to label each utterance with the gender of its speaker. We therefore remove all entries whose speaker's gender is ambiguous (such as 'Person1' or 'Moderator'). This removes 57 utterances, which we deem acceptable given the size of our dataset.

In [30]:
# Drop every utterance whose speaker label does not identify a gender
ambiguous_speakers = ('PERSON', 'PERSON1', 'PERSON3', 'MODERATOR')
filtered_data = {}
for utterance_id, record in data.items():
    if record['speaker'] in ambiguous_speakers:
        continue
    filtered_data[utterance_id] = record

We then assign gender labels to each speaker, and thus each utterance. This was done by hand as there were only 17 different speakers in our dataset.

In [31]:
# GENDER LABELING
# Hand-made (speaker, gender label) pairs, one per speaker
_speaker_genders = (
    ('SHELDON', 'M'),
    ('PENNY', 'F'),
    ('HOWARD', 'M'),
    ('LEONARD', 'M'),
    ('RAJ', 'M'),
    ('BERNADETTE', 'F'),
    ('AMY', 'F'),
    ('CHANDLER', 'M'),
    ('ROSS', 'M'),
    ('MONICA', 'F'),
    ('JOEY', 'M'),
    ('RACHEL', 'F'),
    ('PHOEBE', 'F'),
    ('DOROTHY', 'F'),
    ('ROSE', 'F'),
    ('MEMBER-GIRL', 'F'),
    ('MEMBER-BOY', 'M'),
)

# Dictionary used to look up the gender label ('M' or 'F') of a speaker
gender_mapping = dict(_speaker_genders)


def map_gender(speaker):
    """Return the gender label of `speaker`, or 'Unknown' when it is not in `gender_mapping`."""
    try:
        return gender_mapping[speaker]
    except KeyError:
        return 'Unknown'

# Tag every remaining entry with its speaker's gender under a new 'gender' key
for record in filtered_data.values():
    record.update(gender=map_gender(record['speaker']))

In [32]:
# Split the cleaned data into one dictionary per speaker gender
F_data, M_data = (
    {utterance_id: record for utterance_id, record in filtered_data.items() if record['gender'] == label}
    for label in ('F', 'M')
)

In [33]:
# Report how many utterances each gender contributes
for description, subset in (("men", M_data), ("women", F_data)):
    print(f"Number of utterances by {description}: {len(subset)}")

Number of utterances by men: 430
Number of utterances by women: 203


In [8]:
# Persist the cleaned data: one file per gender plus the combined ("mixed") set
cleaned_outputs = (
    ('data/F_data.json', F_data),
    ('data/M_data.json', M_data),
    ('data/mixed_data.json', filtered_data),
)
for out_path, payload in cleaned_outputs:
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)

In this project we will build 3 different models:
- mixed model (trained on male + female)
- male model
- female model

In order for the models to be unbiased, the training datasets have to be balanced, i.e. there should be the same number of sarcastic and non-sarcastic utterances.

In addition, to avoid a bias towards one gender, the mixed dataset should also be balanced according to gender.

Let's check if this is the case.

In [9]:
# Checking if the datasets are balanced
def _sarcasm_counts(dataset):
    """Return (n_sarcastic, n_non_sarcastic) for a dict mapping utterance ids to entries.

    An entry counts as sarcastic when its 'sarcasm' field is truthy.
    """
    n_sarcastic = sum(1 for entry in dataset.values() if entry['sarcasm'])
    return n_sarcastic, len(dataset) - n_sarcastic


print('Checking label balance within the different datasets:')

# (dataset, message when balanced, label for the sarcastic count, label for the non-sarcastic count)
label_checks = (
    (data, 'The original dataset is balanced',
     'Original sarcastic utterances:', 'Original non sarcastic utterances:'),
    (filtered_data, 'The filtered dataset is balanced',
     'Sarcastic utterances:', 'Non sarcastic utterances:'),
    (M_data, 'The male dataset is balanced',
     'Male sarcastic utterances:', 'Male non sarcastic utterances:'),
    (F_data, 'The female dataset is balanced',
     'Female sarcastic utterances:', 'Female non sarcastic utterances:'),
)
for dataset, balanced_message, sarcastic_label, non_sarcastic_label in label_checks:
    n_sarcastic, n_non_sarcastic = _sarcasm_counts(dataset)
    if n_sarcastic == n_non_sarcastic:
        print(balanced_message)
    else:
        # Always show the counts of an unbalanced dataset (the original dataset
        # used to be skipped silently in that case)
        print(sarcastic_label, n_sarcastic)
        print(non_sarcastic_label, n_non_sarcastic)

print("")

# Checking if the mixed dataset is balanced by gender
n_male_utterances = sum(1 for entry in filtered_data.values() if entry['gender'] == 'M')
n_female_utterances = sum(1 for entry in filtered_data.values() if entry['gender'] == 'F')
if n_male_utterances == n_female_utterances:
    # The counts come from `filtered_data`, so name that dataset in the message
    print('The filtered dataset is balanced by gender')
else:
    print('Checking gender balance of the filtered dataset:')
    print('Male utterances:', n_male_utterances)
    print('Female utterances:', n_female_utterances)

Checking label balance within the different datasets:
The original dataset is balanced
Sarcastic utterances: 326
Non sarcastic utterances: 307
Male sarcastic utterances: 224
Male non sarcastic utterances: 206
Female sarcastic utterances: 102
Female non sarcastic utterances: 101

Checking gender balance of the filtered dataset:
Male utterances: 430
Female utterances: 203


As we can see the mixed dataset is not at all balanced by gender, as there are twice as many male utterances as female utterances. It is also not quite balanced in terms of sarcasm labels.

The male dataset is also not quite balanced, as there are a few more sarcastic utterances than non-sarcastic utterances.

The female dataset can be considered balanced, as the difference between sarcastic and non-sarcastic utterances is only 1.

To balance the datasets we will use resampling and data augmentation.

In [10]:
# Some functions needed for data augmentation
INCLUDED_POS_TAGS = {"VERB", "NOUN", "ADJ"}


def get_synonyms(word):
    """Return the WordNet lemma names found for `word`, without duplicates.

    Names are kept in the order WordNet yields them (synset by synset, lemma by
    lemma), so the result is identical on every run. The list may contain `word`
    itself, and multi-word lemmas are spelled with underscores (e.g. 'motor_car').
    Returns an empty list when WordNet has no synset for `word`.
    """
    ordered_names = {}
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # A dict keeps insertion order; a set would shuffle the candidates
            ordered_names.setdefault(lemma.name(), None)
    return list(ordered_names)


def _pick_synonym(word):
    """Return the first synonym of `word` that differs from it (case-insensitively), or None."""
    for candidate in get_synonyms(word):
        # Turn 'motor_car' into 'motor car' so it can be dropped into running text
        candidate = candidate.replace('_', ' ')
        if candidate.lower() != word.lower():
            return candidate
    return None


def replace_with_synonyms(sentence, threshold=0.6):
    """Return `sentence` with content words replaced by a close WordNet synonym.

    Only alphabetic, non-stop-word tokens tagged as verb, noun or adjective are
    considered. A token is replaced by its first WordNet synonym that is not the
    word itself, provided the spaCy similarity between word and synonym is
    strictly greater than `threshold`. All other tokens, and the original
    spacing between tokens, are kept unchanged.

    Parameters:
        sentence: the text to augment.
        threshold: minimum similarity (exclusive) required to accept a synonym.

    Returns:
        The augmented sentence as a string.

    NOTE(review): the notebook loads `en_core_web_sm`; spaCy documents that the
    small pipelines ship without static word vectors, so this similarity score
    is only a rough proxy. Confirm whether a `_md`/`_lg` pipeline should be used.
    """
    doc = nlp(sentence)
    pieces = []

    for token in doc:
        replacement = token.text
        if token.text.isalpha() and not token.is_stop and token.pos_ in INCLUDED_POS_TAGS:
            synonym = _pick_synonym(token.text)
            if synonym is not None:
                # Compare the isolated word with the isolated synonym
                similarity = nlp(token.text).similarity(nlp(synonym))
                if similarity > threshold:
                    replacement = synonym
        # Re-attach the whitespace that followed the token in the input, so that
        # punctuation and clitics ("It's", "work.") are not split off by spaces
        pieces.append(replacement + token.whitespace_)

    return ''.join(pieces)

In [11]:
# DATA AUGMENTATION OF THE MALE DATASET

# Load the data from the JSON files
with open('data/M_data.json', encoding='utf-8') as file:
    M_data = json.load(file)

# Dedicated, seeded generator: the same utterances are picked on every
# "Restart & Run All" and the global `random` state is left untouched
male_rng = random.Random(42)

# The non-sarcastic class is topped up with augmented copies until it matches
# the sarcastic class. The number to add is derived from the data instead of
# being hard-coded, so it stays correct if the upstream filtering changes.
sarcastic_utterances = [(key, value) for key, value in M_data.items() if value['sarcasm']]
non_sarcastic_utterances = [(key, value) for key, value in M_data.items() if not value['sarcasm']]
n_missing = max(0, len(sarcastic_utterances) - len(non_sarcastic_utterances))
selected_keys = male_rng.sample(non_sarcastic_utterances, k=n_missing)

# Data augmentation: only the utterance text changes, all other fields are copied
copied_fields = ('speaker', 'context', 'context_speakers', 'show', 'sarcasm', 'gender')
augmented_data = {}
for key, value in selected_keys:
    augmented_utterance = replace_with_synonyms(value['utterance'])
    # The 'A' prefix marks augmented entries and keeps their keys unique
    augmented_data['A' + key] = {
        'utterance': augmented_utterance,
        **{field: value[field] for field in copied_fields},
    }

# Combining the original data with the augmented data
M_combined_data = {**M_data, **augmented_data}

# TODO: decide whether augmented utterances in which no synonym was substituted
# (i.e. near-duplicates of their source utterance) should be dropped.

# Saving the combined data to a JSON file
with open('data/M_data_enriched.json', 'w') as f:
    json.dump(M_combined_data, f, indent=4)

  similarity = word_nlp.similarity(synonym_nlp)


In [12]:
# Verify the label balance of the augmented male dataset
male_labels = [bool(entry['sarcasm']) for entry in M_combined_data.values()]
n_sarcastic_utterances = male_labels.count(True)
n_non_sarcastic_utterances = male_labels.count(False)

print('Male sarcastic utterances after augmentation:', n_sarcastic_utterances)
print('Male non sarcastic utterances after augmentation:', n_non_sarcastic_utterances)
if not (n_sarcastic_utterances - n_non_sarcastic_utterances):
    print('The male dataset is now balanced')

Male sarcastic utterances after augmentation: 224
Male non sarcastic utterances after augmentation: 224
The male dataset is now balanced


In [13]:
# DATA AUGMENTATION OF THE MIXED DATASET

# Load the data from the JSON files
with open('data/mixed_data.json', encoding='utf-8') as file:
    mixed_data = json.load(file)

# Dedicated, seeded generator: the same utterances are picked on every
# "Restart & Run All" and the global `random` state is left untouched
mixed_rng = random.Random(42)

# Female utterances are the minority. Augmented copies of them are added until
# both genders have the same number of utterances; the number to add is derived
# from the data instead of being hard-coded.
female_utterances = [(key, value) for key, value in mixed_data.items() if value['gender'] == 'F']
n_male = sum(1 for value in mixed_data.values() if value['gender'] == 'M')
n_missing = max(0, n_male - len(female_utterances))

# First pass: one augmented copy of (at most) every female utterance, in random order
selected_keys1 = mixed_rng.sample(female_utterances, k=min(n_missing, len(female_utterances)))

# Second pass: whatever is still missing is drawn from the non-sarcastic female
# utterances only. `sample` raises ValueError if more are needed than exist.
female_non_sarcastic_utterances = [(key, value) for key, value in mixed_data.items() if value['gender'] == 'F' and not value['sarcasm']]
selected_keys2 = mixed_rng.sample(female_non_sarcastic_utterances, k=n_missing - len(selected_keys1))
# The extra 'a' keeps second-pass keys ('Aa…') distinct from first-pass keys ('A…')
selected_keys2 = [('a' + key, value) for key, value in selected_keys2]
selected_keys = selected_keys1 + selected_keys2

# Data augmentation: only the utterance text changes, all other fields are copied
copied_fields = ('speaker', 'context', 'context_speakers', 'show', 'sarcasm', 'gender')
augmented_data = {}
for key, value in selected_keys:
    augmented_utterance = replace_with_synonyms(value['utterance'])
    # The 'A' prefix marks augmented entries and keeps their keys unique
    augmented_data['A' + key] = {
        'utterance': augmented_utterance,
        **{field: value[field] for field in copied_fields},
    }

# Combining the original data with the augmented data
mixed_combined_data = {**mixed_data, **augmented_data}

# TODO: decide whether duplicates should be dropped. NOTE(review): second-pass
# utterances were already augmented in the first pass, so each 'Aa…' entry
# presumably carries the same text as the matching 'A…' entry.

# Saving the combined data to a JSON file
with open('data/mixed_data_enriched.json', 'w') as f:
    json.dump(mixed_combined_data, f, indent=4)

  similarity = word_nlp.similarity(synonym_nlp)


In [14]:
# Verify gender and label balance of the augmented mixed dataset
mixed_entries = list(mixed_combined_data.values())
n_female_utterances = sum(1 for entry in mixed_entries if entry['gender'] == 'F')
# Everything that is not labelled 'F' is counted as male
n_male_utterances = len(mixed_entries) - n_female_utterances
n_sarcastic_utterances = sum(1 for entry in mixed_entries if entry['sarcasm'])
n_non_sarcastic_utterances = sum(1 for entry in mixed_entries if not entry['sarcasm'])

for description, count in (
    ('Utterances by men in the mixed dataset after augmentation:', n_male_utterances),
    ('Utterances by women in the mixed dataset after augmentation:', n_female_utterances),
    ('Sarcastic utterances in the mixed dataset after augmentation:', n_sarcastic_utterances),
    ('Non sarcastic utterances in the mixed dataset after augmentation:', n_non_sarcastic_utterances),
):
    print(description, count)


Utterances by men in the mixed dataset after augmentation: 430
Utterances by women in the mixed dataset after augmentation: 430
Sarcastic utterances in the mixed dataset after augmentation: 428
Non sarcastic utterances in the mixed dataset after augmentation: 432


We have managed to balance the male dataset using data augmentation, so we will use the augmented dataset for training our models.

We have been able to balance the mixed dataset by gender. The balancing by sarcasm is not perfect, but it can be considered sufficient relative to the size of this dataset. We will therefore also use the augmented dataset for training with mixed data.

For female data, we can keep using the non-augmented dataset.

For training, we will split each dataset into 3 subsets: training, validation, and testing. To make sure that these three subsets are also balanced, we will define them here.

In [23]:
# Making sure the training, validation and test datasets are also balanced.

# Mixed
# Fixed seed so the subsets written to disk are identical on every run
MIXED_SPLIT_SEED = 42

# One stratum per (gender, sarcasm label) combination. Each stratum is split on
# its own so that every subset keeps the gender and label proportions.
female_sarcastic_utterances = [(key, value) for key, value in mixed_combined_data.items() if value['gender'] == 'F' and value['sarcasm']]
female_non_sarcastic_utterances = [(key, value) for key, value in mixed_combined_data.items() if value['gender'] == 'F' and not value['sarcasm']]
male_sarcastic_utterances = [(key, value) for key, value in mixed_combined_data.items() if value['gender'] == 'M' and value['sarcasm']]
male_non_sarcastic_utterances = [(key, value) for key, value in mixed_combined_data.items() if value['gender'] == 'M' and not value['sarcasm']]

mixed_strata = {
    'FS': female_sarcastic_utterances,
    'FnS': female_non_sarcastic_utterances,
    'MS': male_sarcastic_utterances,
    'MnS': male_non_sarcastic_utterances,
}

# NOTE(review): augmented entries (keys starting with 'A') are split
# independently of the utterance they were derived from, so an original and its
# augmented copy can land in different subsets. Check whether this overlap
# between training and evaluation data is acceptable.
mixed_train, mixed_val, mixed_test = {}, {}, {}
for stratum, utterances in mixed_strata.items():
    # 70/15/15 split: hold out 30 %, then halve the hold-out into test and validation
    mixed_train[stratum], holdout = train_test_split(
        utterances, test_size=0.3, random_state=MIXED_SPLIT_SEED)
    mixed_test[stratum], mixed_val[stratum] = train_test_split(
        holdout, test_size=0.5, random_state=MIXED_SPLIT_SEED)

# Merge the four strata of each subset back into one {key: entry} dictionary
train_set_mixed = dict(pair for part in mixed_train.values() for pair in part)
val_set_mixed = dict(pair for part in mixed_val.values() for pair in part)
test_set_mixed = dict(pair for part in mixed_test.values() for pair in part)

# Training-set size of each stratum, in the order FS, FnS, MS, MnS
print("Mixed dataset:", *(len(part) for part in mixed_train.values()))

Mixed dataset: 142 158 156 144


In [None]:
# Male
# Fixed seed so the subsets written to disk are identical on every run
M_SPLIT_SEED = 42

# Split each label separately so that every subset keeps the label proportions
M_sarcastic_utterances = [(key, value) for key, value in M_combined_data.items() if value['gender'] == 'M' and value['sarcasm']]
M_non_sarcastic_utterances = [(key, value) for key, value in M_combined_data.items() if value['gender'] == 'M' and not value['sarcasm']]

# 80/10/10 split: hold out 20 %, then halve the hold-out into test and validation.
# NOTE(review): the mixed dataset is split 70/15/15 (test_size=0.3) - confirm
# that the different ratio here is intended.
MS_train, MS_test_val = train_test_split(M_sarcastic_utterances, test_size=0.2, random_state=M_SPLIT_SEED)
MS_test, MS_val = train_test_split(MS_test_val, test_size=0.5, random_state=M_SPLIT_SEED)

MnS_train, MnS_test_val = train_test_split(M_non_sarcastic_utterances, test_size=0.2, random_state=M_SPLIT_SEED)
MnS_test, MnS_val = train_test_split(MnS_test_val, test_size=0.5, random_state=M_SPLIT_SEED)

# Merge both labels of each subset back into one {key: entry} dictionary
train_set_M = dict(MS_train + MnS_train)
val_set_M = dict(MS_val + MnS_val)
test_set_M = dict(MS_test + MnS_test)

In [36]:
# Female
# Fixed seed so the subsets written to disk are identical on every run
F_SPLIT_SEED = 42

# Split each label separately so that every subset keeps the label proportions
F_sarcastic_utterances = [(key, value) for key, value in F_data.items() if value['gender'] == 'F' and value['sarcasm']]
F_non_sarcastic_utterances = [(key, value) for key, value in F_data.items() if value['gender'] == 'F' and not value['sarcasm']]

# 80/10/10 split: hold out 20 %, then halve the hold-out into test and validation.
# NOTE(review): the mixed dataset is split 70/15/15 (test_size=0.3) - confirm
# that the different ratio here is intended.
FS_train, FS_test_val = train_test_split(F_sarcastic_utterances, test_size=0.2, random_state=F_SPLIT_SEED)
FS_test, FS_val = train_test_split(FS_test_val, test_size=0.5, random_state=F_SPLIT_SEED)

FnS_train, FnS_test_val = train_test_split(F_non_sarcastic_utterances, test_size=0.2, random_state=F_SPLIT_SEED)
FnS_test, FnS_val = train_test_split(FnS_test_val, test_size=0.5, random_state=F_SPLIT_SEED)

# Merge both labels of each subset back into one {key: entry} dictionary
train_set_F = dict(FS_train + FnS_train)
val_set_F = dict(FS_val + FnS_val)
test_set_F = dict(FS_test + FnS_test)

In [37]:
# Write every train/validation/test subset to its own JSON file for the training notebooks
subset_files = (
    ('data/mixed_train_set.json', train_set_mixed),
    ('data/mixed_val_set.json', val_set_mixed),
    ('data/mixed_test_set.json', test_set_mixed),
    ('data/M_train_set.json', train_set_M),
    ('data/M_val_set.json', val_set_M),
    ('data/M_test_set.json', test_set_M),
    ('data/F_train_set_alternative.json', train_set_F),
    ('data/F_val_set_alternative.json', val_set_F),
    ('data/F_test_set_alternative.json', test_set_F),
)
for out_path, subset in subset_files:
    with open(out_path, 'w') as f:
        json.dump(subset, f, indent=4)

In [26]:
print("Let's check the size of the different datasets: ")

# (label prefix, full dataset, training subset, validation subset, test subset)
size_report = (
    ("Mixed", mixed_combined_data, train_set_mixed, val_set_mixed, test_set_mixed),
    ("M", M_combined_data, train_set_M, val_set_M, test_set_M),
    ("F", F_data, train_set_F, val_set_F, test_set_F),
)
for position, (prefix, full_set, train_part, val_part, test_part) in enumerate(size_report):
    if position:
        # Blank line between two dataset groups
        print("")
    print(f"{prefix} data total {len(full_set)}")
    print(f"{prefix} train {len(train_part)}")
    print(f"{prefix} val {len(val_part)}")
    print(f"{prefix} test {len(test_part)}")

Let's check the size of the different datasets: 
Mixed data total 860
Mixed train 600
Mixed val 130
Mixed test 130

M data total 448
M train 312
M val 68
M test 68

F data total 203
F train 141
F val 32
F test 30


In [24]:
# Checking if the datasets are balanced
print('Checking label balance within the different datasets:')

# For every dataset: (subset name used in messages, short tag, subset)
label_checks = (
    ('mixed', (('training', 'train', train_set_mixed),
               ('testing', 'test', test_set_mixed),
               ('validation', 'val', val_set_mixed))),
    ('male', (('training', 'train', train_set_M),
              ('testing', 'test', test_set_M),
              ('validation', 'val', val_set_M))),
    ('female', (('training', 'train', train_set_F),
                ('testing', 'test', test_set_F),
                ('validation', 'val', val_set_F))),
)
for dataset_name, subsets in label_checks:
    for subset_name, tag, subset in subsets:
        # Both counts come from the SAME subset (the mixed training check used
        # to compare the test-set count with the training-set count)
        n_sarcastic = sum(1 for value in subset.values() if value['sarcasm'])
        n_non_sarcastic = len(subset) - n_sarcastic
        if n_sarcastic == n_non_sarcastic:
            print(f'The {dataset_name} {subset_name} dataset is balanced')
        else:
            # Report every unbalanced subset instead of staying silent
            print(f'The {dataset_name} {subset_name} dataset is NOT balanced')
            print(f'{dataset_name} {tag} - sarcastic utterances:', n_sarcastic)
            print(f'{dataset_name} {tag} - non sarcastic utterances:', n_non_sarcastic)
    print("")

# Checking if the mixed dataset is balanced by gender
gender_checks = (
    ('training', train_set_mixed),
    ('testing', test_set_mixed),
    ('validation', val_set_mixed),
)
for subset_name, subset in gender_checks:
    n_male_utterances = sum(1 for value in subset.values() if value['gender'] == 'M')
    n_female_utterances = sum(1 for value in subset.values() if value['gender'] == 'F')
    if n_male_utterances == n_female_utterances:
        # Name the subset that was actually checked (the validation check used
        # to print "training")
        print(f'The mixed {subset_name} dataset is balanced by gender')
    else:
        print(f'Checking gender balance of the mixed {subset_name} dataset:')
        print('Male utterances:', n_male_utterances)
        print('Female utterances:', n_female_utterances)

Checking label balance within the different datasets:
The mixed training dataset is NOT balanced
mixed train - sarcastic utterances: 298
mixed train - non sarcastic utterances: 302
The mixed testing dataset is balanced
The mixed validation dataset is balanced

The male training dataset is balanced
The male testing dataset is balanced
The male validation dataset is balanced

The female training dataset is NOT balanced
female train - sarcastic utterances: 71
female train - non sarcastic utterances: 70
The female testing dataset is balanced
The female validation dataset is balanced

The mixed training dataset is balanced by gender
The mixed testing dataset is balanced by gender
The mixed training dataset is balanced by gender
