In [47]:
import pandas as pd
import numpy as np
import json
import random
from sklearn.model_selection import train_test_split

In [2]:
# Loading JSON files: the mixed dataset plus the two datasets that the cells
# below report on as the female (F) and male (M) datasets.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the files.
folder = "data/"

with open(folder + 'mixed_data.json', encoding='utf-8') as f:
    mixed_data = json.load(f)

with open(folder + 'F_data.json', encoding='utf-8') as f:
    F_data = json.load(f)

with open(folder + 'M_data.json', encoding='utf-8') as f:
    M_data = json.load(f)

In [4]:
# Report the sarcastic / non-sarcastic split of every loaded dataset, then the
# male / female split of the mixed dataset. Counts are only printed when the
# two sides differ; otherwise a one-line "balanced" message is shown.
print('Checking label balance within the different datasets:')

# Mixed dataset: every entry is either sarcastic or not, so the second count
# is the remainder of the first.
n_sarcastic_utterances = sum(1 for entry in mixed_data.values() if entry['sarcasm'])
n_non_sarcastic_utterances = len(mixed_data) - n_sarcastic_utterances
if n_sarcastic_utterances != n_non_sarcastic_utterances:
    print('Mixed sarcastic utterances:', n_sarcastic_utterances)
    print('Mixed non sarcastic utterances:', n_non_sarcastic_utterances)
else:
    print('The mixed dataset is balanced')

# Male dataset
M_n_sarcastic_utterances = sum(1 for entry in M_data.values() if entry['sarcasm'])
M_n_non_sarcastic_utterances = len(M_data) - M_n_sarcastic_utterances
if M_n_sarcastic_utterances != M_n_non_sarcastic_utterances:
    print('Male sarcastic utterances:', M_n_sarcastic_utterances)
    print('Male non sarcastic utterances:', M_n_non_sarcastic_utterances)
else:
    print('The male dataset is balanced')

# Female dataset
F_n_sarcastic_utterances = sum(1 for entry in F_data.values() if entry['sarcasm'])
F_n_non_sarcastic_utterances = len(F_data) - F_n_sarcastic_utterances
if F_n_sarcastic_utterances != F_n_non_sarcastic_utterances:
    print('Female sarcastic utterances:', F_n_sarcastic_utterances)
    print('Female non sarcastic utterances:', F_n_non_sarcastic_utterances)
else:
    print('The female dataset is balanced')

print("")

# Gender balance of the mixed dataset (summing booleans counts the matches).
n_male_utterances = sum(entry['gender'] == 'M' for entry in mixed_data.values())
n_female_utterances = sum(entry['gender'] == 'F' for entry in mixed_data.values())
if n_male_utterances != n_female_utterances:
    print('Checking gender balance of the filtered dataset:')
    print('Male utterances:', n_male_utterances)
    print('Female utterances:', n_female_utterances)
else:
    print('The original dataset is balanced by gender')

Checking label balance within the different datasets:
Mixed sarcastic utterances: 545
Mixed non sarcastic utterances: 564
Male sarcastic utterances: 360
Male non sarcastic utterances: 375
Female sarcastic utterances: 185
Female non sarcastic utterances: 189

Checking gender balance of the filtered dataset:
Male utterances: 735
Female utterances: 374


We can consider the datasets balanced by sarcasm label: in each of them, the sarcastic and non-sarcastic counts differ by a few percent at most.  
However, the mixed dataset is not balanced by gender: it contains 735 male utterances but only 374 female utterances. To avoid degrading data quality, we chose to perform undersampling. In other words, we randomly drop male utterances until the mixed dataset contains as many male utterances as female utterances.  

In [44]:
# Split the mixed dataset's (key, value) pairs by gender, preserving the
# dataset's iteration order inside each group.
male_utterances = list(filter(lambda item: item[1]['gender'] == 'M', mixed_data.items()))
female_utterances = list(filter(lambda item: item[1]['gender'] == 'F', mixed_data.items()))

# Undersampling of male utterances: draw (without replacement) exactly as many
# male utterances as there are female ones. The seed makes the draw repeatable.
random.seed(42)
selected_male_utterances = random.sample(male_utterances, len(female_utterances))

# Rebuild a dict: sampled male entries first, then all female entries.
balanced_mixed_data = dict(selected_male_utterances)
balanced_mixed_data.update(female_utterances)

In [46]:
# Check the result of the undersampling of the mixed dataset: count the
# utterances per gender and per sarcasm label in balanced_mixed_data.
# Men are counted with an explicit == 'M' test (as in the other cells) so that
# an entry with any other gender value is not silently counted as male.
# NOTE: the printed messages say "augmentation" although the preceding step is
# an undersampling; the wording is kept so the recorded output still matches.
n_female_utterances = sum(1 for entry in balanced_mixed_data.values() if entry['gender'] == 'F')
n_male_utterances = sum(1 for entry in balanced_mixed_data.values() if entry['gender'] == 'M')
n_sarcastic_utterances = sum(1 for entry in balanced_mixed_data.values() if entry['sarcasm'])
n_non_sarcastic_utterances = sum(1 for entry in balanced_mixed_data.values() if not entry['sarcasm'])

print('Utterances by men in the mixed dataset after augmentation:', n_male_utterances)
print('Utterances by women in the mixed dataset after augmentation:', n_female_utterances)
print('Sarcastic utterances in the mixed dataset after augmentation:', n_sarcastic_utterances)
print('Non sarcastic utterances in the mixed dataset after augmentation:', n_non_sarcastic_utterances)

Utterances by men in the mixed dataset after augmentation: 374
Utterances by women in the mixed dataset after augmentation: 374
Sarcastic utterances in the mixed dataset after augmentation: 373
Non sarcastic utterances in the mixed dataset after augmentation: 375


In [48]:
# Making sure the training, validation and test datasets are also balanced:
# the balanced mixed dataset is first divided into its four
# (gender, sarcasm) groups, each group is split on its own, and the per-group
# splits are merged afterwards.
# Fixed seed for every train_test_split call so that re-running the notebook
# reproduces exactly the same train / validation / test files.
SPLIT_SEED = 42

female_sarcastic_utterances = [(key, value) for key, value in balanced_mixed_data.items() if value['gender'] == 'F' and value['sarcasm']]
female_non_sarcastic_utterances = [(key, value) for key, value in balanced_mixed_data.items() if value['gender'] == 'F' and not value['sarcasm']]
male_sarcastic_utterances = [(key, value) for key, value in balanced_mixed_data.items() if value['gender'] == 'M' and value['sarcasm']]
male_non_sarcastic_utterances = [(key, value) for key, value in balanced_mixed_data.items() if value['gender'] == 'M' and not value['sarcasm']]

# Splitting the data into 70% training, 15% validation and 15% test
# (approximately: group sizes are rounded by train_test_split).
# Step 1 holds out 30% of each group; step 2 halves that hold-out into the
# test part (first returned list) and the validation part (second returned list).
FS_train, FS_test_val = train_test_split(female_sarcastic_utterances, test_size = 0.3, random_state = SPLIT_SEED)
FS_test, FS_val = train_test_split(FS_test_val, test_size = 0.5, random_state = SPLIT_SEED)

FnS_train, FnS_test_val = train_test_split(female_non_sarcastic_utterances, test_size = 0.3, random_state = SPLIT_SEED)
FnS_test, FnS_val = train_test_split(FnS_test_val, test_size = 0.5, random_state = SPLIT_SEED)

MS_train, MS_test_val = train_test_split(male_sarcastic_utterances, test_size = 0.3, random_state = SPLIT_SEED)
MS_test, MS_val = train_test_split(MS_test_val, test_size = 0.5, random_state = SPLIT_SEED)

MnS_train, MnS_test_val = train_test_split(male_non_sarcastic_utterances, test_size = 0.3, random_state = SPLIT_SEED)
MnS_test, MnS_val = train_test_split(MnS_test_val, test_size = 0.5, random_state = SPLIT_SEED)

# Merge the four per-group pieces of each split back into one dict per split.
train_set_mixed = dict(FS_train + FnS_train + MS_train + MnS_train)
val_set_mixed = dict(FS_val + FnS_val + MS_val + MnS_val)
test_set_mixed =  dict(FS_test + FnS_test + MS_test + MnS_test)

In [50]:
# Show the number of entries in the balanced dataset and in each of its splits.
print("Let's check the size of the different datasets: ")

for split_label, split_data in (
    ("Mixed data total", balanced_mixed_data),
    ("Mixed train", train_set_mixed),
    ("Mixed val", val_set_mixed),
    ("Mixed test", test_set_mixed),
):
    print(split_label, len(split_data))

Let's check the size of the different datasets: 
Mixed data total 748
Mixed train 522
Mixed val 114
Mixed test 112


In [53]:
# Checking if the train / test / validation splits are balanced, first by
# sarcasm label and then by gender. A balanced split gets a one-line
# confirmation; an unbalanced one now prints both counts instead of staying
# silent, so "unbalanced" can be told apart from "not checked".
print('Checking label balance within the different datasets:')
print('mixed train - sarcastic utterances:', sum(1 for entry in train_set_mixed.values() if entry['sarcasm']))
print('mixed train - non sarcastic utterances:', sum(1 for entry in train_set_mixed.values() if not entry['sarcasm']))

n_sarcastic_utterances = sum(1 for entry in test_set_mixed.values() if entry['sarcasm'])
n_non_sarcastic_utterances = sum(1 for entry in test_set_mixed.values() if not entry['sarcasm'])
if n_sarcastic_utterances == n_non_sarcastic_utterances:
    print('The mixed testing dataset is balanced')
else:
    print('mixed test - sarcastic utterances:', n_sarcastic_utterances)
    print('mixed test - non sarcastic utterances:', n_non_sarcastic_utterances)

n_sarcastic_utterances = sum(1 for entry in val_set_mixed.values() if entry['sarcasm'])
n_non_sarcastic_utterances = sum(1 for entry in val_set_mixed.values() if not entry['sarcasm'])
if n_sarcastic_utterances == n_non_sarcastic_utterances:
    print('The mixed validation dataset is balanced')
else:
    print('mixed val - sarcastic utterances:', n_sarcastic_utterances)
    print('mixed val - non sarcastic utterances:', n_non_sarcastic_utterances)

print("")

# Checking if each split of the mixed dataset is balanced by gender
n_male_utterances = sum(1 for entry in train_set_mixed.values() if entry['gender'] == 'M')
n_female_utterances = sum(1 for entry in train_set_mixed.values() if entry['gender'] == 'F')
if n_male_utterances == n_female_utterances:
    print('The mixed training dataset is balanced by gender')
else:
    print('Checking gender balance of the filtered dataset:')
    print('Male utterances:', n_male_utterances)
    print('Female utterances:', n_female_utterances)

n_male_utterances = sum(1 for entry in test_set_mixed.values() if entry['gender'] == 'M')
n_female_utterances = sum(1 for entry in test_set_mixed.values() if entry['gender'] == 'F')
if n_male_utterances == n_female_utterances:
    print('The mixed testing dataset is balanced by gender')
else:
    print('mixed test - male utterances:', n_male_utterances)
    print('mixed test - female utterances:', n_female_utterances)

n_male_utterances = sum(1 for entry in val_set_mixed.values() if entry['gender'] == 'M')
n_female_utterances = sum(1 for entry in val_set_mixed.values() if entry['gender'] == 'F')
if n_male_utterances == n_female_utterances:
    # This message previously said "training" although it reports on the
    # validation split.
    print('The mixed validation dataset is balanced by gender')
else:
    print('mixed val - male utterances:', n_male_utterances)
    print('mixed val - female utterances:', n_female_utterances)

Checking label balance within the different datasets:
mixed train - sarcastic utterances: 260
mixed train - non sarcastic utterances: 262
The mixed testing dataset is balanced
The mixed validation dataset is balanced

The mixed training dataset is balanced by gender
The mixed testing dataset is balanced by gender
The mixed training dataset is balanced by gender


In [49]:
# Write each split of the mixed dataset to its own JSON file (4-space indent)
# so it can be used in training.
for out_path, subset in (
    ('data/mixed_train_set.json', train_set_mixed),
    ('data/mixed_val_set.json', val_set_mixed),
    ('data/mixed_test_set.json', test_set_mixed),
):
    with open(out_path, 'w') as f:
        json.dump(subset, f, indent=4)