In [9]:
import json
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
# Loading JSON files
folder = "data/"

# Decode the source corpus explicitly as UTF-8. JSON text is UTF-8 by
# specification, whereas open() without an encoding argument falls back to the
# platform's locale encoding and can mis-decode non-ASCII utterances.
with open(folder + 'reformatted_data.json', encoding='utf-8') as f:
    mixed_data = json.load(f)

### Summary of class counts used to plan the data split

*Female Utterances*
> Sarcastic:        185
> Non-Sarcastic:    189


*Male Utterances*
> Sarcastic:        360
> Non-Sarcastic:    375

**Female Only Model:**
> 184 Sarcastic Female, 184 Non-Sarcastic Female

**Male Only Model:**
> 184 Sarcastic Male, 184 Non-Sarcastic Male

**Mixed Model:**
> 92 Sarcastic Male, 92 Non-Sarcastic Male, 92 Sarcastic Female, 92 Non-Sarcastic Female

In [11]:
## Creating the Female Dataset

data = mixed_data

# Fixed seed so the same utterances are drawn on every run
random.seed(42)

# Female utterances labelled as sarcastic
filtered_entries_sarcastic = {}
for key, value in data.items():
    if value.get('gender') == "F" and value.get('sarcasm') == True:
        filtered_entries_sarcastic[key] = value

# Draw 184 of them at random; when the pool is too small, keep the whole pool
if len(filtered_entries_sarcastic) < 184:
    print(f"Only {len(filtered_entries_sarcastic)} entries meet the criteria. Selecting all available entries.")
    selected_entries_sarcastic = filtered_entries_sarcastic
else:
    sarcastic_pool = list(filtered_entries_sarcastic.items())
    selected_entries_sarcastic = dict(random.sample(sarcastic_pool, 184))

# Female utterances labelled as non-sarcastic
filtered_entries_non_sarcastic = {}
for key, value in data.items():
    if value.get('gender') == "F" and value.get('sarcasm') == False:
        filtered_entries_non_sarcastic[key] = value

# Same sampling rule for the non-sarcastic pool
if len(filtered_entries_non_sarcastic) < 184:
    print(f"Only {len(filtered_entries_non_sarcastic)} entries meet the criteria. Selecting all available entries.")
    selected_entries_non_sarcastic = filtered_entries_non_sarcastic
else:
    non_sarcastic_pool = list(filtered_entries_non_sarcastic.items())
    selected_entries_non_sarcastic = dict(random.sample(non_sarcastic_pool, 184))

# Merge both classes into one dictionary (sarcastic entries first)
combined_entries = dict(selected_entries_sarcastic)
combined_entries.update(selected_entries_non_sarcastic)

# Persist the female-only dataset
with open(folder + 'F_data.json', 'w') as out_file:
    json.dump(combined_entries, out_file, indent=4)

In [12]:
## Creating the Male dataset

# Fixed seed so the same utterances are drawn on every run
random.seed(42)

# Male utterances labelled as sarcastic
filtered_entries_sarcastic = {}
for key, value in data.items():
    if value.get('gender') == "M" and value.get('sarcasm') == True:
        filtered_entries_sarcastic[key] = value

# Draw 184 of them at random; when the pool is too small, keep the whole pool
if len(filtered_entries_sarcastic) < 184:
    print(f"Only {len(filtered_entries_sarcastic)} entries meet the criteria. Selecting all available entries.")
    selected_entries_sarcastic = filtered_entries_sarcastic
else:
    sarcastic_pool = list(filtered_entries_sarcastic.items())
    selected_entries_sarcastic = dict(random.sample(sarcastic_pool, 184))

# Male utterances labelled as non-sarcastic
filtered_entries_non_sarcastic = {}
for key, value in data.items():
    if value.get('gender') == "M" and value.get('sarcasm') == False:
        filtered_entries_non_sarcastic[key] = value

# Same sampling rule for the non-sarcastic pool
if len(filtered_entries_non_sarcastic) < 184:
    print(f"Only {len(filtered_entries_non_sarcastic)} entries meet the criteria. Selecting all available entries.")
    selected_entries_non_sarcastic = filtered_entries_non_sarcastic
else:
    non_sarcastic_pool = list(filtered_entries_non_sarcastic.items())
    selected_entries_non_sarcastic = dict(random.sample(non_sarcastic_pool, 184))

# Merge both classes into one dictionary (sarcastic entries first)
combined_entries = dict(selected_entries_sarcastic)
combined_entries.update(selected_entries_non_sarcastic)

# Persist the male-only dataset
with open(folder + 'M_data.json', 'w') as out_file:
    json.dump(combined_entries, out_file, indent=4)

In [15]:
## Creating the Mixed Dataset
# Note: the mixed set is sampled from the already created Male and Female datasets,
# so that the selected utterances are as similar as possible between the datasets.

# Fixed seed so the same utterances are drawn on every run
random.seed(42)

# Read back the single-gender datasets written to disk earlier
with open(folder + 'M_data.json', 'r') as male_file:
    male_data = json.load(male_file)

with open(folder + 'F_data.json', 'r') as female_file:
    female_data = json.load(female_file)

# Partition the male entries by sarcasm label
sarcastic_male = {}
non_sarcastic_male = {}
for key, value in male_data.items():
    if value.get('sarcasm') == True:
        sarcastic_male[key] = value
    if value.get('sarcasm') == False:
        non_sarcastic_male[key] = value

# Partition the female entries by sarcasm label
sarcastic_female = {}
non_sarcastic_female = {}
for key, value in female_data.items():
    if value.get('sarcasm') == True:
        sarcastic_female[key] = value
    if value.get('sarcasm') == False:
        non_sarcastic_female[key] = value

# Draw 92 entries per group; the draw order below determines which entries the
# seeded generator picks, so it must stay male-sarcastic, male-non-sarcastic,
# female-sarcastic, female-non-sarcastic.
sarcastic_male_pool = list(sarcastic_male.items())
selected_sarcastic_male = dict(random.sample(sarcastic_male_pool, 92))

non_sarcastic_male_pool = list(non_sarcastic_male.items())
selected_non_sarcastic_male = dict(random.sample(non_sarcastic_male_pool, 92))

sarcastic_female_pool = list(sarcastic_female.items())
selected_sarcastic_female = dict(random.sample(sarcastic_female_pool, 92))

non_sarcastic_female_pool = list(non_sarcastic_female.items())
selected_non_sarcastic_female = dict(random.sample(non_sarcastic_female_pool, 92))

# Merge the four groups into a single dictionary, in the same order as drawn
mixed_entries = {}
for selected_group in (
    selected_sarcastic_male,
    selected_non_sarcastic_male,
    selected_sarcastic_female,
    selected_non_sarcastic_female,
):
    mixed_entries.update(selected_group)

# Persist the mixed dataset
with open(folder + 'Mixed_data.json', 'w') as out_file:
    json.dump(mixed_entries, out_file, indent=4)

In [16]:
# Function to load JSON data, analyze it, and print the results
def analyze_json(file_path):
    """Print the gender/sarcasm class counts of a JSON dataset file.

    The file must contain a single JSON object whose values are objects; each
    value is inspected through its 'gender' and 'sarcasm' fields.

    Parameters
    ----------
    file_path : str
        Path of the JSON file to analyze.

    Returns
    -------
    None
        The summary is printed: dataset name (path without its extension),
        total number of entries, the four gender x sarcasm counts, and whether
        the file contained repeated top-level keys.
    """
    # Strip only the final extension. Splitting on the first '.' would truncate
    # paths such as './data/x.json' or 'data/v1.0/x.json'.
    dataset_name = os.path.splitext(file_path)[0]

    # A parsed dict can never hold repeated keys (a later key silently replaces
    # an earlier one), so duplicates have to be detected while parsing.
    # The hook runs once for every JSON object as soon as it is complete; the
    # outermost object completes last, so the value left in the flag describes
    # the top-level keys.
    duplicate_flag = {'top_level': False}

    def _dict_checking_duplicates(pairs):
        parsed = dict(pairs)
        duplicate_flag['top_level'] = len(parsed) < len(pairs)
        return parsed

    with open(file_path, 'r') as f:
        data = json.load(f, object_pairs_hook=_dict_checking_duplicates)

    has_duplicates = duplicate_flag['top_level']

    def _count(gender, sarcasm):
        # Number of entries carrying exactly this gender and sarcasm label
        return sum(
            1 for value in data.values()
            if value.get('gender') == gender and value.get('sarcasm') == sarcasm
        )

    total_entries = len(data)
    sarcastic_male_count = _count('M', True)
    non_sarcastic_male_count = _count('M', False)
    sarcastic_female_count = _count('F', True)
    non_sarcastic_female_count = _count('F', False)

    # Printing the results directly in the function
    print(f"{dataset_name}:")
    print(f"Total entries: {total_entries}")
    print(f"Male Sarcastic: {sarcastic_male_count}")
    print(f"Male Non-Sarcastic: {non_sarcastic_male_count}")
    print(f"Female Sarcastic: {sarcastic_female_count}")
    print(f"Female Non-Sarcastic: {non_sarcastic_female_count}")
    print(f"Contains duplicates: {has_duplicates}\n")

# Report the class balance of each generated dataset (male, female, mixed)
for dataset_file in ('M_data.json', 'F_data.json', 'Mixed_data.json'):
    analyze_json(folder + dataset_file)

data/M_data:
Total entries: 368
Male Sarcastic: 184
Male Non-Sarcastic: 184
Female Sarcastic: 0
Female Non-Sarcastic: 0
Contains duplicates: False

data/F_data:
Total entries: 368
Male Sarcastic: 0
Male Non-Sarcastic: 0
Female Sarcastic: 184
Female Non-Sarcastic: 184
Contains duplicates: False

data/Mixed_data:
Total entries: 368
Male Sarcastic: 92
Male Non-Sarcastic: 92
Female Sarcastic: 92
Female Non-Sarcastic: 92
Contains duplicates: False



In [17]:
# Creating the training, validation and test datasets

# Seed passed to every split so that "Restart Kernel -> Run All" regenerates
# exactly the same train/val/test files (same value as the sampling seed).
SPLIT_SEED = 42


def _split_train_val_test(utterances, seed=SPLIT_SEED):
    """Split a list of (key, entry) pairs into train, validation and test lists.

    30% of the items are held out first, and that hold-out is then divided in
    half, giving roughly 70% training, 15% validation and 15% test.

    Parameters
    ----------
    utterances : list
        List of (key, entry) tuples belonging to a single class.
    seed : int
        Random state handed to both train_test_split calls.

    Returns
    -------
    tuple
        (train, val, test) lists of (key, entry) tuples.
    """
    train_part, holdout = train_test_split(utterances, test_size=0.3, random_state=seed)
    test_part, val_part = train_test_split(holdout, test_size=0.5, random_state=seed)
    return train_part, val_part, test_part


# Female dataset
female_sarcastic_utterances = [(key, value) for key, value in female_data.items() if value['sarcasm']]
female_non_sarcastic_utterances = [(key, value) for key, value in female_data.items() if not value['sarcasm']]

# Each class is split separately so every subset stays balanced
FS_train, FS_val, FS_test = _split_train_val_test(female_sarcastic_utterances)
FnS_train, FnS_val, FnS_test = _split_train_val_test(female_non_sarcastic_utterances)

train_set_F = dict(FS_train + FnS_train)
val_set_F = dict(FS_val + FnS_val)
test_set_F = dict(FS_test + FnS_test)

# Male dataset
male_sarcastic_utterances = [(key, value) for key, value in male_data.items() if value['sarcasm']]
male_non_sarcastic_utterances = [(key, value) for key, value in male_data.items() if not value['sarcasm']]

MS_train, MS_val, MS_test = _split_train_val_test(male_sarcastic_utterances)
MnS_train, MnS_val, MnS_test = _split_train_val_test(male_non_sarcastic_utterances)

train_set_M = dict(MS_train + MnS_train)
val_set_M = dict(MS_val + MnS_val)
test_set_M = dict(MS_test + MnS_test)

# Mixed dataset: split per gender and per class to keep all four groups balanced
mixed_F_sarcastic_utterances = [(key, value) for key, value in mixed_entries.items() if value['gender'] == 'F' and value['sarcasm']]
mixed_F_non_sarcastic_utterances = [(key, value) for key, value in mixed_entries.items() if value['gender'] == 'F' and not value['sarcasm']]
mixed_M_sarcastic_utterances = [(key, value) for key, value in mixed_entries.items() if value['gender'] == 'M' and value['sarcasm']]
mixed_M_non_sarcastic_utterances = [(key, value) for key, value in mixed_entries.items() if value['gender'] == 'M' and not value['sarcasm']]

mixedS_F_train, mixedS_F_val, mixedS_F_test = _split_train_val_test(mixed_F_sarcastic_utterances)
mixednS_F_train, mixednS_F_val, mixednS_F_test = _split_train_val_test(mixed_F_non_sarcastic_utterances)
mixedS_M_train, mixedS_M_val, mixedS_M_test = _split_train_val_test(mixed_M_sarcastic_utterances)
mixednS_M_train, mixednS_M_val, mixednS_M_test = _split_train_val_test(mixed_M_non_sarcastic_utterances)

train_set_mixed = dict(mixedS_F_train + mixednS_F_train + mixedS_M_train + mixednS_M_train)
val_set_mixed = dict(mixedS_F_val + mixednS_F_val + mixedS_M_val + mixednS_M_val)
test_set_mixed = dict(mixedS_F_test + mixednS_F_test + mixedS_M_test + mixednS_M_test)

# Saving the training, validation and test datasets to new JSON files
split_files = {
    'train_M.json': train_set_M,
    'val_M.json': val_set_M,
    'test_M.json': test_set_M,
    'train_F.json': train_set_F,
    'val_F.json': val_set_F,
    'test_F.json': test_set_F,
    'train_mixed.json': train_set_mixed,
    'val_mixed.json': val_set_mixed,
    'test_mixed.json': test_set_mixed,
}

for file_name, split_set in split_files.items():
    with open(folder + file_name, 'w') as f:
        json.dump(split_set, f, indent=4)

In [18]:
# Report the class balance of every train/val/test file, one dataset at a time
for dataset_suffix in ('M', 'F', 'mixed'):
    for split_name in ('train', 'val', 'test'):
        analyze_json(folder + f'{split_name}_{dataset_suffix}.json')

data/train_M:
Total entries: 256
Male Sarcastic: 128
Male Non-Sarcastic: 128
Female Sarcastic: 0
Female Non-Sarcastic: 0
Contains duplicates: False

data/val_M:
Total entries: 56
Male Sarcastic: 28
Male Non-Sarcastic: 28
Female Sarcastic: 0
Female Non-Sarcastic: 0
Contains duplicates: False

data/test_M:
Total entries: 56
Male Sarcastic: 28
Male Non-Sarcastic: 28
Female Sarcastic: 0
Female Non-Sarcastic: 0
Contains duplicates: False

data/train_F:
Total entries: 256
Male Sarcastic: 0
Male Non-Sarcastic: 0
Female Sarcastic: 128
Female Non-Sarcastic: 128
Contains duplicates: False

data/val_F:
Total entries: 56
Male Sarcastic: 0
Male Non-Sarcastic: 0
Female Sarcastic: 28
Female Non-Sarcastic: 28
Contains duplicates: False

data/test_F:
Total entries: 56
Male Sarcastic: 0
Male Non-Sarcastic: 0
Female Sarcastic: 28
Female Non-Sarcastic: 28
Contains duplicates: False

data/train_mixed:
Total entries: 256
Male Sarcastic: 64
Male Non-Sarcastic: 64
Female Sarcastic: 64
Female Non-Sarcastic: 64