In [14]:
# Importing all necessary packages

import pandas as pd
import json
import random

In [6]:
# Loading the text CSV into a pandas DataFrame

df = pd.read_csv(filepath_or_buffer='mustard++_text.csv')

In [7]:
# Showing the loaded frame as the cell output (the rendered preview below is truncated to the first and last rows)
df

Unnamed: 0,SCENE,KEY,SENTENCE,END_TIME,SPEAKER,SHOW,Sarcasm,Sarcasm_Type,Implicit_Emotion,Explicit_Emotion,Valence,Arousal
0,1_10004,1_10004_c_00,"Well, I'm sure that, uh, you...\r\nhave a lot ...",0:06,PERSON,BBT,,,,,,
1,1_10004,1_10004_c_01,Who was he?,0:08,SHELDON,BBT,,,,,,
2,1_10004,1_10004_c_02,His name is Ron.\r\nI met him at my prayer group.,0:12,PERSON,BBT,,,,,,
3,1_10004,1_10004_c_03,How long have you been involved with him?,0:14,SHELDON,BBT,,,,,,
4,1_10004,1_10004_c_04,A few months.,0:16,PERSON,BBT,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
6036,3_S06E06_143,3_S06E06_143_u,I thought that was the company policy-these days.,0:4.459000,GILFOYLE,SV,1.0,ILL,Frustration,Neutral,4.0,7.0
6037,3_S06E07_272,3_S06E07_272_c_0,When Richard told me about the dots last night...,0:08.708000,GILFOYLE,SV,,,,,,
6038,3_S06E07_272,3_S06E07_272_c_1,But a few hours later when I woke up in a stal...,0:13.917000,GILFOYLE,SV,,,,,,
6039,3_S06E07_272,3_S06E07_272_c_2,I realized something.,0:15.708000,GILFOYLE,SV,,,,,,


In [9]:
# Speaker-name lists used to assign a gender label to each utterance

female_speakers = [
    'AMY', 'PENNY', 'BERNADETTE',
    'MONICA', 'DOROTHY', 'ROSE',
    'RACHEL', 'PHOEBE', 'SOPHIA',
    'MEMBER-GIRL', 'BLANCHE',
]

male_speakers = [
    'SHELDON', 'RAJ', 'HOWARD', 'LEONARD', 'STUART',
    'CHANDLER', 'ROSS', 'JOEY', 'SCOTT', 'MEMBER-BOY',
    'GILFOYLE', 'ERLICH', 'DINESH', 'JARED', 'RICHARD',
]


In [12]:
## Formatting the dataset in the same way as in our original project

# Set views of the speaker lists, so each membership test is a hash lookup
_female_lookup = set(female_speakers)
_male_lookup = set(male_speakers)

# Initializing the dictionary for the final JSON structure
json_data = {}

# Processing each row independently to create the JSON structure
for _, row in df.iterrows():
    speaker = row['SPEAKER']

    # Only speakers that were explicitly classified receive a gender label.
    # Previously every speaker missing from `female_speakers` (e.g. "PERSON" or
    # any name absent from both lists) was silently exported as "M".
    # Unclassified speakers now get None (JSON null), so the later
    # gender == "F" / gender == "M" filters skip them.
    if speaker in _female_lookup:
        gender = "F"
    elif speaker in _male_lookup:
        gender = "M"
    else:
        gender = None

    # NOTE(review): rows with a missing 'Sarcasm' value are exported as
    # non-sarcastic utterances. In the preview those are the `_c_` keys, which
    # look like context lines - confirm they should be kept as standalone entries.
    is_sarcastic = bool(row['Sarcasm']) if pd.notnull(row['Sarcasm']) else False

    # Creating an entry for each utterance
    entry = {
        "utterance": row['SENTENCE'],
        "speaker": speaker,
        "context": [],                # We have no context information in this dataset (idea: same scene means context)
        "context_speakers": [],       # We have no context information in this dataset (idea: same scene means context)
        "show": row['SHOW'],
        "sarcasm": is_sarcastic,
        "gender": gender
    }

    # Using the value in the 'KEY' column as the primary key for this JSON entry
    # (a repeated KEY overwrites the earlier row)
    json_data[row['KEY']] = entry

# Converting to a JSON format
json_output = json.dumps(json_data, indent=4)

# Exporting to a JSON file
with open('mustard++_formatted.json', 'w') as f:
    f.write(json_output)


In [13]:
## Dropping all data where the speaker is "PERSON", as the gender is unknown.

# Reading the formatted JSON export back from disk
with open('mustard++_formatted.json', 'r') as f:
    data = json.load(f)

# Keeping every (key, entry) pair whose speaker is anything other than "PERSON"
filtered_data = dict(
    filter(lambda item: item[1]['speaker'] != "PERSON", data.items())
)

# Writing the remaining entries to a separate JSON file
with open('mustard++_formatted_and_filtered.json', 'w') as f:
    json.dump(filtered_data, f, indent=4)


### Summary to prepare for splitting of data

*Female Utterances*
> Sarcastic:        185
> Non-Sarcastic:    189


*Male Utterances*
> Sarcastic:        360
> Non-Sarcastic:    375

**Female Only Model:**
> 184 Sarcastic Female, 184 Non-Sarcastic Female

**Male Only Model:**
> 184 Sarcastic Male, 184 Non-Sarcastic Male

**Mixed Model:**
> 92 Sarcastic Male, 92 Non-Sarcastic Male, 92 Sarcastic Female, 92 Non-Sarcastic Female

In [16]:
## Creating the Female Dataset

# Number of utterances drawn for each class (sarcastic / non-sarcastic)
FEMALE_SAMPLES_PER_CLASS = 184

# Dedicated seeded generator: re-running this cell always selects the same
# utterances, regardless of any other use of the global `random` state
female_rng = random.Random(42)

# Loading the formatted and filtered JSON data
with open('mustard++_formatted_and_filtered.json', 'r') as f:
    data = json.load(f)

# Filtering entries where gender is "F" and sarcasm is TRUE
filtered_entries_sarcastic = {key: value for key, value in data.items() if value.get('gender') == "F" and value.get('sarcasm') == True}

# Ensuring that there are enough entries for sampling
if len(filtered_entries_sarcastic) >= FEMALE_SAMPLES_PER_CLASS:
    # Reproducibly selecting FEMALE_SAMPLES_PER_CLASS entries
    selected_entries_sarcastic = dict(
        female_rng.sample(list(filtered_entries_sarcastic.items()), FEMALE_SAMPLES_PER_CLASS)
    )
else:
    print(f"Only {len(filtered_entries_sarcastic)} entries meet the criteria. Selecting all available entries.")
    selected_entries_sarcastic = filtered_entries_sarcastic

# Filtering entries where gender is "F" and sarcasm is FALSE
filtered_entries_non_sarcastic = {key: value for key, value in data.items() if value.get('gender') == "F" and value.get('sarcasm') == False}

# Ensuring that there are enough entries for sampling
if len(filtered_entries_non_sarcastic) >= FEMALE_SAMPLES_PER_CLASS:
    # Reproducibly selecting FEMALE_SAMPLES_PER_CLASS entries
    selected_entries_non_sarcastic = dict(
        female_rng.sample(list(filtered_entries_non_sarcastic.items()), FEMALE_SAMPLES_PER_CLASS)
    )
else:
    print(f"Only {len(filtered_entries_non_sarcastic)} entries meet the criteria. Selecting all available entries.")
    selected_entries_non_sarcastic = filtered_entries_non_sarcastic

# Combining the selected sarcastic and non-sarcastic entries
combined_entries = {**selected_entries_sarcastic, **selected_entries_non_sarcastic}

# Saving the selected entries to a new JSON file
with open('F_data.json', 'w') as f:
    json.dump(combined_entries, f, indent=4)

In [17]:
## Creating the Male dataset

# Number of utterances drawn for each class (sarcastic / non-sarcastic)
MALE_SAMPLES_PER_CLASS = 184

# Dedicated seeded generator: re-running this cell always selects the same
# utterances, regardless of any other use of the global `random` state
male_rng = random.Random(42)

# Loading the formatted and filtered JSON data
with open('mustard++_formatted_and_filtered.json', 'r') as f:
    data = json.load(f)

# Filtering entries where gender is "M" and sarcasm is TRUE
filtered_entries_sarcastic = {key: value for key, value in data.items() if value.get('gender') == "M" and value.get('sarcasm') == True}

# Ensuring that there are enough entries for sampling
if len(filtered_entries_sarcastic) >= MALE_SAMPLES_PER_CLASS:
    # Reproducibly selecting MALE_SAMPLES_PER_CLASS entries
    selected_entries_sarcastic = dict(
        male_rng.sample(list(filtered_entries_sarcastic.items()), MALE_SAMPLES_PER_CLASS)
    )
else:
    print(f"Only {len(filtered_entries_sarcastic)} entries meet the criteria. Selecting all available entries.")
    selected_entries_sarcastic = filtered_entries_sarcastic

# Filtering entries where gender is "M" and sarcasm is FALSE
filtered_entries_non_sarcastic = {key: value for key, value in data.items() if value.get('gender') == "M" and value.get('sarcasm') == False}

# Ensuring that there are enough entries for sampling
if len(filtered_entries_non_sarcastic) >= MALE_SAMPLES_PER_CLASS:
    # Reproducibly selecting MALE_SAMPLES_PER_CLASS entries
    selected_entries_non_sarcastic = dict(
        male_rng.sample(list(filtered_entries_non_sarcastic.items()), MALE_SAMPLES_PER_CLASS)
    )
else:
    print(f"Only {len(filtered_entries_non_sarcastic)} entries meet the criteria. Selecting all available entries.")
    selected_entries_non_sarcastic = filtered_entries_non_sarcastic

# Combining the selected sarcastic and non-sarcastic entries
combined_entries = {**selected_entries_sarcastic, **selected_entries_non_sarcastic}

# Saving the selected entries to a new JSON file
with open('M_data.json', 'w') as f:
    json.dump(combined_entries, f, indent=4)


In [18]:
## Creating the Mixed Dataset
# Comment: here we sample from the already created Male and Female datasets, to ensure the utterances selected are as similar as possible between the datasets.

# Number of utterances drawn from each gender/sarcasm group
MIXED_SAMPLES_PER_GROUP = 92

# Dedicated seeded generator: re-running this cell always selects the same
# utterances, regardless of any other use of the global `random` state
mixed_rng = random.Random(42)


def _draw_mixed_sample(group, label):
    """Return MIXED_SAMPLES_PER_GROUP reproducibly sampled entries from `group`.

    `group` is a dict of key -> entry. If it holds fewer entries than
    requested, a message naming `label` is printed and a copy of the whole
    group is returned, instead of letting `random.sample` raise ValueError.
    """
    if len(group) < MIXED_SAMPLES_PER_GROUP:
        print(f"Only {len(group)} {label} entries meet the criteria. Selecting all available entries.")
        return dict(group)
    return dict(mixed_rng.sample(list(group.items()), MIXED_SAMPLES_PER_GROUP))


# Loading the male and female datasets
with open('M_data.json', 'r') as f:
    male_data = json.load(f)

with open('F_data.json', 'r') as f:
    female_data = json.load(f)

# Filtering male entries
sarcastic_male = {key: value for key, value in male_data.items() if value.get('sarcasm') == True}
non_sarcastic_male = {key: value for key, value in male_data.items() if value.get('sarcasm') == False}

# Filtering female entries
sarcastic_female = {key: value for key, value in female_data.items() if value.get('sarcasm') == True}
non_sarcastic_female = {key: value for key, value in female_data.items() if value.get('sarcasm') == False}

# Sampling each filtered group (same order as before: male first, then female)
selected_sarcastic_male = _draw_mixed_sample(sarcastic_male, "sarcastic male")
selected_non_sarcastic_male = _draw_mixed_sample(non_sarcastic_male, "non-sarcastic male")
selected_sarcastic_female = _draw_mixed_sample(sarcastic_female, "sarcastic female")
selected_non_sarcastic_female = _draw_mixed_sample(non_sarcastic_female, "non-sarcastic female")

# Combining all selected entries into one dictionary
mixed_entries = {
    **selected_sarcastic_male,
    **selected_non_sarcastic_male,
    **selected_sarcastic_female,
    **selected_non_sarcastic_female
}

# Saving the mixed dataset to a new JSON file
with open('Mixed_data.json', 'w') as f:
    json.dump(mixed_entries, f, indent=4)

In [20]:
# Function to load JSON data, analyze it, and print the results
def analyze_json(file_path):
    """Print a gender/sarcasm breakdown of a JSON dataset file.

    Parameters
    ----------
    file_path : str
        Path to a JSON file whose top level is an object mapping keys to
        entry objects; each entry is read via its 'gender' and 'sarcasm' fields.

    Returns
    -------
    None
        The report is written to stdout only.

    Notes
    -----
    `json.load` collapses repeated keys into a single dict entry (the last
    occurrence wins), so comparing the loaded dict's keys against themselves
    can never reveal duplicates. The raw key/value pairs of the top-level
    object are counted while parsing and compared with the resulting dict size.
    """
    # Dropping only the final extension, so paths containing other dots
    # (e.g. './M_data.json' or 'data.v2.json') keep their full stem
    dataset_name = file_path.rsplit('.', 1)[0]

    # Number of raw pairs in the most recently decoded JSON object; the
    # outermost object finishes decoding last, so this ends as its count
    raw_pair_count = 0

    def _track_pairs(pairs):
        nonlocal raw_pair_count
        raw_pair_count = len(pairs)
        return dict(pairs)

    with open(file_path, 'r') as f:
        data = json.load(f, object_pairs_hook=_track_pairs)

    # Duplicates exist if the file held more top-level pairs than the dict kept
    has_duplicates = raw_pair_count > len(data)

    total_entries = len(data)

    # Tallying entries per (gender, sarcasm) group in a single pass
    group_counts = {('M', True): 0, ('M', False): 0, ('F', True): 0, ('F', False): 0}
    for value in data.values():
        gender, sarcasm = value.get('gender'), value.get('sarcasm')
        for group in group_counts:
            if gender == group[0] and sarcasm == group[1]:
                group_counts[group] += 1

    # Printing the results directly in the function
    print(f"{dataset_name}:")
    print(f"Total entries: {total_entries}")
    print(f"Male Sarcastic: {group_counts[('M', True)]}")
    print(f"Male Non-Sarcastic: {group_counts[('M', False)]}")
    print(f"Female Sarcastic: {group_counts[('F', True)]}")
    print(f"Female Non-Sarcastic: {group_counts[('F', False)]}")
    print(f"Contains duplicates: {has_duplicates}\n")

# Reporting the class balance of every generated dataset, in a fixed order
for dataset_file in ('M_data.json', 'F_data.json', 'Mixed_data.json'):
    analyze_json(dataset_file)

M_data:
Total entries: 368
Male Sarcastic: 184
Male Non-Sarcastic: 184
Female Sarcastic: 0
Female Non-Sarcastic: 0
Contains duplicates: False

F_data:
Total entries: 368
Male Sarcastic: 0
Male Non-Sarcastic: 0
Female Sarcastic: 184
Female Non-Sarcastic: 184
Contains duplicates: False

Mixed_data:
Total entries: 368
Male Sarcastic: 92
Male Non-Sarcastic: 92
Female Sarcastic: 92
Female Non-Sarcastic: 92
Contains duplicates: False

