In [31]:
import pandas as pd
import numpy as np
import json

In [33]:
raw_data = pd.read_csv('data/mustard++_text.csv')

# Filter out the utterances that are not labeled with sarcasm and relabel the column
sarcasm_data = raw_data[raw_data['Sarcasm'].notna()]
sarcasm_data.loc[:, 'Sarcasm'] = sarcasm_data['Sarcasm'].apply(lambda x: True if x == 0.0 else False)

# Add context data by hand
context_data = sarcasm_data.copy()
context_data['CONTEXT'] = None

for index, row in context_data.iterrows():
    scene = row['SCENE']
    key = row['KEY']
    scene_data = raw_data[raw_data['SCENE'] == scene]

    context = []
    for index_, row_ in scene_data.iterrows():
        if row_['KEY'] != key:
            context.append(row_['SENTENCE'])
    
    context_data.at[index, 'CONTEXT'] = context

# Remove columns that are not needed
cleaned_data = context_data.drop(columns=['KEY', 'END_TIME', 'Sarcasm_Type', 'Implicit_Emotion', 'Explicit_Emotion', 'Valence', 'Arousal'])
    
# Define gender of speakers and remove utterances where gender is unclear
female_speakers = ['AMY', 'PENNY','BERNADETTE', 'MONICA','DOROTHY', 'ROSE','RACHEL', 'PHOEBE', 'SOPHIA', 'MEMBER-GIRL', 'BLANCHE']
male_speakers = ['SHELDON', 'RAJ', 'HOWARD', 'LEONARD', 'STUART', 'CHANDLER', 'ROSS', 'JOEY', 'SCOTT', 'MEMBER-BOY', 'GILFOYLE','ERLICH', 'DINESH', 'JARED', 'RICHARD']
gendered_speakers = female_speakers + male_speakers
gender_data = cleaned_data[cleaned_data['SPEAKER'].isin(gendered_speakers)]
gender_data['Gender'] = np.where(gender_data['SPEAKER'].isin(female_speakers), 'F', 'M')

# Split data by gender
female_data = gender_data[gender_data['SPEAKER'].isin(female_speakers)]
male_data = gender_data[gender_data['SPEAKER'].isin(male_speakers)]

  sarcasm_data.loc[:, 'Sarcasm'] = sarcasm_data['Sarcasm'].apply(lambda x: True if x == 0.0 else False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_data['Gender'] = np.where(gender_data['SPEAKER'].isin(female_speakers), 'F', 'M')


In [34]:
gender_data.head()

Unnamed: 0,SCENE,SENTENCE,SPEAKER,SHOW,Sarcasm,CONTEXT,Gender
5,1_10004,"And of those few months, how long have you bee...",SHELDON,BBT,True,"[Well, I'm sure that, uh, you...\r\nhave a lot...",M
14,1_10009,"Let the dead man talk. So, why do you think that?",PENNY,BBT,True,"[FYI, we plan on selling out the human race ha...",F
18,1_1001,"What else? Sell it on eBay as ""slightly used.""",RAJ,BBT,True,"[Or maybe she just doesn't want to talk., Look...",M
24,1_1003,"Good idea, sit with her. Hold her, comfort her...",HOWARD,BBT,False,[It's smashed beyond repair. What are you gonn...,M
31,1_10190,"Well, now that I've given up string theory, I'...",SHELDON,BBT,True,[it's important to the story that my boobs be ...,M


In [35]:
print("number of utterances in the raw data", raw_data.shape[0])
print("number of relevant utterances", sarcasm_data.shape[0])
print("number of gendered utterances", gender_data.shape[0])
print('')
print("number of male utterances", male_data.shape[0])
print("number of sarcastic male utterances", male_data[male_data['Sarcasm']].shape[0])
print("number of non sarcastic male utterances", male_data[male_data['Sarcasm'] == False].shape[0])
print('')
print("number of female utterances", female_data.shape[0])
print("number of sarcastic female utterances", female_data[female_data['Sarcasm']].shape[0])
print("number of non sarcastic female utterances", female_data[female_data['Sarcasm'] == False].shape[0])

number of utterances in the raw data 6041
number of relevant utterances 1202
number of gendered utterances 1109

number of male utterances 735
number of sarcastic male utterances 360
number of non sarcastic male utterances 375

number of female utterances 374
number of sarcastic female utterances 185
number of non sarcastic female utterances 189


In [41]:
# Convert data to dictionary to match MUSTARD data format
data = {}
for index, row in gender_data.iterrows():
    data[row['SCENE']] = {
        'utterance': row['SENTENCE'],
        'speaker': row['SPEAKER'],
        'context': row['CONTEXT'],
        'show': row['SHOW'],
        'sarcasm': row['Sarcasm'],
        'gender' : row['Gender']
    }

# Split data by gender
F_data = {key: entry for key, entry in data.items() if entry['gender'] == 'F'}
M_data = {key: entry for key, entry in data.items() if entry['gender'] == 'M'}

# Save data to json
with open('data/F_data.json', 'w') as f:
    json.dump(F_data, f, indent=4)

with open('data/M_data.json', 'w') as f:
    json.dump(M_data, f, indent=4)

with open('data/mixed_data.json', 'w') as f:
    json.dump(data, f, indent=4)