In [1]:
import pandas as pd

In [2]:
# Load the sarcasm dataset (textual representation) from JSON.
file_path = "sarcasm_data.json"

# The file's top-level keys are read as columns, so the frame is transposed to
# get one row per entry. convert_axes=False keeps those keys exactly as written
# in the file: by default pandas tries to coerce axis labels to numeric dtypes,
# which can silently rewrite identifier-like keys.
# NOTE(review): the integer-looking, unsorted index previously displayed for this
# frame (160, 170, ..., 1105) suggests such a coercion happened - confirm against
# the raw JSON keys.
df_text = pd.read_json(file_path, convert_axes=False).transpose()

In [3]:
# Preview the first rows of the loaded data
df_text.head()

Unnamed: 0,utterance,speaker,context,context_speakers,show,sarcasm
160,It's just a privilege to watch your mind at work.,SHELDON,[I never would have identified the fingerprint...,"[LEONARD, SHELDON]",BBT,True
170,I don't think I'll be able to stop thinking ab...,PENNY,[This is one of my favorite places to kick bac...,"[HOWARD, PENNY, HOWARD, HOWARD, HOWARD, PENNY,...",BBT,True
180,"Since it's not bee season, you can have my epi...",SHELDON,"[Here we go. Pad thai, no peanuts., But does i...","[LEONARD, HOWARD, LEONARD]",BBT,False
190,"Lois Lane is falling, accelerating at an initi...",SHELDON,[A marathon? How many Superman movies are ther...,"[PENNY, SHELDON, PENNY, SHELDON, SHELDON, PENN...",BBT,False
1105,I'm just inferring this is a couch because the...,SHELDON,"[Great Caesar's ghost, look at this place., So...","[SHELDON, LEONARD, SHELDON, SHELDON, SHELDON, ...",BBT,True


In [4]:
# Dimensions of the loaded data as (rows, columns)
df_text.shape

(690, 6)

In [5]:
# List the distinct speaker labels present in the data
df_text['speaker'].unique()

array(['SHELDON', 'PENNY', 'HOWARD', 'LEONARD', 'RAJ', 'PERSON',
       'BERNADETTE', 'AMY', 'PERSON3', 'PERSON1', 'CHANDLER', 'ROSS',
       'MONICA', 'JOEY', 'RACHEL', 'PHOEBE', 'DOROTHY', 'ROSE',
       'MEMBER-GIRL', 'MODERATOR', 'MEMBER-BOY'], dtype=object)

In [6]:
# Display the 'speaker' column
df_text['speaker']

160      SHELDON
170        PENNY
180      SHELDON
190      SHELDON
1105     SHELDON
          ...   
2169    CHANDLER
2235    CHANDLER
234     CHANDLER
2608    CHANDLER
2524    CHANDLER
Name: speaker, Length: 690, dtype: object

***Comment:***
Because our project requires labelling each extract by the speaker's gender, we remove all entries whose speaker has an ambiguous gender (i.e. 'PERSON', 'PERSON1', 'PERSON3' and 'MODERATOR'). This removes 57 utterances (690 → 633), which we deem acceptable given the size of our dataset.

In [7]:
# Speaker labels whose gender cannot be determined; their utterances are dropped.
ambiguous_speakers = ['PERSON', 'PERSON1', 'PERSON3', 'MODERATOR']

# One vectorised membership test replaces four chained `!=` filters. The explicit
# .copy() makes df_filtered an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df_filtered = df_text[~df_text['speaker'].isin(ambiguous_speakers)].copy()
df_filtered.shape

(633, 6)

In [8]:
# Lookup table from speaker label to gender code ('M' / 'F')
gender_mapping = {
    'SHELDON': 'M', 'PENNY': 'F', 'HOWARD': 'M', 'LEONARD': 'M',
    'RAJ': 'M', 'BERNADETTE': 'F', 'AMY': 'F', 'CHANDLER': 'M',
    'ROSS': 'M', 'MONICA': 'F', 'JOEY': 'M', 'RACHEL': 'F',
    'PHOEBE': 'F', 'DOROTHY': 'F', 'ROSE': 'F',
    'MEMBER-GIRL': 'F', 'MEMBER-BOY': 'M',
}


def map_gender(speaker):
    """Return the gender code for ``speaker``.

    Parameters
    ----------
    speaker : hashable
        Speaker label, looked up verbatim (case-sensitive) in ``gender_mapping``.

    Returns
    -------
    str
        The mapped code, or 'Unknown' when the label is not in the table.
    """
    if speaker in gender_mapping:
        return gender_mapping[speaker]
    return 'Unknown'

# Adding a new column 'gender' based on the column 'speaker'.
# .assign returns a new frame instead of writing into df_filtered in place: the
# frame was produced by boolean filtering, so an in-place column assignment can
# trigger SettingWithCopyWarning. Rebinding the name also keeps the cell safe to
# re-run.
df_filtered = df_filtered.assign(gender=df_filtered['speaker'].map(map_gender))


In [9]:
# Preview the first rows to check the new 'gender' column
df_filtered.head()

Unnamed: 0,utterance,speaker,context,context_speakers,show,sarcasm,gender
160,It's just a privilege to watch your mind at work.,SHELDON,[I never would have identified the fingerprint...,"[LEONARD, SHELDON]",BBT,True,M
170,I don't think I'll be able to stop thinking ab...,PENNY,[This is one of my favorite places to kick bac...,"[HOWARD, PENNY, HOWARD, HOWARD, HOWARD, PENNY,...",BBT,True,F
180,"Since it's not bee season, you can have my epi...",SHELDON,"[Here we go. Pad thai, no peanuts., But does i...","[LEONARD, HOWARD, LEONARD]",BBT,False,M
190,"Lois Lane is falling, accelerating at an initi...",SHELDON,[A marathon? How many Superman movies are ther...,"[PENNY, SHELDON, PENNY, SHELDON, SHELDON, PENN...",BBT,False,M
1105,I'm just inferring this is a couch because the...,SHELDON,"[Great Caesar's ghost, look at this place., So...","[SHELDON, LEONARD, SHELDON, SHELDON, SHELDON, ...",BBT,True,M


In [10]:
# Save the filtered data to a CSV file.
# index=False omits the DataFrame index from the output, so the row labels shown
# in the previews of this frame are not written to the file.
# NOTE(review): if those labels are the entry identifiers, they are lost here -
# confirm this is intended.
df_filtered.to_csv('sarcasm_data.csv', index=False)