In [8]:
import random
import pandas as pd
from nltk.corpus import wordnet
import nltk
import spacy
nlp = spacy.load("en_core_web_sm")


In [9]:
# Importing the sarcasm data textual representation

file_path = "sarcasm_data.csv"

# Loading the CSV file into a DataFrame

df_text = pd.read_csv(file_path)

In [10]:
df_text.head()

Unnamed: 0,utterance,speaker,context,context_speakers,show,sarcasm,gender
0,It's just a privilege to watch your mind at work.,SHELDON,['I never would have identified the fingerprin...,"['LEONARD', 'SHELDON']",BBT,True,M
1,I don't think I'll be able to stop thinking ab...,PENNY,['This is one of my favorite places to kick ba...,"['HOWARD', 'PENNY', 'HOWARD', 'HOWARD', 'HOWAR...",BBT,True,F
2,"Since it's not bee season, you can have my epi...",SHELDON,"['Here we go. Pad thai, no peanuts.', 'But doe...","['LEONARD', 'HOWARD', 'LEONARD']",BBT,False,M
3,"Lois Lane is falling, accelerating at an initi...",SHELDON,['A marathon? How many Superman movies are the...,"['PENNY', 'SHELDON', 'PENNY', 'SHELDON', 'SHEL...",BBT,False,M
4,I'm just inferring this is a couch because the...,SHELDON,"[""Great Caesar's ghost, look at this place."", ...","['SHELDON', 'LEONARD', 'SHELDON', 'SHELDON', '...",BBT,True,M


In [11]:
INCLUDED_POS_TAGS = {"VERB", "NOUN", "ADJ"}  # Include only verbs and nouns

def get_synonyms(word):
    synonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

def replace_with_synonyms(sentence):
    doc = nlp(sentence)
    new_sentence = []

    for token in doc:
        if token.text.isalpha() and not token.is_stop and token.pos_ in INCLUDED_POS_TAGS:
            
            synonyms = [token.text]  # Starting with the original word
            
            # Finding synonyms from WordNet
            word_synonyms = get_synonyms(token.text)

            if word_synonyms and len(word_synonyms) > 1:
                
                # Selecting the second synonym. The first is often the word itself, whilst later synonyms are further away concerning similarity.
                synonym = word_synonyms[1]

                word_nlp = nlp(token.text)
                synonym_nlp = nlp(synonym)

                # Checking the similarity between the original word and the proposed synonym

                similarity = word_nlp.similarity(synonym_nlp)

                # If the similarity is greater then a set threshold of 0.6, we replace it in a new utterance

                if(similarity > 0.6):
                    new_sentence.append(synonym)
                else:
                    new_sentence.append(token.text)
            else:
                new_sentence.append(token.text)
        else:
            new_sentence.append(token.text)

    return ' '.join(new_sentence)


# Augmenting the DataFrame with our new utterances
augmented_data = []
for index, row in df_text.iterrows():
    augmented_utterance = replace_with_synonyms(row['utterance'])
    augmented_data.append({
        'utterance': augmented_utterance,
        'speaker': row['speaker'],
        'context': row['context'],
        'context_speakers': row['context_speakers'],
        'show': row['show'],
        'sarcasm': row['sarcasm']
    })

# Merging our original data witht he augmented DataFrame
augmented_df = pd.DataFrame(augmented_data)

combined_df = pd.concat([df_text, augmented_df], ignore_index=True)

# Testing how the synonym replacement worked

index_to_compare = 4 # Modify here if you want to look at different utterences


original_utterance = combined_df.loc[index_to_compare, 'utterance']
augmented_utterance = combined_df.loc[len(df_text) + index_to_compare, 'utterance']

print("Original Utterance:", original_utterance)
print("Augmented Utterance:", augmented_utterance)


  similarity = word_nlp.similarity(synonym_nlp)


Original Utterance: I'm just inferring this is a couch because the evidence suggests the coffee table is having a tiny garage sale.
Augmented Utterance: I 'm just inferring this is a couch because the evidence suggests the coffee table is having a diminutive garage sale .


In [12]:
combined_df.head()
print(len(combined_df))

1266


In [13]:
combined_df.drop_duplicates(subset=['utterance'], inplace=True)
print(len(combined_df))

1242


In [14]:
combined_df.to_csv('enriched_utterances.csv', index=False)