In [2]:
import random
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from snorkel.augmentation import transformation_function, ApplyOnePolicy, PandasTFApplier

In [3]:
# nltk.download("wordnet", quiet=True)

In [4]:
def get_synonyms(word):
    """Get the synonyms of word from Wordnet."""
    lemmas = set().union(*[s.lemmas() for s in wn.synsets(word)])
    return list(set(l.name().lower().replace("_", " ") for l in lemmas) - {word})


@transformation_function()
def tf_replace_word_with_synonym(x):
    """Try to replace a random word with a synonym."""
    words = x.text.lower().split()
    idx = random.choice(range(len(words)))
    synonyms = get_synonyms(words[idx])
    if len(synonyms) > 0:
        x.text = " ".join(words[:idx] + [synonyms[0]] + words[idx + 1 :])
        return x

In [19]:
df_train = pd.read_excel('Encoded Roel/Train.xlsx')
df_train.columns = ['text', 'Sentiment', 'Frame', 'Topic']

In [20]:
tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)
df_train_augmented = tf_applier.apply(df_train)
df_train_augmented.columns = ['Text', 'Sentiment', 'Frame', 'Topic']

100%|██████████| 2549/2549 [00:03<00:00, 750.30it/s]


In [21]:
df_train_augmented

Unnamed: 0,Text,Sentiment,Frame,Topic
0,Home Solar Wind vs. Solar — Which Power So...,3.0,,renewable
0,home solar wind vs. solar — which world power ...,3.0,,renewable
0,internal solar wind vs. solar — which power so...,3.0,,renewable
1,Wind vs. Solar — Which Power Source Is Better?,3.0,,renewable
1,wind vs. solar — which power root is better?,3.0,,renewable
...,...,...,...,...
2546,geothermal power plants cost more to build tha...,3.0,economic,geothermal
2547,Electricity generated by geothermal plants is ...,4.0,economic,geothermal
2548,Want more stories about clean energy?,3.0,environmental,renewable
2548,want more stories nearly clean energy?,3.0,environmental,renewable


In [22]:
df_train_augmented.to_excel('Encoded Roel/Train - augmented.xlsx')