<a href="https://colab.research.google.com/github/rickiepark/MLQandAI/blob/main/supplementary/q15-text-augment/synonym-replacement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 텍스트 증식을 위한 동의어 교체

In [1]:
!pip install watermark

%load_ext watermark
%watermark -a 'Sebastian Raschka' -v -p nltk

Author: Sebastian Raschka

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

nltk: 3.8.1



In [2]:
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet

nltk.download('wordnet')

def get_synonyms(word):
    """Collect the lemma names of every WordNet synset found for ``word``.

    Names are returned in lookup order and are not de-duplicated, so the
    same name can appear several times and ``word`` itself may be included.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]


get_synonyms("quickly")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['quickly',
 'rapidly',
 'speedily',
 'chop-chop',
 'apace',
 'promptly',
 'quickly',
 'quick',
 'cursorily',
 'quickly']

In [4]:
# 품사 태깅을 위해
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
def get_position_tags(text):
    """Tokenize ``text`` with NLTK and return the (token, tag) pairs from ``nltk.pos_tag``."""
    return nltk.pos_tag(nltk.word_tokenize(text))

get_position_tags("The cat quickly jumped over the lazy dog.")

[('The', 'DT'),
 ('cat', 'NN'),
 ('quickly', 'RB'),
 ('jumped', 'VBD'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN'),
 ('.', '.')]

In [6]:
import nltk
from nltk.corpus import wordnet
import random

random.seed(123)


def synonym_replacement(text, num_replacement=2):
    """Replace randomly chosen adverbs/adjectives in ``text`` with synonyms.

    Args:
        text: The sentence to augment.
        num_replacement: How many candidate words to replace.

    Returns:
        The augmented sentence as a string. If the sentence contains fewer
        than ``num_replacement`` candidate words, ``text`` is returned
        unchanged (always a string, never the token list).
    """
    # Local import so this cell stays runnable without touching the import lines.
    import re

    words = nltk.word_tokenize(text)

    # Tag every token with its part of speech (noun, adjective, ...)
    pos_tags = nltk.pos_tag(words)

    # To keep the example simple, only adverbs (RB) and adjectives (JJ) are replaced
    candidates = [word for word, pos in pos_tags if pos in ['RB', 'JJ']]

    if len(candidates) < num_replacement:
        # Return the original string so callers can always ' '.join() the results
        return text

    # Randomly pick the words to replace
    words_to_replace = random.sample(candidates, num_replacement)

    # For each picked word, choose one of its synonyms at random
    for word in words_to_replace:
        synonyms = get_synonyms(word)
        if synonyms:
            # WordNet writes multi-word lemmas with underscores; use spaces in text
            synonym = random.choice(synonyms).replace('_', ' ')
            # Match the word only as a whole token, so "so" is not replaced
            # inside "also"; plain str.replace would hit the first substring
            whole_word = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
            # A callable replacement keeps backslashes in the synonym literal
            text = re.sub(whole_word, lambda _match: synonym, text, count=1)

    return text

In [7]:
# Sample paragraph; the triple-quoted literal keeps its leading and trailing newline
text = """
The cat quickly jumped over the lazy dog.
"""

# Augment sentence by sentence, then join the results back into one paragraph
sentences = nltk.sent_tokenize(text)
augmented_sentences = list(map(synonym_replacement, sentences))
augmented_paragraph = ' '.join(augmented_sentences)

print(augmented_paragraph)


The cat rapidly jumped over the work-shy dog.


**원본과 증식된 텍스트 비교**

In [8]:
import difflib


# Word-level diff: "-" marks words only in the original, "+" words only in the augmented text
d = difflib.Differ()
diff = d.compare(text.split(), augmented_paragraph.split())

print(*diff, sep='\n')

  The
  cat
- quickly
+ rapidly
  jumped
  over
  the
- lazy
+ work-shy
  dog.
