In [1]:
!pip install numpy requests nlpaug transformers sacremoses nltk

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting transformers
  Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m[31m6.6 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting gdown>=4.0.0 (from nlpaug)
  Downloading gdown-5.1.0-py3-none-any.whl.metadata (5.7 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.21.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00

In [3]:
import nlpaug.augmenter.word as naw

###### 삽입

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent.",
]

# ContextualWordEmbsAug 클래스 : BERT 모델을 활용해 단어를 삽입하는 기능 제공
# model_path에 bert-base-uncased 나 distilbert-base-uncased를 인수로 활용해 적용하며 허깅 페이스에서 모델을 자동으로 다운로드
aug = naw.ContextualWordEmbsAug(model_path="bert-base-uncased", action="insert")
augmented_texts = aug.augment(texts)    # 리스트로 반환

for text, augmented in zip(texts, augmented_texts):
    print(f"src : {text}")
    print(f"dst : {augmented}")
    print("------------------")

src : Those who can imagine anything, can create the impossible.
dst : which those who not can imagine anything, can sometimes create the really impossible.
------------------
src : We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst : really we can only see a very short shoulder distance ahead, but yeah we sure can see plenty there there that needs to be urgently done.
------------------
src : If a machine is expected to be infallible, it cannot also be intelligent.
dst : so if suppose a machine is usually expected to be approximately infallible, than it cannot also possibly be intelligent.
------------------


In [6]:
import nlpaug.augmenter.char as nac

### 삭제

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent.",
]

# RandomWordAug : 삽입, 대체, 교체(swap), 삭제, 자르기 기능 제공
aug = nac.RandomCharAug(action="delete")
augmented_texts = aug.augment(texts)

for text, augmented in zip(texts, augmented_texts):
    print(f"src : {text}")
    print(f"dst : {augmented}")
    print("------------------")

src : Those who can imagine anything, can create the impossible.
dst : Ths who can mane antng, can cret the impossible.
------------------
src : We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst : We can only see a short itanc ahe, but we can see leny the tt eds to be de.
------------------
src : If a machine is expected to be infallible, it cannot also be intelligent.
dst : If a mahi is expet to be infallible, it cant lo be inteent.
------------------


In [7]:
import nlpaug.augmenter.word as naw

### 교체

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent.",
]

aug = naw.RandomWordAug(action="swap")
augmented_texts = aug.augment(texts)

for text, augmented in zip(texts, augmented_texts):
    print(f"src : {text}")
    print(f"dst : {augmented}")
    print("------------------")

src : Those who can imagine anything, can create the impossible.
dst : Those can who anything imagine, can create. the impossible
------------------
src : We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst : Can we see only a short distance, ahead can but we see plenty there that needs to be done.
------------------
src : If a machine is expected to be infallible, it cannot also be intelligent.
dst : A if machine is expected to infallible be, cannot be it also intelligent.
------------------


In [8]:
import nlpaug.augmenter.word as naw

### 대체

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent.",
]

# SynonymAug : 'wordnet' 데이터베이스나 의역 데이터베이스(PPDB)를 활용해 단어를 대체
aug = naw.SynonymAug(aug_src='wordnet')
augmented_texts = aug.augment(texts)

for text, augmented in zip(texts, augmented_texts):
    print(f"src : {text}")
    print(f"dst : {augmented}")
    print("------------------")

[nltk_data] Downloading package wordnet to /Users/seoyun/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/seoyun/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/seoyun/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


src : Those who can imagine anything, can create the impossible.
dst : Those who dismiss envisage anything, can make the impossible.
------------------
src : We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst : We can but see a brusk distance beforehand, just we derriere see plenty there that needs to comprise done.
------------------
src : If a machine is expected to be infallible, it cannot also be intelligent.
dst : If a machine is have a bun in the oven to embody infallible, it cannot also represent intelligent.
------------------


In [9]:
import nlpaug.augmenter.word as naw

### 대체

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent.",
]
reserved_tokens = [
    ["can", "can't", "cannot", "could"],
]

# ReservedAug : 입력 데이터에 포함된 단어를 특정한 단어로 대체하는 기능 제공
reserved_aug = naw.ReservedAug(reserved_tokens=reserved_tokens)
augmented_texts = reserved_aug.augment(texts)

for text, augmented in zip(texts, augmented_texts):
    print(f"src : {text}")
    print(f"dst : {augmented}")
    print("------------------")

src : Those who can imagine anything, can create the impossible.
dst : Those who cannot imagine anything, can't create the impossible.
------------------
src : We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst : We cannot only see a short distance ahead, but we cannot see plenty there that needs to be done.
------------------
src : If a machine is expected to be infallible, it cannot also be intelligent.
dst : If a machine is expected to be infallible, it can't also be intelligent.
------------------


In [11]:
import nlpaug.augmenter.word as naw

### 역번역

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent.",
]

back_translation = naw.BackTranslationAug(
    from_model_name = 'facebook/wmt19-en-de',
    to_model_name   = 'facebook/wmt19-de-en'
)
augmented_texts = back_translation.augment(texts)

for text, augmented in zip(texts, augmented_texts):
    print(f"src : {text}")
    print(f"dst : {augmented}")
    print("------------------")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


src : Those who can imagine anything, can create the impossible.
dst : Anyone who can imagine anything can achieve the impossible.
------------------
src : We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst : We can only look a little ahead, but we can see a lot there that needs to be done.
------------------
src : If a machine is expected to be infallible, it cannot also be intelligent.
dst : If a machine is expected to be infallible, it cannot be intelligent.
------------------
