In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the labelled text dataset into df1 and preview its first 10 rows.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# will not run on another machine — consider a configurable data directory.
df1 = pd.read_csv("C:\\Users\\asus\\NLP\\TIL Dataset.csv")
df1.head(10)

Unnamed: 0,text,target
0,Mozambique Insurgency: 2017-2020 Close-up Map ...,academic interests
1,The Hierarchy Problem. What is the Hierarchy P...,academic interests
2,Digging the Root Canal. Digging the Root Canal...,academic interests
3,Turning spit and data into treasure. By the ti...,academic interests
4,Plastics. Overview\nPlastic waste is choking o...,academic interests
5,New To Motorhomes. Just bought a motorhome or ...,academic interests
6,"Health researchmade personal. Like many sites,...",academic interests
7,How to Raise Kids Who Don’t Grow Up to Be Jerk...,academic interests
8,What is Soapstone?. Soapstone: A metamorphic r...,academic interests
9,Monthly Schedule. All Events Schedule for Apri...,academic interests


In [11]:
# Display every row whose label is 'travel'.
df1.loc[df1['target'] == 'travel']

Unnamed: 0,text,target
751579,Country Music Songwriters. On this page you wi...,travel
751580,"Beautiful Bordeaux, the Dordogne and La Rochel...",travel
751581,Bed and Breakfast Travel Blog. Thanksgiving Br...,travel
751582,"Kenya Safari Guide: Best Time to Go. Wildlife,...",travel
751583,Alaska Airlines Extending Mileage Plan Elite S...,travel
...,...,...
763942,"Rica Rwigamba, who is in charge of tourism at ...",travel
763943,"Since the year 2000, the people living in Akag...",travel
763944,The National Development Agency (RDB) announce...,travel
763945,1. Visiting Nyungwe Park which contains many m...,travel


In [7]:
import spacy
import random

# Load the spaCy pipeline once; the augmentation helpers in this notebook use it
# for tokenisation, sentence splitting and vocabulary vector lookups.
nlp = spacy.load('en_core_web_lg')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def get_synonyms_spacy(word, n=5):
    """Return the nearest vocabulary neighbours of ``word`` in spaCy vector space.

    Args:
        word: Surface form to look up in ``nlp.vocab``.
        n: Number of nearest vectors to request (default 5, as before). The
            word itself is normally among them, so fewer strings may be returned.

    Returns:
        A list of neighbour strings ordered from most to least similar, with
        ``word`` itself and its pure case variants (e.g. "Dog" for "dog")
        removed. Empty when ``word`` has no vector.
    """
    lexeme = nlp.vocab[word]
    # A word without a vector cannot be compared meaningfully; querying with it
    # would hand back arbitrary vocabulary entries, so report "no synonyms".
    if not lexeme.has_vector:
        return []

    query = lexeme.vector.reshape(1, -1)
    # most_similar returns (keys, best_rows, scores); keys[0] holds the hashes
    # of the n closest entries for our single query row.
    keys, _, _ = nlp.vocab.vectors.most_similar(query, n=n)

    lowered = word.lower()
    synonyms = []
    for key in keys[0]:
        candidate = nlp.vocab.strings[int(key)]
        # Skip the word itself in any casing, and avoid duplicates.
        if candidate.lower() != lowered and candidate not in synonyms:
            synonyms.append(candidate)
    return synonyms

def synonym_replacement_spacy(text, n=2, p=0.15):
    """Create ``n`` variants of ``text`` by swapping words for vector neighbours.

    Args:
        text: Input string; it is tokenised with the module-level ``nlp``.
        n: Number of augmented variants to produce.
        p: Per-token probability of attempting a replacement (default 0.15,
            the value that used to be hard-coded).

    Returns:
        A list of ``n`` strings, each the tokens of ``text`` joined by single
        spaces with some tokens replaced by a result of ``get_synonyms_spacy``.
    """
    doc = nlp(text)
    original_tokens = [token.text for token in doc]
    # Punctuation, whitespace and tokens without a vector have no meaningful
    # neighbour, so they are never candidates for replacement.
    candidate_positions = [
        token.i
        for token in doc
        if token.has_vector and not (token.is_punct or token.is_space)
    ]

    augmented_texts = []
    for _ in range(n):
        augmented_text = list(original_tokens)
        for position in candidate_positions:
            if random.random() < p:
                synonyms = get_synonyms_spacy(original_tokens[position])
                if synonyms:
                    augmented_text[position] = random.choice(synonyms)
        augmented_texts.append(' '.join(augmented_text))
    return augmented_texts

In [9]:
def random_deletion_spacy(text, n=2, p=0.15):
    """Create ``n`` variants of ``text`` by randomly dropping tokens.

    Args:
        text: Input string; it is tokenised with the module-level ``nlp``.
        n: Number of augmented variants to produce.
        p: Per-token probability of deleting a token.

    Returns:
        A list of ``n`` strings made of the surviving tokens joined by single
        spaces. A variant is only empty when ``text`` itself has no tokens.
    """
    tokens = [token.text for token in nlp(text)]
    augmented_texts = []
    for _ in range(n):
        kept = [tok for tok in tokens if random.random() > p]
        # With short inputs every token can be dropped; keep one random token
        # so the augmented example is never an empty string.
        if not kept and tokens:
            kept = [random.choice(tokens)]
        augmented_texts.append(' '.join(kept))
    return augmented_texts

In [10]:
def text_rotation_spacy(text, n=2):
    """Create ``n`` variants of ``text`` with its sentences in shuffled order.

    Sentences come from the module-level ``nlp`` pipeline. The same list is
    shuffled in place on every round, so each variant starts from the previous
    variant's order. Sentences are joined with single spaces.
    """
    doc = nlp(text)
    sentence_spans = [span for span in doc.sents]
    shuffled_variants = []
    for _ in range(n):
        random.shuffle(sentence_spans)
        pieces = (span.text for span in sentence_spans)
        shuffled_variants.append(' '.join(pieces))
    return shuffled_variants

In [15]:
from googletrans import Translator

def back_translate(text, target_lang='ko', source_lang="en", translator=None):
    """Paraphrase ``text`` by translating it to ``target_lang`` and back.

    Args:
        text: String to augment. Empty, whitespace-only or non-string values
            (e.g. NaN cells from a DataFrame) are returned unchanged without
            calling the translation service.
        target_lang: Pivot language code.
        source_lang: Language code of ``text`` and of the returned string.
        translator: Optional object with a googletrans-style
            ``translate(text, dest=..., src=...)`` method. Pass one shared
            instance when translating many rows; a new ``Translator`` is
            created when omitted.

    Returns:
        The back-translated string, or ``text`` itself for skipped inputs.

    NOTE(review): this assumes the synchronous googletrans API; newer releases
    expose ``translate`` as a coroutine — confirm the installed version.
    """
    # Nothing to translate: avoid a pointless (and possibly failing) request.
    if not isinstance(text, str) or not text.strip():
        return text

    if translator is None:
        translator = Translator()

    translated_text = translator.translate(text, dest=target_lang, src=source_lang).text
    back_translated_text = translator.translate(translated_text, dest=source_lang, src=target_lang).text
    return back_translated_text

In [16]:
# Back-translate every article labelled 'pets' and collect the results, keeping
# the source row index, in a frame with 'text' and 'target' columns.
desired_category = 'pets'

selected_rows = df1.loc[df1['target'] == desired_category]
augmented_texts = selected_rows['text'].apply(back_translate)

augmented_df_pets = (
    augmented_texts
    .explode()
    .to_frame(name='text')
    .assign(target=desired_category)
)


In [19]:
# Inspect the back-translated 'pets' rows.
augmented_df_pets

Unnamed: 0,text,target
580741,Another new blog. The publication of the new b...,pets
580742,"According to experts, five tips on how to use ...",pets
580743,What are they carrying? Emirosom: Christian Lo...,pets
580744,How to introduce a cat to a new cat.It is exci...,pets
580745,Ring doorbell elite.8.0\nvalue\n9.3\nequipment...,pets
...,...,...
580919,Pawboost Lost and Found Pet -Find and reports ...,pets
580920,Pond Information Center -Garden Pond Resources...,pets
580921,Veterinarian and emergency veterinarian,pets
580922,Oduic like crazy!? Happy! Fun! Amazing! Daily!...,pets


In [18]:
# Count the non-null values per column among the rows labelled 'pets'.
df1.loc[df1['target'] == 'pets'].count()

text      183
target    183
dtype: int64

In [1]:
import random
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def get_synonyms_nltk(word):
    """Return the WordNet synonyms of ``word`` across all of its synsets.

    Lemma names from every synset of ``word`` are collected, with underscores
    turned into spaces.

    Args:
        word: Word to look up in WordNet.

    Returns:
        A sorted list of distinct synonym strings, excluding ``word`` itself in
        any casing. Empty when WordNet has no synsets for ``word``.
    """
    lowered = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ")
            # Compare case-insensitively so that e.g. "bed" is not offered as a
            # synonym of "Bed".
            if candidate.lower() != lowered:
                synonyms.add(candidate)
    # Sorted so that a seeded random.choice over the result is reproducible;
    # plain set iteration order can change between interpreter runs.
    return sorted(synonyms)

def synonym_replacement_nltk(text, n=2, p=0.15):
    """Create ``n`` variants of ``text`` by swapping words for WordNet synonyms.

    The text is first reduced to its lower-cased alphabetic tokens with English
    stop words removed, so the variants do not keep the original casing,
    punctuation, numbers or stop words.

    Args:
        text: Input string.
        n: Number of augmented variants to produce.
        p: Per-word probability of attempting a replacement (default 0.15, the
            value that used to be hard-coded).

    Returns:
        A list of ``n`` strings, each the filtered words joined by single
        spaces with some words replaced by a result of ``get_synonyms_nltk``.
    """
    stop_words = set(stopwords.words("english"))
    words = [
        token.lower()
        for token in word_tokenize(text)
        if token.isalpha() and token.lower() not in stop_words
    ]

    augmented_texts = []
    for _ in range(n):
        augmented_text = words.copy()
        for i, word in enumerate(words):
            if random.random() < p:
                synonyms = get_synonyms_nltk(word)
                if synonyms:
                    augmented_text[i] = random.choice(synonyms)
        augmented_texts.append(' '.join(augmented_text))

    return augmented_texts

In [8]:
# Build synonym-replaced variants of every article labelled 'travel'. Each
# article yields a list of variants, which explode() spreads over one row each
# while repeating the source row index.
desired_category = 'travel'

selected_rows = df1.loc[df1['target'] == desired_category]
augmented_texts = selected_rows['text'].apply(synonym_replacement_nltk)

augmented_df_travel = (
    augmented_texts
    .explode()
    .to_frame(name='text')
    .assign(target=desired_category)
)

In [10]:
# Inspect the synonym-replaced 'travel' rows (the index repeats per source article).
augmented_df_travel

Unnamed: 0,text,target
751579,country music songwriters page find sites coun...,travel
751579,country music songwriters page find sites coun...,travel
751580,beautiful bordeaux dordogne la rochelle fly cr...,travel
751580,beautiful bordeaux dordogne la rochelle fly cr...,travel
751581,have sex breakfast travel blog thanksgiving br...,travel
...,...,...
763944,national development agency rdb announced augu...,travel
763945,visiting nyungwe park contains many monkey hab...,travel
763945,visiting nyungwe park check many monkey habita...,travel
763946,hosteling baby boomers many people believe hos...,travel
