In [None]:
pip install pyvi

In [None]:
# Select the rows that need to be augmented

# Filter label 1,2 - train on ViHSD dataset

# Dataset analysis
import pandas as pd 

# Input: the full training set. Output: the pipe-delimited subset to augment.
DATA = 'drive/My Drive/CODE/HSD/dataset/train.csv'
DATA_HATE = 'drive/My Drive/CODE/HSD/dataset/aug/data_label_1(6).csv'

data = pd.read_csv(DATA, index_col=False)

# Keep only the rows whose label_id is 1.
# (To include another label, build a second mask the same way and add the
# resulting frame to the list passed to pd.concat.)
label1 = data[data['label_id'] == 1]
data_new = pd.concat([label1])

print(data_new)

# No header and no index are written; fields are separated by "|".
data_new.to_csv(DATA_HATE, sep="|", header=False, index=False)

# Text augmentation using EDA techniques


In [None]:
# Change these arguments to fit with your own data / project

class Argument:
    """Plain settings holder read by the augmentation cells below.

    All values are class attributes, so ``Argument()`` needs no arguments.
    """

    # Tuning knobs: requested number of augmented sentences per input sentence,
    # and the strength value handed to every augmentation operation.
    num_aug = 8
    alpha = 0.15

    # File locations: the pipe-delimited input and the generated output file.
    input = "drive/My Drive/CODE/HSD/dataset/aug/data_label_1(6).csv"
    output = "drive/My Drive/CODE/HSD/dataset/aug/augmented_dataset(6).txt"


args = Argument()

In [None]:
# Augmentation code adapted from: https://github.com/jasonwei20/eda_nlp

import random
from random import shuffle

random.seed(1)
import json


# stop words list
# The encoding is given explicitly: without it Python falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8 (the file
# name suggests Vietnamese text, i.e. non-ASCII content).
with open("drive/My Drive/CODE/HSD/vietnamese-stopwords.txt", "r", encoding="utf-8") as f:
    # One entry per line; only the newline is stripped, any other whitespace
    # in an entry is kept as-is.
    stop_words = [line.strip('\n') for line in f]

# cleaning up text
import re


def get_only_chars(line):
    """Return ``line`` unchanged.

    This is a deliberate no-op hook. An earlier version of this function
    lower-cased the text and replaced every character outside the 26 ASCII
    letters and the space with a space; that cleaning is disabled here
    (presumably because it would strip accented Vietnamese letters - confirm
    before re-enabling any cleaning).
    """
    return line


########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

# for the first time you use wordnet
# import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet


def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words with one of their synonyms.

    Args:
        words: list of tokens of the sentence; it is not modified.
        n: number of distinct words to replace before stopping.

    Returns:
        A new list of tokens. Every occurrence of a chosen word is replaced by
        the same randomly picked synonym. Words without synonyms are skipped.
    """
    new_words = words.copy()

    # De-duplicate while keeping sentence order. A plain set() would hand the
    # words to random.shuffle in an order that depends on per-process string
    # hash randomisation, so seeding `random` would not make runs repeatable.
    candidates = list(dict.fromkeys(word for word in words if word not in stop_words))
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:  # only replace up to n words
            break

    # A synonym may itself contain spaces; joining and re-splitting turns such
    # a multi-word synonym into separate tokens.
    return ' '.join(new_words).split(' ')


# def get_synonyms(word):
# 	synonyms = set()
# 	for syn in wordnet.synsets(word):
# 		for l in syn.lemmas():
# 			synonym = l.name().replace("_", " ").replace("-", " ").lower()
# 			synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
# 			synonyms.add(synonym)
# 	if word in synonyms:
# 		synonyms.remove(word)
# 	return list(synonyms)

# Default location of the Vietnamese synonym dictionary (JSON object mapping a
# word to an iterable of synonym strings).
WORDNET_VI_PATH = "drive/My Drive/CODE/HSD/word_net_vi.json"

# path -> {stripped word: [stripped synonyms, in file order]}
_WORDNET_CACHE = {}


def _load_wordnet_index(path):
    """Parse the synonym file at ``path`` once and return its lookup table."""
    index = _WORDNET_CACHE.get(path)
    if index is None:
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)
        index = {}
        for key, values in raw.items():
            # Keys that differ only by surrounding whitespace are merged.
            index.setdefault(key.strip(), []).extend(v.strip() for v in values)
        _WORDNET_CACHE[path] = index
    return index


def get_synonyms(word, wordnet_path=WORDNET_VI_PATH):
    """Return the synonyms of ``word`` listed in the synonym dictionary.

    The dictionary file is read and indexed on the first call only; later
    calls are a dictionary lookup instead of a re-parse and full scan.

    Args:
        word: the token to look up (compared against whitespace-stripped keys).
        wordnet_path: JSON file to use; defaults to ``WORDNET_VI_PATH``.

    Returns:
        A list of unique, whitespace-stripped synonyms in file order, never
        containing ``word`` itself. Empty when the word is unknown.
    """
    candidates = _load_wordnet_index(wordnet_path).get(word, ())
    # dict.fromkeys removes duplicates but keeps a stable order, so callers
    # that take the first element get the same result on every run.
    return [synonym for synonym in dict.fromkeys(candidates) if synonym != word]


########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Drop each word of ``words`` independently with probability ``p``.

    A one-word input is returned as-is (the same list object). If every word
    ends up being dropped, a list holding one randomly chosen original word
    is returned instead, so the result is never empty.
    """
    if len(words) == 1:
        return words

    # A word survives when its random draw exceeds p.
    survivors = [word for word in words if random.uniform(0, 1) > p]
    if survivors:
        return survivors

    # Everything was deleted: fall back to a single random word.
    return [words[random.randint(0, len(words) - 1)]]


########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    The input list is not modified. An empty input yields an empty copy
    without any swap attempt.
    """
    swapped = words.copy()
    if len(swapped) == 0:
        return swapped
    for _ in range(n):
        swapped = swap_word(swapped)
    return swapped


def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    A first index is drawn, then up to three attempts are made to draw a
    different second index. If all three collide with the first index, one
    more draw is made and ignored, and the list is returned untouched.

    Returns:
        The same list object that was passed in.
    """
    last = len(new_words) - 1
    first = random.randint(0, last)

    for _ in range(3):
        second = random.randint(0, last)
        if second != first:
            new_words[first], new_words[second] = new_words[second], new_words[first]
            return new_words

    # Fourth draw: its value is never used, but it is kept so the random
    # stream is consumed exactly as before.
    random.randint(0, last)
    return new_words


########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Return a copy of ``words`` after calling ``add_word`` on it ``n`` times.

    ``add_word`` modifies the copy in place; the caller's list is untouched.
    """
    extended = words.copy()
    for _ in range(n):
        add_word(extended)
    return extended


def add_word(new_words):
    """Insert a synonym of a random word of ``new_words`` into the list, in place.

    Up to ten random words are looked up with ``get_synonyms``. The first
    lookup (among the first nine) that yields synonyms wins: its first
    synonym is inserted before a randomly chosen existing position.
    Nothing is inserted for an empty list. Returns ``None``.
    """
    if len(new_words) == 0:
        return

    last = len(new_words) - 1
    for attempt in range(1, 11):
        picked = new_words[random.randint(0, last)]
        found = get_synonyms(picked)
        # NOTE(review): the tenth lookup is performed but its result is never
        # used, even when it finds synonyms - looks unintended; confirm.
        if attempt == 10:
            return
        if len(found) >= 1:
            # The position is drawn from 0..len-1, so the synonym is never
            # appended after the last word.
            new_words.insert(random.randint(0, last), found[0])
            return


########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Build augmented variants of ``sentence`` with the four EDA operations.

    Args:
        sentence: text whose words are separated by single spaces.
        alpha_sr: fraction of the word count used as the number of synonym
            replacements (at least 1).
        alpha_ri: same, for the number of random insertions.
        alpha_rs: same, for the number of random swaps.
        p_rd: per-word deletion probability for random deletion.
        num_aug: when >= 1, the maximum number of augmented sentences kept;
            when < 1, each candidate is kept with probability
            ``num_aug / number_of_candidates``.

    Returns:
        The kept augmented sentences (duplicates removed) followed by the
        original sentence as the last element, or an empty list when the
        sentence contains no words.
    """
    sentence = get_only_chars(sentence)
    # `!=` instead of `is not`: identity comparison against a string literal
    # depends on interning and emits a SyntaxWarning on Python 3.8+.
    words = [word for word in sentence.split(' ') if word != '']
    num_words = len(words)

    augmented_sentences = []
    if num_words == 0:
        return augmented_sentences

    num_new_per_technique = int(num_aug / 4) + 1
    n_sr = max(1, int(alpha_sr * num_words))
    n_ri = max(1, int(alpha_ri * num_words))
    n_rs = max(1, int(alpha_rs * num_words))

    # Applied in this order: synonym replacement, random insertion,
    # random swap, random deletion.
    operations = (
        lambda: synonym_replacement(words, n_sr),
        lambda: random_insertion(words, n_ri),
        lambda: random_swap(words, n_rs),
        lambda: random_deletion(words, p_rd),
    )
    for operation in operations:
        for _ in range(num_new_per_technique):
            augmented_sentences.append(' '.join(operation()))

    # Remove duplicates while keeping generation order. Going through set()
    # would feed shuffle() an order that varies between interpreter runs
    # (string hash randomisation), defeating the random seed.
    augmented_sentences = list(dict.fromkeys(augmented_sentences))
    augmented_sentences = [get_only_chars(candidate) for candidate in augmented_sentences]
    shuffle(augmented_sentences)

    # trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    # append the original sentence
    augmented_sentences.append(sentence)

    return augmented_sentences


# Output file: taken from args when set, otherwise "eda_<input file name>"
# placed next to the input file.
if not args.output:
    from os.path import dirname, basename, join

    output = join(dirname(args.input), 'eda_' + basename(args.input))
else:
    output = args.output

# Number of augmented sentences to generate per original sentence
# (falls back to 9 when args.num_aug is unset or zero).
num_aug = args.num_aug or 9

# How much to change each sentence (falls back to 0.1 when args.alpha is
# unset or zero).
alpha = args.alpha or 0.1


# generate more data with standard augmentation
def gen_eda(train_orig, output_file, alpha, num_aug=9):
    """Augment every record of a pipe-delimited file and write the result as CSV.

    Args:
        train_orig: path of the input file. Each record has the sentence in
            its first field and the label in its second field, separated by
            ``|`` (fields may be quoted with ``"``).
        output_file: path of the CSV file to create. It starts with the
            header ``free_text,label_id`` and holds one
            ``sentence,label`` row per sentence returned by ``eda``.
        alpha: value passed to ``eda`` as ``alpha_sr``, ``alpha_ri``,
            ``alpha_rs`` and ``p_rd``.
        num_aug: forwarded to ``eda``.

    Records that cannot be processed are printed and skipped; errors opening
    or writing the files propagate to the caller.
    """
    import csv  # local import so the function does not rely on another cell

    # newline='' is what the csv module expects for both reading and writing.
    with open(train_orig, 'r', encoding='utf-8', newline='') as source, \
            open(output_file, 'w', encoding='utf-8', newline='') as target:
        # csv.reader undoes the quoting applied to fields that contain the
        # delimiter, a quote or a line break; a plain split('|') would not.
        reader = csv.reader(source, delimiter='|')
        # csv.writer quotes sentences containing commas or quotes, so each
        # output row always parses back into exactly two fields.
        writer = csv.writer(target, lineterminator='\n')

        writer.writerow(["free_text", "label_id"])
        for parts in reader:
            try:
                sentence = parts[0]
                label = parts[1]
                aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha,
                                    num_aug=num_aug)
            except Exception as e:
                # Best effort: report the offending record and carry on.
                print(e)
                print(parts)
                continue
            writer.writerows((aug_sentence, label) for aug_sentence in aug_sentences)

    print(
        "generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(
            num_aug))


generated augmented sentences with eda for drive/My Drive/CODE/HSD/dataset/aug/data_label_1(6).csv to drive/My Drive/CODE/HSD/dataset/aug/augmented_dataset(6).txt with num_aug=8


## Main code. Run the cell below to generate new texts 

In [None]:
# main function. Run this cell to generate new data 
if __name__ == "__main__":
    # Generate augmented sentences and write them to a new file.
    # `output` (resolved above from args.output, with a fallback derived from
    # the input path) is used instead of args.output, so an unset
    # args.output no longer reaches gen_eda as an empty path.
    gen_eda(args.input, output, alpha=alpha, num_aug=num_aug)

# Results: concatenate the augmented data with the original training set

In [None]:
# TRAIN
# concatenate the original training set with the augmented data

import pandas as pd

DATA = 'drive/My Drive/CODE/HSD/dataset/train.csv'
DATA_AUG = 'drive/My Drive/CODE/HSD/dataset/aug/augmented_dataset(3).txt'

DATA_AUG_FINAL = 'drive/My Drive/CODE/HSD/dataset/aug/train_augmented_dataset(4).csv'

data = pd.read_csv(DATA, index_col=False)

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where it
# raises TypeError. on_bad_lines="warn" is its replacement: malformed rows are
# skipped and each one is reported.
data_hate = pd.read_csv(DATA_AUG, index_col=False, on_bad_lines="warn")

# keep=False drops every row whose free_text occurs more than once in the
# augmented file (no copy of a repeated text is kept).
data_hate = data_hate.drop_duplicates(subset="free_text", keep=False)

# Original rows first, augmented rows after.
data_aug = pd.concat([data, data_hate])

data_aug.to_csv(DATA_AUG_FINAL, index=False)

In [None]:
# Extra augmentation
import pandas as pd

DATA = 'drive/My Drive/CODE/HSD/dataset/train.csv'

DATA_AUG = 'drive/My Drive/CODE/HSD/dataset/aug/augmented_dataset(3).txt'
DATA_AUG_2 = 'drive/My Drive/CODE/HSD/dataset/aug/augmented_dataset(6).txt'

DATA_AUG_FINAL = 'drive/My Drive/CODE/HSD/dataset/aug/train_augmented_dataset(6).csv'

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where it
# raises TypeError. on_bad_lines="warn" is its replacement: malformed rows are
# skipped and each one is reported.
data_hate = pd.read_csv(DATA_AUG, index_col=False, on_bad_lines="warn")
data_hate_2 = pd.read_csv(DATA_AUG_2, index_col=False, on_bad_lines="warn")

# Merge both augmented files, then drop every row whose free_text occurs more
# than once across them (keep=False keeps no copy of a repeated text).
data_hate_final = pd.concat([data_hate, data_hate_2])
data_hate_final = data_hate_final.drop_duplicates(subset="free_text", keep=False)

# Original rows first, augmented rows after.
data = pd.read_csv(DATA, index_col=False)
data_aug = pd.concat([data, data_hate_final])

data_aug.to_csv(DATA_AUG_FINAL, index=False)