In [None]:
!pip install loguru
!pip install nlpaug

In [None]:
import random
import os
import logging
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from tqdm.notebook import tqdm
from loguru import logger
# Seed Python's global `random` module; CommonLitWordAugmentor calls
# random.random() to decide which sentences get augmented.
# NOTE(review): only the stdlib RNG is seeded here — randomness inside the
# augmenter / language model is presumably not covered; confirm if exact
# run-to-run reproducibility is required.
random.seed(13)

# Register a loguru sink writing to this file (path is relative to the
# working directory).
logger.add("commonlit_nlp_aug.log")

## Original Data 

In [None]:
# Raw competition splits, read in full (nrows=None means no row limit).
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv", nrows=None)
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv", nrows=None)

# Independent copies that will receive the augmented text; the raw frames
# above are left untouched.
train_aug = train[['id', 'excerpt', 'target', 'standard_error']].copy()
test_aug = test[['id', 'excerpt']].copy()

# Prefix every id with "aug_" so augmented rows can be told apart from originals.
train_aug['id'] = train_aug['id'].map(lambda raw_id: "aug_" + str(raw_id))
test_aug['id'] = test_aug['id'].map(lambda raw_id: "aug_" + str(raw_id))

## Loading Augmenter

In [None]:
# Notebook-wide word-level augmenter: nlpaug's ContextualWordEmbsAug configured
# with the 'bert-base-cased' model path. CommonLitWordAugmentor below reads this
# global by name.
# NOTE(review): presumably loads (and, if not cached, downloads) the model
# weights here — confirm the runtime has network access or a local copy.
aug_word = naw.ContextualWordEmbsAug(model_path='bert-base-cased', verbose=1)

In [None]:
class CommonLitWordAugmentor:
    """Augments sentences in a paragraph with a given probability.

    A paragraph is split on "." and every fragment is independently selected
    with probability ``prob``. Selected, non-blank fragments are passed to a
    word-level augmenter and the fragments are re-joined with ".".

    Parameters
    ----------
    corpus : iterable of str, optional
        Default paragraphs used by :meth:`augment_corpus` when it is called
        without an explicit ``corpus``.
    augmenter : object, optional
        Any object exposing ``augment(text)``. When omitted, the notebook-level
        ``aug_word`` is looked up at call time (the original behaviour).
    """

    def __init__(self, corpus=None, augmenter=None):
        self.para = None
        # Previously the argument was accepted but discarded (always None).
        self.corpus = corpus
        self.augmenter = augmenter

    def _augment_sentence(self, sentence):
        """Run the augmenter on one sentence and always return a ``str``."""
        augmenter = aug_word if self.augmenter is None else self.augmenter
        result = augmenter.augment(sentence)
        # Depending on the nlpaug release, augment() returns either a string or
        # a list of candidates; a list would break the later ".".join().
        if isinstance(result, (list, tuple)):
            return result[0] if result else sentence
        return result

    def _augment_paragraph(self, para, prob, verbose=False, log_index=False,
                           progress=False):
        """Augment one paragraph; return ``(augmented_text, n_augmented)``."""
        sentences = para.split(".")
        indices = range(len(sentences))
        if progress:
            indices = tqdm(indices)
        count = 0
        for idx in indices:
            # Exactly one draw per fragment, blank or not, so the seeded RNG
            # stream selects the same fragments as before.
            selected = random.random() <= prob
            original = sentences[idx]
            # split(".") leaves empty / whitespace-only fragments (e.g. after
            # the final full stop); there is nothing in them to augment.
            if not selected or not original.strip():
                continue
            sentences[idx] = self._augment_sentence(original)
            if verbose:
                prefix = f"Sent idx {idx} | " if log_index else ""
                logger.info(f"{prefix}Original : {original}")
                logger.info(f"{prefix}Augmented : {sentences[idx]}")
            count += 1
        return ".".join(sentences), count

    def augment(self, para, prob=0.4, verbose=False):
        """Augment a single paragraph.

        Parameters
        ----------
        para : str
            Paragraph to augment; it is split into sentences on ".".
        prob : float, default 0.4
            Probability with which each sentence is selected for augmentation.
        verbose : bool, default False
            When true, log each original/augmented pair and the final count.

        Returns
        -------
        str
            The paragraph with the selected sentences replaced.
        """
        self.para = para
        augmented, count = self._augment_paragraph(
            para, prob, verbose=verbose, progress=True
        )
        if verbose:
            # Uses the logger (not print) to match augment_corpus.
            logger.info(f">> Augmented {count} sentences")
        return augmented

    def augment_corpus(self, corpus=None, prob=0.4, verbose=True):
        """Augment every paragraph of a corpus.

        Parameters
        ----------
        corpus : iterable of str, optional
            Paragraphs to augment. Falls back to the corpus given to the
            constructor when omitted.
        prob : float, default 0.4
            Probability with which each sentence is selected for augmentation.
        verbose : bool, default True
            When true, log each original/augmented pair and per-paragraph counts.

        Returns
        -------
        list of str
            One augmented paragraph per input paragraph, in input order.

        Raises
        ------
        ValueError
            If no corpus was passed here or to the constructor.
        """
        if corpus is None:
            corpus = self.corpus
        if corpus is None:
            raise ValueError(
                "No corpus to augment: pass `corpus` or provide one to the constructor."
            )
        self.corpus = corpus
        augmented_paras = []
        for para in tqdm(corpus, desc="Total Corpus Augmented"):
            augmented, count = self._augment_paragraph(
                para, prob, verbose=verbose, log_index=True
            )
            if verbose:
                logger.info(f">> Augmented {count} sentences")
            augmented_paras.append(augmented)
        return augmented_paras

In [None]:
# Single shared augmentor instance. It is created without a corpus; the texts
# are passed directly to augment_corpus() in the cell below.
augmentor = CommonLitWordAugmentor()

## Generating Word Augmentations On Sentence Splits

In [None]:
# Augment train first, then test: the call order determines how the draws of
# the seeded `random` module are consumed, so it is kept fixed.
train_augmented, test_augmented = [
    augmentor.augment_corpus(split['excerpt'], prob=0.6)
    for split in (train, test)
]

# Store the augmented text next to the original excerpt in each output frame.
train_aug['aug_excerpt'], test_aug['aug_excerpt'] = train_augmented, test_augmented

In [None]:
# Column order used for previewing and exporting the augmented training frame.
cols = ['id', 'excerpt', 'aug_excerpt', 'target', 'standard_error']
# Preview the first rows to eyeball original vs. augmented excerpts.
train_aug.loc[:, cols].head()

In [None]:
# Persist both augmented frames. `index` is documented as a bool; use the
# explicit index=False (instead of relying on None being falsy) so the row
# index is not written as an extra column.
train_aug[cols].to_csv("train_word_augmented.csv", index=False)
test_aug.to_csv("test_word_augmented.csv", index=False)