In [None]:
!pip install nlpaug
import nlpaug.augmenter.word as naw

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from transformers import RobertaTokenizer, RobertaForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [None]:
def seed_everything(seed = 0):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    with the same value, and writes ``PYTHONHASHSEED`` and
    ``TF_DETERMINISTIC_OPS`` into ``os.environ``.

    Args:
        seed: Value handed to every seeding call (defaults to 0).
    """
    # Environment flags are independent of the RNG calls, so set them together.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })

    # Apply the same seed to each library's global generator.
    for set_seed in (random.seed, np.random.seed, torch.manual_seed):
        set_seed(seed)

# Notebook-wide display and warning preferences.
pd.set_option('display.max_colwidth', 150)
warnings.filterwarnings('ignore')

# Seed all generators once, up front, with a single shared value.
seed = 0
seed_everything(seed)

# Import data

In [None]:
# Location of the fold-annotated training table (it carries the `kfold` column
# that the augmentation loop filters on).
base_dir = '../input/step-1-create-folds'
folds_csv = f'{base_dir}/train_folds.csv'
train = pd.read_csv(folds_csv)

# Preview the first rows as the cell output.
train.head()

# Augmentation

In [None]:
# Augmentation configuration: one contextual-embedding augmenter is built for
# every (action, model) combination below.
actions = ['insert', 'substitute']
model_paths = ['bert-base-cased', 'distilbert-base-cased', 'roberta-base', 'xlnet-base-cased']
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Augmenters are appended action-major, so models[i] corresponds to
# actions[i // len(model_paths)] and model_paths[i % len(model_paths)].
models = []
for action in actions:
    for model_path in model_paths:
        print(f'Model: {model_path} - Action: {action}')
        # Use the detected device instead of a hard-coded 'cuda' so the cell
        # also runs on CPU-only machines; `device.type` is the plain string
        # ('cuda' or 'cpu') form of the torch.device computed above.
        models.append(naw.ContextualWordEmbsAug(model_path = model_path, action = action, device = device.type))

In [None]:
def _augment_excerpt(augmenter, text):
    """Return one augmented version of `text`, unwrapped to a plain value.

    Args:
        augmenter: Object exposing ``augment(text)``.
        text: The excerpt to augment.

    Returns:
        Whatever ``augmenter.augment(text)`` produced, except that a
        one-element list is reduced to its single element.
    """
    augmented = augmenter.augment(text)
    # NOTE(review): recent nlpaug releases appear to return a one-element list
    # even for a single input string; unwrap it so the CSV column holds the
    # text itself rather than a list repr. Confirm against the installed version.
    if isinstance(augmented, list) and len(augmented) == 1:
        return augmented[0]
    return augmented


# Augment each fold separately and collect the per-fold frames.
augmented_folds = []
for fold in range(5):
    print('*' * 50)
    print(f'Fold: {fold}')

    # Work on an explicit copy: adding columns to a filtered slice of `train`
    # is chained assignment and is not guaranteed to behave consistently.
    data = train[train['kfold'] == fold].copy()

    # One new column per augmenter, named after its position in `models`.
    for i, model in enumerate(tqdm(models)):
        data[f'excerpt_augmented_{i}'] = data['excerpt'].apply(lambda text: _augment_excerpt(model, text))

    augmented_folds.append(data)

# Keep the combined frame bound to `train_augmented`; `to_csv` with a path
# returns None, so chaining it into the assignment would discard the data.
train_augmented = pd.concat(augmented_folds)
train_augmented.to_csv('train_augmented.csv', index = False)