In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import os

# Must be set before any tokenizer is used in this kernel.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Root logger reports INFO and above; the "transformers" logger is limited to WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

def get_prediction(outputs):
    """Return the position of the largest value in ``outputs``.

    ``outputs`` may be any iterable of comparable values; it is materialised
    into a list first. When several entries share the maximum, the index of
    the first one is returned. An empty iterable raises ``ValueError``.
    """
    scores = list(outputs)
    return max(range(len(scores)), key=scores.__getitem__)

  from .autonotebook import tqdm as notebook_tqdm


### Dataset

In [2]:
# Name of the column used as the classification target (renamed to "label" further down).
# LABEL = "emotions" # fine-grained target: all 28 labels
LABEL = "emotion_category" # coarse target: positive, negative, ambiguous and neutral

In [3]:
# Emotion names, grouped by coarse sentiment category.
positive = [
    'admiration', 'amusement', 'approval', 'caring',
    'desire', 'excitement', 'gratitude', 'joy',
    'love', 'optimism', 'pride', 'relief',
]
negative = [
    'anger', 'annoyance', 'disappointment',
    'disapproval', 'disgust', 'embarrassment',
    'fear', 'grief', 'nervousness', 'remorse', 'sadness',
]
ambiguous = ['confusion', 'curiosity', 'realization', 'surprise']
neutral = ['neutral']

# Order matters: an emotion's fine-grained id is its position in this list.
labels = positive + negative + ambiguous + neutral

# (group, category id) pairs, checked in this order for every emotion.
category_groups = [(positive, 3), (negative, 2), (ambiguous, 0), (neutral, 1)]

# mapping:          emotion name -> fine-grained id (0..27)
# mapping_category: emotion name -> category id (0 ambiguous, 1 neutral, 2 negative, 3 positive)
mapping, mapping_category = {}, {}
for idx, emotion in enumerate(labels):
    mapping[emotion] = idx
    for group, category_id in category_groups:
        if emotion in group:
            mapping_category[emotion] = category_id
            break
    else:
        # Only reachable if an emotion belongs to none of the four groups.
        print("issue")

mapping.keys()

dict_keys(['admiration', 'amusement', 'approval', 'caring', 'desire', 'excitement', 'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief', 'anger', 'annoyance', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'fear', 'grief', 'nervousness', 'remorse', 'sadness', 'confusion', 'curiosity', 'realization', 'surprise', 'neutral'])

In [5]:
# Location of the GoEmotions CSV. Set the GOEMOTIONS_CSV environment variable to
# point somewhere else; the fallback is the original hard-coded local path.
path = os.environ.get(
    "GOEMOTIONS_CSV",
    "/Users/cha/Desktop/Code/nlp-codes/nlp-intent-classification/data/full_dataset/goemotions_1.csv",
)
df = pd.read_csv(path)

# idxmax returns the FIRST column when every label column of a row is 0, which
# would silently tag an unlabeled row as labels[0]. Drop such rows up front.
# NOTE(review): rows carrying several labels still collapse to the first active
# one in `labels` order — confirm this single-label reduction is intended.
df = df[df[labels].any(axis=1)].reset_index(drop=True)

# Name of the (first) active emotion column for each remaining row.
emotion_names = df[labels].idxmax(axis=1)

# Series.map performs an exact per-value dictionary lookup; every name comes
# from `labels`, so each one has an entry in both dictionaries.
df["emotion_category"] = emotion_names.map(mapping_category)
df["emotions"] = emotion_names.map(mapping)

df = df[["text", "emotions", "emotion_category"]].copy()
df.head(2)

Unnamed: 0,text,emotions,emotion_category
0,That game hurt.,22,2
1,>sexuality shouldn’t be a grouping category I...,0,3


In [6]:
# Expose the column chosen by LABEL under the generic name "label" and keep
# only the text and that target.
df = df.rename(columns={LABEL: "label"})[["text", "label"]].copy()
df.head()

Unnamed: 0,text,label
0,That game hurt.,2
1,>sexuality shouldn’t be a grouping category I...,3
2,"You do right, if you don't care then fuck 'em!",1
3,Man I love reddit.,3
4,"[NAME] was nowhere near them, he was by the Fa...",1


In [7]:
# Work on the first 100 rows only, split 80/20 with a fixed seed, and give
# both parts a fresh 0..n-1 index.
subset = df.iloc[:100]
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(subset, test_size=0.2, random_state=42)
)
len(train), len(test)

(80, 20)

In [None]:
from datasets import Dataset

# Wrap both pandas splits as `datasets.Dataset` objects.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train, test)
)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and ``truncation=True``;
    its result is returned unchanged.
    """
    encoded = tokenizer(examples["text"], truncation=True, padding="max_length")
    return encoded


# Apply the tokenizer to both splits in batched mode.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
def _as_torch_dataset(dataset):
    """Drop the "text" column, rename "label" to "labels", and set the "torch" format."""
    dataset = dataset.remove_columns(["text"]).rename_column("label", "labels")
    dataset.set_format("torch")
    return dataset


# NOTE: this rebinds both names, so the cell can only be run once per tokenization.
tokenized_train = _as_torch_dataset(tokenized_train)
tokenized_test = _as_torch_dataset(tokenized_test)

### Data augmentation

Considered Data augmentations :
- Characters :
    - KeyboardAug : substitute a character with a neighbouring key on the keyboard
    - RandomCharAug : insert / substitute / swap / delete char randomly
- Words :
    - WordEmbsAug : insert / substitute a word randomly by word similarity
    - TfIdfAug : same but uses TF IDF and not word2vec
    - ContextualWordEmbsAug : insert / substitute word by contextual word embedding
    - SynonymAug / AntonymAug
    - back_translation_aug
    

In [16]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# back_translation_aug = naw.BackTranslationAug(
#     from_model_name='facebook/wmt19-en-de', 
#     to_model_name='facebook/wmt19-de-en'
# )

In [17]:
# Start the augmented set as a copy of the training split, so rows appended
# later do not modify `train` itself.
augmented_train = train.copy()

In [29]:
# Only one sample is needed to prototype the augmenters: use the last row of
# `train`, i.e. the row a full pass over `train.index` would end on.
i = train.index[-1]
text, label = train.loc[i, "text"], train.loc[i, "label"]

In [None]:
# Prototype: generate keyboard-typo variants of the single sample held in `text`.
aug = nac.KeyboardAug()
augmented_text = aug.augment(text, n=3)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

# Every augmented variant keeps the label of the sample it was derived from.
added = pd.DataFrame({
    'text': augmented_text,
    'label': len(augmented_text)*[label]
})

# ignore_index=True renumbers the combined rows. Without it the appended rows
# keep their own 0..n-1 index and collide with existing index labels, so a
# label-based lookup such as `.loc[0]` would return several rows.
# NOTE: this cell appends to `augmented_train`; re-running it adds the same
# variants again.
augmented_train = pd.concat([augmented_train, added], ignore_index=True)

augmented_train.tail()

In [None]:
# Build the back-translation augmenter in the cell that uses it, so the cell
# runs on a fresh kernel instead of failing with NameError.
# NOTE(review): this loads two translation models by name, so expect a large
# download on first use.
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en'
)
back_translation_aug.augment(text)

### Training and evaluating the model

In [None]:
# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=5, overwrite_output_dir=True)

# Size the classification head to the target actually selected by LABEL:
# one output per emotion for "emotions", otherwise one per distinct category id.
num_labels = (
    len(labels) if LABEL == "emotions" else len(set(mapping_category.values()))
)

# Create a ClassificationModel
model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=num_labels,
    args=model_args,
    use_cuda=False
)

# Train the model on the un-augmented training split ("text" / "label" columns).
model.train_model(train)

In [None]:
# Evaluate the model on the held-out `test` split.
# eval_model returns three values, unpacked here as `result`, `model_outputs`
# and `wrong_predictions`.
# NOTE(review): presumably metrics, raw per-example outputs and the misclassified
# examples — confirm against the simpletransformers documentation.
result, model_outputs, wrong_predictions = model.eval_model(test)

In [None]:
# Display the first value returned by eval_model.
result

In [None]:
# (number of model outputs, number of wrong predictions, their difference).
# Presumably the difference is the count of correct predictions — TODO confirm
# that `wrong_predictions` holds exactly one entry per misclassified example.
len(model_outputs), len(wrong_predictions), len(model_outputs) - len(wrong_predictions)

### Making predictions

In [None]:
# Make predictions with the model on a single example sentence.
# predict returns a pair, unpacked here as `predictions` and `raw_outputs`;
# the last line displays both.
predictions, raw_outputs = model.predict(["Sam was a Wizard"])
predictions, raw_outputs