#### Install Dependencies

In [33]:
!pip install sklearn
!pip install torch
!pip install transformers



#### Load Necessary Imports

In [34]:
import json
from sklearn.metrics import accuracy_score
import torch
from transformers import (BertTokenizerFast,
                          BertForSequenceClassification,
                          DistilBertTokenizerFast,
                          DistilBertForSequenceClassification,
                          Trainer, TrainingArguments)

#### Parameters

In [35]:
# ---------------------------------------------------------------
# Model parameters.
# ---------------------------------------------------------------
# Pretrained checkpoint to fine-tune.
# Other models: https://huggingface.co/models.
modelName = "distilbert-base-uncased"
# Tokenizer class matching the model.
tokenizerType = DistilBertTokenizerFast
# Model class matching the task.
taskType = DistilBertForSequenceClassification
# Class names: '0' for not sarcastic, '1' for sarcastic.
targets = ['0', '1']

# ---------------------------------------------------------------
# Dataset parameters.
# ---------------------------------------------------------------
# Locations of the training, validation, and testing sets.
trainFileName = "/content/data/training.json"
validFileName = "/content/data/validation.json"
testFileName = "/content/data/testing.json"
# Max length for each comment.
maxLength = 128
# Sampling sizes for each split.
trainSampling = 10000
validSampling = 5000
testSampling = 2500
# Whether to sample the dataset.
sampling = True
# Whether to use parent comments.
useParentComments = True

# ---------------------------------------------------------------
# Training parameters.
# ---------------------------------------------------------------
# Where the training results and the logs are written.
outputDirectory = "./results"
loggingDirectory = "./logs"
# Number of epochs to train on.
epochs = 2
# Sizes of a training batch and of a validation batch.
trainBatch = 16
evalBatch = 20
# Number of warm-up steps and the weight decay value.
warmupSteps = 500
weightDecay = 0.01
# Number of logging steps and number of save steps.
loggingSteps = 20000
saveSteps = 20000
# Whether the best model is loaded after training.
loadBestModel = True
# Evaluation strategy handed to the trainer.
evaluationStrategy = "steps"

# ---------------------------------------------------------------
# Saving model parameters.
# ---------------------------------------------------------------
# Where to save the model after training.
modelDirectory = "./sarcasm"

#### Load The Data Set

In [None]:
def parseDataset(fileName, includeParent=None):
    """
    Read a JSON-lines file and return its labels and comments.

    Each non-blank line must be a JSON object with the keys "parent",
    "child", and "label". The label is taken from the first element of
    the "label" value and converted to an int.

    Args:
        fileName: Path of the JSON-lines file to read.
        includeParent: Whether to return the parent comments as well.
            When None (the default), the notebook-level flag
            `useParentComments` decides.

    Returns:
        (labels, parentText, childText) when parent comments are
        included, otherwise (labels, childText).
    """
    # Fall back to the notebook-level flag when the caller does not choose.
    if includeParent is None:
        includeParent = useParentComments

    parentText, childText, labels = [], [], []  # Instantiate containers.

    # Stream the file line by line instead of loading it all at once,
    # and read it as UTF-8 regardless of the platform default.
    with open(fileName, 'r', encoding='utf-8') as json_file:
        for dataEntry in json_file:
            # A blank line (e.g. a trailing newline) is not a record.
            if not dataEntry.strip():
                continue

            data = json.loads(dataEntry)  # Load the dictionary.

            # Construct the parent, child, and label
            # lists that will be returned.
            parentText.append(data["parent"])
            childText.append(data["child"])
            labels.append(int(data["label"][0]))

    # Return the data with the parent comment if requested.
    if includeParent:
        return labels, parentText, childText

    return labels, childText  # Return the data without the parent comment.


# Parse the three splits, in order: training, validation, testing.
trainData, validData, testData = map(
    parseDataset, (trainFileName, validFileName, testFileName)
)

#### Sample the Data for Testing

In [None]:
# When sampling is enabled, keep only the leading entries of every
# field (labels and comments) in each split, up to that split's limit.
if sampling:
    trainData, validData, testData = (
        [column[:limit] for column in split]
        for split, limit in (
            (trainData, trainSampling),
            (validData, validSampling),
            (testData, testSampling),
        )
    )

#### Tokenize the Data Sets and Convert to a PyTorch Friendly Format

In [None]:
# Build the tokenizer from the same pretrained checkpoint as the model.
tokenizer = tokenizerType.from_pretrained(
    modelName,
    do_lower_case=True,
)

class SarcasmDataset(torch.utils.data.Dataset):
    """
    Torch dataset wrapping a parsed split: the first field holds the
    labels and the remaining fields hold the comments to tokenize.
    """

    def __init__(self, dataset):
        """Tokenize every field after the first and keep the first as labels."""
        comments = dataset[1:]
        self.encodings = self.encode(comments)
        self.labels = dataset[0]

    def encode(self, dataset):
        """Run the notebook's tokenizer over the given comment fields."""
        # Truncate to maxLength and pad so all sequences share one length.
        options = {"truncation": True, "padding": True,
                   "max_length": maxLength}
        return tokenizer(*dataset, **options)

    def __getitem__(self, idx):
        """
        Return the sample at position idx as a dict of tensors, with
        the label stored under "labels" as a one-element tensor.
        """
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

    def __len__(self):
        """The dataset has one sample per label."""
        return len(self.labels)


# Tokenize the training and validation splits and wrap them as torch
# Datasets.
trainData, validData = map(SarcasmDataset, (trainData, validData))

#### Load the Bert Model

In [None]:
# Load the model specified above and move it to the GPU when one is
# available, falling back to the CPU so the cell also runs without CUDA.
model = taskType.from_pretrained(modelName, num_labels=len(targets)).to(
    "cuda" if torch.cuda.is_available() else "cpu")

#### Construct the Trainer to train the model.

In [None]:
def computeMetrics(prediction):
    """
    Compute the accuracy of a prediction object by comparing the
    highest-scoring class of each row against the label ids, using
    sklearn's "accuracy_score".
    """
    predictedClasses = prediction.predictions.argmax(-1)
    score = accuracy_score(prediction.label_ids, predictedClasses)
    return {'accuracy': score}


# Collect the training configuration from the parameters defined above.
trainingArguments = TrainingArguments(
    output_dir=outputDirectory,
    num_train_epochs=epochs,
    per_device_train_batch_size=trainBatch,
    per_device_eval_batch_size=evalBatch,
    warmup_steps=warmupSteps,
    weight_decay=weightDecay,
    logging_dir=loggingDirectory,
    load_best_model_at_end=loadBestModel,
    logging_steps=loggingSteps,
    save_steps=saveSteps,
    evaluation_strategy=evaluationStrategy,
)

# Build the trainer from the model, its arguments, both datasets,
# and the accuracy metric.
trainer = Trainer(
    model=model,
    args=trainingArguments,
    train_dataset=trainData,
    eval_dataset=validData,
    compute_metrics=computeMetrics,
)

#### Train the Model

In [None]:
# Fine-tune the model on the trainer's training dataset.
trainer.train()  # Train the model.

#### Evaluate the Model

In [None]:
# Run evaluation on the dataset the trainer was given as eval_dataset.
trainer.evaluate()  # Evaluate the model.

#### Save the Model & Tokenizer

In [None]:
# Write the model and the tokenizer to the same directory
# so that both can be reloaded from a single location.
model.save_pretrained(modelDirectory)  # Save the fine-tuned model.
tokenizer.save_pretrained(modelDirectory)  # Save the tokenizer.

#### Reload the Model

In [None]:
# Reload the saved model and tokenizer. The model goes to the GPU when
# one is available and to the CPU otherwise, so reloading does not
# require CUDA.
model = taskType.from_pretrained(modelDirectory, num_labels=len(targets)).to(
    "cuda" if torch.cuda.is_available() else "cpu")
tokenizer = tokenizerType.from_pretrained(modelDirectory)

In [None]:
def predict(*args):
    """
    Predict whether the given text is sarcastic (1) or not sarcastic (0).

    The positional arguments are forwarded unchanged to the tokenizer,
    so they may be a single text or a pair of texts.

    Returns:
        A tensor holding the predicted class index. For a single input
        this is a 0-dim tensor; when a batch of several inputs is
        passed, it holds one class index per input.
    """

    # Tokenize the text and place the inputs on the device the model
    # lives on, rather than assuming that CUDA is present.
    inputs = tokenizer(*args, padding=True, truncation=True,
                       max_length=maxLength,
                       return_tensors="pt").to(model.device)

    # Inference only: skip gradient tracking to save memory.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs[0].softmax(1)

    # Pick the most probable class for every row. An argmax over the
    # whole tensor would return a flattened index for batches of
    # more than one input.
    labels = probs.argmax(1)

    # Return whether the text is sarcastic (1) or not sarcastic (0),
    # keeping the 0-dim result for the single-input case.
    return labels[0] if labels.shape[0] == 1 else labels

In [None]:
accuracy = 0  # A value to hold the number of correctly predicted comments.
total = len(testData[0])  # The number of samples in the test set.

# Loop for all the data in the test set and count the correct
# predictions. The first field of each row is the label; the
# remaining fields are the texts handed to predict.
for label, *texts in zip(testData[0], *testData[1:]):
    # Convert the comparison result to a plain int so the counter
    # stays a Python number instead of becoming a tensor.
    accuracy += int(predict(*texts) == label)

# Report against the real test-set size; guard against an empty set.
ratio = accuracy / total if total else 0.0
print(f"The testing accuracy is {accuracy} / "
      f"{total} = {ratio}.")

# Labels are 0 or 1, so their sum is the number of sarcastic samples.
print(f"There are {sum(testData[0])} sarcastic samples in the test set.")