#### Install Dependencies

In [53]:
# Mount Google Drive inside the Colab VM.  Every model checkpoint, dataset
# and cached prediction file used below is addressed under
# "/content/drive/MyDrive", so this cell has to run before any of them.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
!pip install numpy
!pip install transformers
!pip install sklearn



#### Imports

In [55]:
import json
import numpy as np
from sklearn.metrics import confusion_matrix
from transformers import (BertTokenizerFast,
                          BertForSequenceClassification,
                          DistilBertTokenizerFast,
                          DistilBertForSequenceClassification)

#### Parameters

In [56]:
# Configuration of every model that takes part in the ensemble.
#
# Each model is described exactly once, as one row of "modelSpecs":
#     (name, backbone, useParentComment, clean, maxLength)
# where "backbone" bundles the pretrained checkpoint id, the model class
# and the tokenizer class.  The parallel lists consumed by the rest of the
# notebook ("modelNames", "fileNames", "modelTypes", "taskTypes",
# "tokenizerTypes", "useParentComments", "clean" and "maxLengths") are
# derived from this table, so they cannot drift out of alignment.

# (checkpoint id, sequence-classification class, tokenizer class)
_BERT = ("bert-base-uncased",
         BertForSequenceClassification,
         BertTokenizerFast)
_DISTILBERT = ("distilbert-base-uncased",
               DistilBertForSequenceClassification,
               DistilBertTokenizerFast)
_DISTILBERT_CASED = ("distilbert-base-cased",
                     DistilBertForSequenceClassification,
                     DistilBertTokenizerFast)

modelSpecs = [
    # name                                 backbone           parent clean  maxLength
    ("Bert Unclean Parent 64",             _BERT,             True,  False, 64),
    ("Bert Unclean Parent 96",             _BERT,             True,  False, 96),
    ("Bert Unclean Parent 128",            _BERT,             True,  False, 128),
    ("DistilBert clean Child 64",          _DISTILBERT,       False, True,  64),
    ("DistilBert clean Child 96",          _DISTILBERT,       False, True,  96),
    ("DistilBert clean Parent 64",         _DISTILBERT,       True,  True,  64),
    ("DistilBert clean Parent 96",         _DISTILBERT,       True,  True,  96),
    ("DistilBert Unclean Child 64",        _DISTILBERT,       False, False, 64),
    ("DistilBert Unclean Child 96",        _DISTILBERT,       False, False, 96),
    ("DistilBert Unclean Parent 20",       _DISTILBERT,       True,  False, 20),
    ("DistilBert Unclean Parent 32",       _DISTILBERT,       True,  False, 32),
    ("DistilBert Unclean Parent 64",       _DISTILBERT,       True,  False, 64),
    ("DistilBert Unclean Parent 64 Cased", _DISTILBERT_CASED, True,  False, 64),
    ("DistilBert Unclean Parent 96",       _DISTILBERT,       True,  False, 96),
    ("DistilBert Unclean Parent 128",      _DISTILBERT,       True,  False, 128),
]

# Human-readable model names; also the directory name of each checkpoint.
modelNames = [spec[0] for spec in modelSpecs]

# Directory of the fine-tuned checkpoint for each model.
fileNames = [f"/content/drive/MyDrive/models/{modelName}/sarcasm" for modelName in modelNames]

# Pretrained checkpoint id, model class and tokenizer class per model.
modelTypes = [spec[1][0] for spec in modelSpecs]
taskTypes = [spec[1][1] for spec in modelSpecs]
tokenizerTypes = [spec[1][2] for spec in modelSpecs]

# Whether the model is fed (parent, child) pairs or the child comment only,
# whether it reads the clean or the unclean test set, and the maximum
# tokenized sequence length passed to its tokenizer.
useParentComments = [spec[2] for spec in modelSpecs]
clean = [spec[3] for spec in modelSpecs]
maxLengths = [spec[4] for spec in modelSpecs]

targets = ['0', '1']  # '0' for not sarcastic.  '1' for sarcastic.

# Location for the clean and unclean testing sets.
testFileName_unclean = "/content/drive/MyDrive/data/testing_unclean.jsonl"
testFileName_clean = "/content/drive/MyDrive/data/testing_clean.jsonl"

#### Load the Testing Data for Inference

In [57]:
def parseDataset(fileName):
    """
    Read a JSON Lines file and split it into labels and comment texts.

    Every non-blank line of the file is parsed as one JSON object that
    must provide the keys "parent", "child" and "label".  The label is
    taken as int(data["label"][0]), i.e. the first character (or first
    element) of the stored label converted to an integer.

    Parameters:
        fileName: path of the .jsonl file to read (decoded as UTF-8).

    Returns:
        A tuple (labels, parentText, childText) of three lists of equal
        length, in file order.  Note that the labels come first.

    Raises:
        OSError if the file cannot be opened, json.JSONDecodeError for a
        malformed line, and KeyError if a record lacks a required key.
    """
    parentText, childText, labels = [], [], []  # Instantiate containers.

    # JSON text is UTF-8, so decode it explicitly rather than relying on
    # the platform default.  Stream the file line by line instead of
    # holding every raw line in memory first.
    with open(fileName, 'r', encoding="utf-8") as json_file:
        for dataEntry in json_file:
            # Tolerate blank lines (e.g. a trailing newline at the end of
            # the file); json.loads would otherwise fail on them.
            if not dataEntry.strip():
                continue

            data = json.loads(dataEntry)  # Load the dictionary.

            # Construct the parent, child, and label
            # lists that will be returned.
            parentText.append(data["parent"])
            childText.append(data["child"])
            labels.append(int(data["label"][0]))

    # Return the labels first, then the parent and child comments.
    return labels, parentText, childText


# Parse both variants of the testing set (clean first, then unclean).
# Each result is a (labels, parentText, childText) tuple.
testData_clean, testData_unclean = (
    parseDataset(testFile)
    for testFile in (testFileName_clean, testFileName_unclean)
)

#### Function to Make Predictions

In [58]:
def predict(*args, tokenizer, maxLength, classifier=None):
    """
    Classify one example as sarcastic (1) or not sarcastic (0).

    Parameters:
        *args: the text(s) handed to the tokenizer positionally; either a
            single comment, or a (parent, child) pair of comments.
        tokenizer: callable tokenizer that produces the model inputs.
        maxLength: maximum tokenized length; longer inputs are truncated.
        classifier: the model to run.  When omitted, the notebook-level
            global "model" is used, which is what existing callers rely on.

    Returns:
        A 0-dimensional tensor holding the index of the highest-scoring
        class.  The argmax is taken over the flattened output, so the
        result is only meaningful when a single example is passed.
    """
    import torch  # Local import: this notebook has no top-level torch import.

    # Fall back to the global model so existing calls keep working.
    if classifier is None:
        classifier = model

    # Tokenize the text and move the encoded batch to the GPU.
    inputs = tokenizer(*args, padding=True, truncation=True,
                       max_length=maxLength, return_tensors="pt").to("cuda")

    # Inference only: disable autograd so no graph is built for the
    # forward pass (saves memory and time).
    with torch.no_grad():
        outputs = classifier(**inputs)

    # Turn the logits into class probabilities.
    probs = outputs[0].softmax(1)

    # Return whether the text is sarcastic (1)
    # or not sarcastic (0).
    return probs.argmax()

#### Load the models and Make Predictions

In [7]:
# Unpack the two testing sets.  Both are (labels, parentText, childText)
# tuples; the labels of the clean set are used as ground truth for every
# model, so the two sets must describe the same samples in the same order.
labelsClean, parentsClean, childrenClean = testData_clean
_, parentsUnclean, childrenUnclean = testData_unclean

# Fail loudly instead of silently truncating (and leaving rows of the
# prediction matrix at zero) if the two sets are not the same size.
if len(childrenClean) != len(childrenUnclean):
    raise ValueError(
        f"clean and unclean test sets differ in size: "
        f"{len(childrenClean)} vs {len(childrenUnclean)}")

# Initialize a container to store the predictions
# for each model for each testing sample
# (rows are samples, columns are models).
predictions = np.zeros((len(labelsClean), len(modelTypes)))

# Zip the files, the models, the type of the
# tasks, and the tokenizer types to loop on.
# Then enumerate the zipped variables.
zipped = enumerate(zip(fileNames, modelTypes, taskTypes, tokenizerTypes, maxLengths))

# Loop over every model and load each one in turn.
for modelIndex, (fileName, modelType, taskType, tokenizerType, maxLength) in zipped:
    print(f"model {modelIndex+1}/{len(modelTypes)}, {modelNames[modelIndex]} processing")

    # Load the saved model and tokenizer.  "model" is deliberately a
    # notebook-level name: predict() reads it as a global.
    model = taskType.from_pretrained(fileName,
                                     num_labels=len(targets)).to("cuda")
    tokenizer = tokenizerType.from_pretrained(fileName)

    # Which inputs this model consumes depends only on the model, not on
    # the sample, so decide once per model rather than once per sample.
    if clean[modelIndex]:
        parents, children = parentsClean, childrenClean
    else:
        parents, children = parentsUnclean, childrenUnclean

    if useParentComments[modelIndex]:
        samples = zip(parents, children)            # (parent, child) pairs.
    else:
        samples = ((child,) for child in children)  # Child comment only.

    # Run every test sample through this model and
    # store the predicted class in the predictions matrix.
    for dataIndex, text in enumerate(samples):
        prediction = predict(*text, tokenizer=tokenizer, maxLength=maxLength)
        predictions[dataIndex, modelIndex] = prediction

model 1/15, Bert Unclean Parent 64 processing
model 2/15, Bert Unclean Parent 96 processing
model 3/15, Bert Unclean Parent 128 processing
model 4/15, DistilBert clean Child 64 processing
model 5/15, DistilBert clean Child 96 processing
model 6/15, DistilBert clean Parent 64 processing
model 7/15, DistilBert clean Parent 96 processing
model 8/15, DistilBert Unclean Child 64 processing
model 9/15, DistilBert Unclean Child 96 processing
model 10/15, DistilBert Unclean Parent 20 processing
model 11/15, DistilBert Unclean Parent 32 processing
model 12/15, DistilBert Unclean Parent 64 processing
model 13/15, DistilBert Unclean Parent 64 Cased processing
model 14/15, DistilBert Unclean Parent 96 processing
model 15/15, DistilBert Unclean Parent 128 processing


#### Save The Prediction Matrix

In [8]:
# Save the prediction matrix as a npy file.
# with open("/content/drive/MyDrive/ensemble/predictions.npy", 'wb') as file:
#     np.save(file, predictions)

#### Load The Prediction Matrix

In [59]:
# Restore the cached prediction matrix (one row per test sample,
# one column per model) from the saved npy file.
predictions = np.load("/content/drive/MyDrive/ensemble/predictions.npy")

#### Calculate Confusion Matrix

In [60]:
def printConfusionMatrix(true, pred):
    """
    Compute the binary confusion matrix of the given labels and
    print the precision and recall of both classes.

    Parameters:
        true: ground-truth labels, 0 (not sarcastic) or 1 (sarcastic).
        pred: predicted labels, aligned with "true".

    Returns:
        None; the four metrics are printed.
    """
    # Pin the label set so the matrix is always 2x2, even when one of the
    # classes never occurs in "true" or "pred"; otherwise the unpacking
    # below would fail on a smaller matrix.
    (tn, fp), (fn, tp) = confusion_matrix(true, pred, labels=[0, 1])

    # Print precisions and recalls.
    print(f"Positive Precision: {tp/(tp+fp)}")
    print(f"Negative Precision: {tn/(tn+fn)}")
    print(f"Positive Recall: {tp/(tp+fn)}")
    print(f"Negative Recall: {tn/(tn+fp)}")

#### Calculate Accuracy

In [64]:
# Apply a weighting scheme.  This can be changed.  "weights" is a NumPy
# array with one entry per model, and the entries must sum to 1.
# With the values below every model gets slightly less than 1/14, except
# model index 2, which gets slightly more, and model index 11, which is
# given weight 0 and therefore does not vote.
f = 1/14
v = 0.001
weights = np.array([f-v,f-v,f+13*v,f-v,f-v,f-v,f-v,f-v,f-v,f-v,f-v,0,f-v,f-v,f-v])

# Ground-truth labels as an array so the comparisons can be vectorized.
trueLabels = np.asarray(testData_clean[0])

# Sanity checks: one prediction row per labelled sample, one column and
# one weight per model, and weights that form a valid weighting.
assert predictions.shape == (len(trueLabels), len(modelTypes)), (
    f"prediction matrix has shape {predictions.shape}, expected "
    f"{(len(trueLabels), len(modelTypes))}")
assert weights.shape == (len(modelTypes),), "need exactly one weight per model"
assert np.isclose(weights.sum(), 1.0), "weights must sum to 1"

# Number of correctly classified samples for each model on its own.
accuracies = (predictions == trueLabels[:, None]).sum(axis=0).tolist()

# Loop for all the models and print the metrics for each model.
# for index, accuracy in enumerate(accuracies):
#     print(modelNames[index])
#     print(f"The accuracy is {accuracy} / "
#           f"{len(testData_clean[0])} = {accuracy / len(testData_clean[0])}")
#     printConfusionMatrix(testData_clean[0], predictions[:, index].T)
#     print()

# Weighted vote of the ensemble: a sample is predicted sarcastic (1.0)
# when the weights of the models voting "sarcastic" add up to more
# than 0.5, and not sarcastic (0.0) otherwise.
modelPredictions = (predictions @ weights > 0.5).astype(float)

# Number of samples the ensemble classified correctly.
accuracy = int((modelPredictions == trueLabels).sum())

# Print the accuracy of the ensemble model.
print(f"Ensembling these {len(modelTypes)} models yield an "
      f"accuracy of {accuracy} / {len(testData_clean[0])} = "
      f"{accuracy / len(testData_clean[0])}")
printConfusionMatrix(testData_clean[0], modelPredictions)

Ensembling these 15 models yield an accuracy of 80817 / 101082 = 0.7995192022318514
Positive Precision: 0.8119009010858187
Negative Precision: 0.7879991597120104
Positive Recall: 0.7808551800379027
Negative Recall: 0.8182683536270972
