#### Imports

In [51]:
import csv
import json
import random
import re

#### Parameters for Data Split

In [52]:
# Cumulative cut-off fractions for the training, validation,
# and testing sets: each value is the upper bound (as a share
# of the data set) at which the corresponding set ends.
splits = [0.7, 0.9, 1]

# Running totals of the parent and child comment lengths;
# they are divided by the sample count to obtain the averages.
averageParentLength, averageChildLength = 0, 0

In [53]:
class Preprocesser(object):
    """Normalise text through a configurable chain of clean-up steps.

    The text is always lowercased and has its whitespace runs collapsed
    to a single space; URL, punctuation, and digit removal are optional.
    """

    def __init__(self, punctuation=True, url=True, number=True):
        """Store which optional clean-up steps ``apply`` should run.

        Args:
            punctuation: When true, strip punctuation characters.
            url: When true, strip URLs.
            number: When true, strip digits.
        """
        self.punctuation = punctuation
        self.url = url
        self.number = number

    def apply(self, text):
        """Return ``text`` after running every enabled clean-up step.

        Steps run in a fixed order: lowercase, URL removal, punctuation
        removal, digit removal, and finally whitespace collapsing.
        """
        cleaned = self._lowercase(text)

        # Pair each flag with its transformation. URLs go first so that
        # they are still intact when the URL pattern is matched.
        steps = (
            (self.url, self._remove_url),
            (self.punctuation, self._remove_punctuation),
            (self.number, self._remove_number),
        )
        for enabled, step in steps:
            if enabled:
                cleaned = step(cleaned)

        # Collapse every whitespace run into one space. Leading and
        # trailing whitespace is collapsed too, but not stripped.
        return re.sub(r'\s+', ' ', cleaned)

    def _remove_punctuation(self, text):
        """Drop every character that is neither a word character nor whitespace."""
        stripped = re.sub(r'[^\w\s]', '', text)
        return stripped

    def _remove_url(self, text):
        """Drop every whitespace-delimited token fragment starting at ``http``."""
        stripped = re.sub(r'http\S+', '', text)
        return stripped

    def _remove_number(self, text):
        """Drop every run of ASCII digits."""
        stripped = re.sub(r'[0-9]+', '', text)
        return stripped

    def _lowercase(self, text):
        """Return the lowercased form of ``text``."""
        lowered = text.lower()
        return lowered

#### Parse the Data

In [54]:
data_clean = []  # Container to store all preprocessed samples.
data_unclean = []  # Container to store the same samples, unprocessed.
filename = "train-balanced-sarcasm.csv"  # CSV file to parse.
processer = Preprocesser()  # Preprocessor with every transformation on.

# Start the running totals from zero in this cell so that
# re-running it does not add onto the totals of an earlier run.
averageParentLength = 0
averageChildLength = 0

# Open the file and parse the data.  newline='' hands newline
# handling to the csv module, which keeps line breaks embedded
# in quoted fields intact.
with open(filename, 'r', newline='') as csvfile:
    rows = csv.reader(csvfile)

    # Skip the first row as this contains column labels.
    # The default keeps an empty file from raising StopIteration.
    next(rows, None)

    for row in rows:
        # Column 9 is used as the parent comment and
        # column 1 as the child comment.
        parent = processer.apply(row[9])
        child = processer.apply(row[1])

        # Accumulate the comment lengths in space-separated tokens.
        # NOTE: split(' ') also counts the empty strings left by a
        # leading or trailing space, and yields one token for ''.
        averageParentLength += len(parent.split(' '))
        averageChildLength += len(child.split(' '))

        # Construct the clean data dictionary.
        data_clean.append({"parent": parent,
                           "child": child,
                           "label": [row[0]]})

        # Construct the unclean data dictionary.
        data_unclean.append({"parent": row[9],
                             "child": row[1],
                             "label": [row[0]]})

# Count the samples directly instead of relying on the loop index,
# which is undefined for an empty file.
numSamples = len(data_clean)

if numSamples:
    # Shuffle the clean and unclean samples together so that both
    # containers stay aligned position by position.
    zipped = list(zip(data_clean, data_unclean))
    random.shuffle(zipped)  # Shuffle the data.
    data_clean, data_unclean = zip(*zipped)  # Unpack the data.

# Guard the averages against a data set without any samples.
meanParentLength = averageParentLength / numSamples if numSamples else 0
meanChildLength = averageChildLength / numSamples if numSamples else 0

# Print number of datapoints and
# the average lengths nicely.
print(f"There are {numSamples} samples in the data set.")
print(f"The average parent comment length is {meanParentLength}")
print(f"The average child comment length is {meanChildLength}")

There are 1010826 samples in the data set.
The average parent comment length is 24.039907956463328
The average child comment length is 10.363371143995108


#### Add the data to Training, Validation, and Testing Sets

In [55]:
def splitData(data):
    """
    Partition data, in its existing order, into training,
    validation, and testing sets.

    The module-level ``splits`` list supplies the cumulative
    fractions at which the training and validation sets end;
    everything after that goes to the testing set.

    Returns a list of three lists:
    [training, validation, testing].
    """

    # Turn the cumulative fractions into
    # (possibly fractional) index cut-offs.
    size = len(data)
    cuts = [size * fraction for fraction in splits]

    training, validation, testing = [], [], []

    # Route every datapoint to the set
    # that its position falls into.
    for position, sample in enumerate(data):
        if position < cuts[0]:
            bucket = training
        elif position < cuts[1]:
            bucket = validation
        else:
            bucket = testing
        bucket.append(sample)

    return [training, validation, testing]

# Partition the clean and the unclean data.  Both are cut
# at the same positions, so a sample and its unprocessed
# counterpart land in the same set.
dataSplits_clean = splitData(data=data_clean)
dataSplits_unclean = splitData(data=data_unclean)

#### Save the Datasets as JSONL Files

In [56]:
def _writeJsonLines(stem, records):
    """Write records to ./<stem>.jsonl, one JSON document per line."""
    with open(f"./{stem}.jsonl", "w") as handle:
        for record in records:
            json.dump(record, handle)  # Serialize the entry.
            handle.write("\n")  # Terminate the entry's line.


def saveData(dataSplits, fileNames):
    """
    Save each data split as a JSONL file in the current directory.

    dataSplits holds the training, validation, and testing data;
    fileNames holds the matching file names without extension.
    The two are paired up in order, and each file is created or
    overwritten as "./<name>.jsonl".
    """

    # Pair every split with its file name and write it out.
    for records, stem in zip(dataSplits, fileNames):
        _writeJsonLines(stem, records)

# Build the file names for the clean and unclean data sets:
# one name per set, in training, validation, testing order.
fileNames_clean = [f"{stage}_clean" for stage in ("training", "validation", "testing")]
fileNames_unclean = [f"{stage}_unclean" for stage in ("training", "validation", "testing")]

# Write the clean and then the unclean data splits to disk.
saveData(dataSplits=dataSplits_clean, fileNames=fileNames_clean)
saveData(dataSplits=dataSplits_unclean, fileNames=fileNames_unclean)

NameError: name 'fileNames_' is not defined