The purpose of this script is to split the raw legislative compressions dataset into train, validation and test sets. The order of the legislative compressions dataset has not been randomised before now, so we do so here. The script also defines the split for the 'targetted' train and validation sets.

In [1]:
import csv
import numpy as np
import pickle

In [2]:
# Path of the tab-separated legislative compressions file that this script reads.
input_file = 'legislative_compressions_tsv.txt'

In [3]:
# Read the data into a dictionary

def _read_leg_compressions(filename):
    """Read the tab-separated legislative compressions file into a dictionary.

    The first row of the file is treated as a header and discarded. Every
    following non-blank row must provide at least six columns, in this order:
    legislative reference, uncompressed text, compressed text, targetted
    sub-set flag, fragment/whole-sentence flag and stand-alone/concatenation
    flag. A single trailing newline is removed from the uncompressed and the
    compressed text.

    Args:
        filename: Path of the tab-separated input file.

    Returns:
        A dict mapping a running integer index (starting at 0) to a dict with
        the keys 'reference', 'full_text', 'compressed_text', 'target',
        'fragment' and 'concat'.

    Raises:
        IndexError: If a non-blank row has fewer than six columns.
    """
    full_data = {}
    with open(filename, newline='') as tabfile:
        leg_reader = csv.reader(
            tabfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        next(leg_reader, None)  # skip the header row (tolerates an empty file)
        for row in leg_reader:
            if not row:
                # csv.reader yields [] for a blank line; there is no record
                # to store, so skip it instead of failing on row[0].
                continue

            full_text = row[1]  # uncompressed text
            compressed_text = row[2]  # compressed text

            # endswith() is safe on empty strings, unlike indexing with [-1].
            if full_text.endswith("\n"):
                full_text = full_text[:-1]
            if compressed_text.endswith("\n"):
                compressed_text = compressed_text[:-1]

            # Keys are consecutive integers over the rows actually stored.
            full_data[len(full_data)] = {
                'reference': row[0],  # legislative reference
                'full_text': full_text,
                'compressed_text': compressed_text,
                'target': row[3],  # entry part of targetted sub-set
                'fragment': row[4],  # fragment or whole sentence
                'concat': row[5],  # stand alone provision or concatenation
            }
    return full_data

In [4]:
# Load the records from the input file, keyed by a running integer index starting at 0.
full_data = _read_leg_compressions(input_file)

In [5]:
# Used to get some random indexes

# A fixed seed makes the shuffle -- and therefore the train/val/test split --
# reproducible: re-running the script regenerates exactly the same partition
# instead of silently moving rows between the training and test sets.
SPLIT_SEED = 0

idx = list(range(len(full_data)))
# A dedicated RandomState leaves NumPy's global random state untouched.
np.random.RandomState(SPLIT_SEED).shuffle(idx)

In [6]:
# Train, val and test split

# Carve up the shuffled indexes: the first 750 for training, the next 100 for
# validation and whatever remains for testing.
train_data, val_data, test_data = idx[:750], idx[750:850], idx[850:]

# Each split maps the original record index to its record in full_data.
train_dict = {row_id: full_data[row_id] for row_id in train_data}
val_dict = {row_id: full_data[row_id] for row_id in val_data}
test_dict = {row_id: full_data[row_id] for row_id in test_data}

In [8]:
# Write pickle outputs

train_file = "leg_train_data.pickle"
val_file = "leg_val_data.pickle"
test_file = "leg_test_data.pickle"

# Serialise each split to its own file, using the highest pickle protocol
# available.
for out_path, split in (
    (train_file, train_dict),
    (val_file, val_dict),
    (test_file, test_dict),
):
    with open(out_path, 'wb') as f:
        pickle.dump(split, f, pickle.HIGHEST_PROTOCOL)

In [9]:
# Targetted train and val split

# Keep only the records flagged "Yes" in the 'target' field, reusing the
# train/validation index lists so the targetted sets are subsets of them.
train_targetted_dict = {
    row_id: full_data[row_id]
    for row_id in train_data
    if full_data[row_id]['target'] == "Yes"
}
val_targetted_dict = {
    row_id: full_data[row_id]
    for row_id in val_data
    if full_data[row_id]['target'] == "Yes"
}

In [10]:
# Write pickle outputs

targetted_train_file = "leg_targetted_train_data.pickle"
targetted_val_file = "leg_targetted_val_data.pickle"

# Serialise both targetted subsets, using the highest pickle protocol available.
for out_path, subset in (
    (targetted_train_file, train_targetted_dict),
    (targetted_val_file, val_targetted_dict),
):
    with open(out_path, 'wb') as f:
        pickle.dump(subset, f, pickle.HIGHEST_PROTOCOL)