The purpose of this script is to prepare the raw text data for ingestion by the LSTM model. There are several steps.
First, the raw data is read into two lists, the original and compressed text (\_read_leg_compressions).
In the extraction phase:

 - the text is tokenised
 - a vocabulary file is built
 - labels are derived
 
 
The first step of the LSTM model is to lookup word2vec embeddings for the text inputs. Rather than provide the model with all of the word2vec embeddings (which are around 3.5GB when held in memory), we extract the vocabulary of the text inputs and only provide the embeddings associated with that vocabulary.

The Word2Vec model can be downloaded from <https://code.google.com/archive/p/word2vec/>

In [None]:
import csv
import pickle
import spacy
import json
import collections

In [None]:
# Load the spaCy English pipeline once at module level; extract() below calls it
# to tokenise every sentence. The parser component is disabled in this load.
nlp = spacy.load('en_core_web_md', disable=['parser'])

In [None]:
from gensim.models import KeyedVectors, Word2Vec

In [None]:
import collections
import codecs
import tensorflow as tf

In [None]:
# Function to read the inputs

def _read_leg_compressions(filename):
    """Load a pickled collection of sentence/compression records.

    The pickle is expected to hold a mapping whose values each provide a
    'full_text' and a 'compressed_text' entry.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files from a trusted source.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        A ``(full_sents, compressed)`` tuple of two parallel lists, in the
        iteration order of the pickled mapping.
    """
    with open(filename, 'rb') as handle:
        records = pickle.load(handle)

    # Read both fields of a record before moving to the next one, exactly
    # as a single pass over the mapping would.
    pairs = [
        (records[key]['full_text'], records[key]['compressed_text'])
        for key in records
    ]
    full_sents = [original for original, _ in pairs]
    compressed = [shortened for _, shortened in pairs]
    return full_sents, compressed

In [None]:
# Function to process the raw text inputs

def _label_tokens(token_texts, target_texts):
    """Greedily mark which tokens reproduce ``target_texts`` in order.

    Walks ``token_texts`` once; a token gets "1" when it equals the next
    not-yet-matched entry of ``target_texts`` and "0" otherwise.

    Returns:
        A list of "0"/"1" strings with exactly one entry per token.
    """
    labels = []
    matched = 0
    for text in token_texts:
        if matched < len(target_texts) and text == target_texts[matched]:
            matched += 1
            labels.append("1")
        else:
            labels.append("0")
    return labels


def extract(full_sents, compressed, text_out, labels_out, vocab_out, text_whole):
    """Tokenise sentence pairs and write text, label and vocabulary files.

    Every sentence in ``full_sents`` and its counterpart in ``compressed`` is
    tokenised with the module-level ``nlp`` pipeline. Each token of the full
    sentence is labelled "1" if it is kept in the compression (matched
    greedily, in order) and "0" otherwise. Sentences for which no token could
    be matched are skipped and reported.

    Args:
        full_sents: List of original sentences.
        compressed: List of compressed sentences, parallel to ``full_sents``.
        text_out: Path of the file receiving one space-joined token line per
            kept sentence.
        labels_out: Path of the file receiving the matching space-joined
            "0"/"1" label line per kept sentence.
        vocab_out: Path of the file receiving one vocabulary entry per line
            (four special tokens first, then every distinct token text in
            first-seen order, including tokens of skipped sentences).
        text_whole: Path of the file receiving the untokenised text of each
            kept sentence.

    Returns:
        The list of sentences from ``full_sents`` that produced no "1" label.
    """
    # A dict keeps insertion order and drops duplicates as we go, so the
    # vocabulary never holds more than one entry per distinct token.
    vocab = dict.fromkeys(["<unk>", "</S>", "</l>", "<l>"])
    bad_parse_count = 0
    bad_inputs = []

    with open(text_out, "w", encoding="utf-8") as text_file, \
            open(labels_out, "w", encoding="utf-8") as labels_file, \
            open(text_whole, "w", encoding="utf-8") as whole_file:
        for i, sentence in enumerate(full_sents):
            doc_whole = nlp(sentence)
            doc_compressed = nlp(compressed[i])

            # NOTE(review): tokens are later joined with single spaces; this
            # assumes token texts contain no whitespace themselves - confirm.
            text_data = [token.text for token in doc_whole]
            target = [token.text for token in doc_compressed]
            vocab.update(dict.fromkeys(text_data))

            label_data = _label_tokens(text_data, target)

            # Nothing matched: the usual cause is that the compression starts
            # with a capitalised word that is lower-case in the original, so
            # retry with the first compressed token lower-cased.
            # ``target`` is checked first so an empty compression is counted
            # as a bad input instead of raising IndexError on
            # ``doc_compressed[0]``. No separate "first token is absent from
            # the original" test is needed: with no "1" label, no original
            # token equals the first compressed token.
            if "1" not in label_data and target:
                lowered_first = doc_compressed[0].lower_
                # Cheap substring pre-check before re-running the alignment.
                if lowered_first in doc_whole.text:
                    label_data = _label_tokens(
                        text_data, [lowered_first] + target[1:]
                    )

            # If there is still no match, skip the sentence but record it.
            if "1" in label_data:
                text_file.write(" ".join(text_data) + "\n")
                labels_file.write(" ".join(label_data) + "\n")
                whole_file.write(doc_whole.text + "\n")
            else:
                bad_parse_count += 1
                print(bad_parse_count)
                bad_inputs.append(sentence)
                print("Bad index:")
                print(i)

            # Sanity check: there must be exactly one label per token.
            if len(label_data) != len(text_data):
                print("Bugger")
                print(i)

    # Written only after every sentence was processed successfully.
    with open(vocab_out, "w", encoding="utf-8") as vocab_file:
        vocab_file.writelines(word + "\n" for word in vocab)

    return bad_inputs

In [None]:
# Run the extraction for every split: train, val and test, plus the targetted
# train and val sets. Each job pairs the pickle to read with the four output
# files handed to extract(), in the order text, labels, vocab, original text.

_split_jobs = [
    ("",  # Data path
     ("leg_train_text.txt", "leg_train_label.txt",
      "leg_train_vocab.txt", "leg_train_original.txt")),
    ("",  # Data path
     ("leg_val_text.txt", "leg_val_label.txt",
      "leg_val_vocab.txt", "leg_val_original.txt")),
    ("",  # Data path
     ("leg_test_text.txt", "leg_test_label.txt",
      "leg_test_vocab.txt", "leg_test_original.txt")),
    ("",  # Data path
     ("leg_train_targetted_text.txt", "leg_train_targetted_label.txt",
      "leg_train_targetted_vocab.txt", "leg_train_targetted_original.txt")),
    ("",  # Data path
     ("leg_val_targetted_text.txt", "leg_val_targetted_label.txt",
      "leg_val_targetted_vocab.txt", "leg_val_targetted_original.txt")),
]

for _data_path, _output_files in _split_jobs:
    full_sents, compressed = _read_leg_compressions(_data_path)
    extract(full_sents, compressed, *_output_files)

In [None]:
# Merge the per-split vocab files into a single de-duplicated vocab file.
# Lines are kept with their trailing newline, so they can be written back
# unchanged; first-seen order is preserved.

vocab_files = [
    "leg_train_vocab.txt",
    "leg_val_vocab.txt",
    "leg_test_vocab.txt",
    "leg_train_targetted_vocab.txt",
    "leg_val_targetted_vocab.txt",
]

total_vocab = []
for vocab_path in vocab_files:
    with open(vocab_path, 'r', encoding='utf-8') as vocab_in:
        # Iterating the handle yields one line (one word) at a time.
        total_vocab.extend(vocab_in)

# The Counter is only used for its keys: each distinct line, once.
counter = collections.Counter(total_vocab)

with open("total_vocab.txt", 'w', encoding='utf-8') as vocab_merged:
    vocab_merged.writelines(counter)

In [None]:
# Load the pre-trained word2vec vectors with gensim. The path below is a blank
# placeholder and must be filled in before this cell is run; binary=True tells
# the loader that the file is in the binary word2vec format.

word2vec_path = "" # Path to the word2vec model file (placeholder: set before running)
w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# Look up the word2vec embedding of every word in the combined vocab.
#  - leg_embeddings.txt receives one line per word that has an embedding:
#    the word followed by its vector components, space separated.
#  - leg_embeddings_vocab.txt receives the four special tokens first, then
#    the same words in the same order.
#  - Words without an embedding are collected in out_of_vocab (for tracking
#    purposes only).

out_of_vocab = []
with open("total_vocab.txt", "r", encoding="utf-8") as vocab_file, \
        open("leg_embeddings.txt", "w", encoding="utf-8") as e_file, \
        open("leg_embeddings_vocab.txt", "w", encoding="utf-8") as ev_file:
    # The special tokens are listed in the embeddings vocab only; no vector
    # line is written for them here.
    for special in ("<unk>", "</S>", "</l>", "<l>"):
        ev_file.write(special + "\n")

    for line in vocab_file:
        # Strip only the line terminator; slicing off the last character
        # would eat a real character if the final line has no newline.
        word = line.rstrip("\n")

        # Keep the try body to the lookup alone so that only a genuinely
        # missing word is treated as out-of-vocabulary.
        try:
            vector = w2v_model[word]
        except KeyError:
            out_of_vocab.append(word)
            continue

        # Same layout as before: "word v1 v2 ... vN \n" (the newline is
        # joined as a final field, so it is preceded by a space).
        e_file.write(" ".join([word, *map(str, vector), "\n"]))
        ev_file.write(word + "\n")