In [20]:
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#!pip install pyTsetlinMachineCUDA spacy pycuda unidecode
#!python3 -m spacy download en_core_web_sm
from PyTsetlinMachineCUDA.tm import MultiClassTsetlinMachine
from time import time
import random

import spacy
from tqdm import tqdm

import os, sys, inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 
from data_handling.data_tokenization import get_dataset_for_category

In [21]:
def get_train_test_dataset_for_category(category, num_examples=255, subpart_size=512, subpart_overlap=26):
    data_source = "../cuad_json/CUADv1.json"
    num_examples = num_examples # 510 is max
    subpart_size = subpart_size
    subpart_overlap = subpart_overlap
    data_destination = f"datasets/binary_dataset_{subpart_size}_{subpart_overlap}_{num_examples}.json"
    vocab_destination = f"vocabs/vocab_{num_examples}.json"
    category = category

    # Save everything the Spacy tokenizer gives us
    tokenize = spacy.load("en_core_web_sm")

    train_dataset, test_dataset, tokenizer, vocab_to_idx = get_dataset_for_category(category, data_source, data_destination, vocab_destination, num_examples, subpart_size, subpart_overlap, tokenize)
    
    return train_dataset, test_dataset, tokenizer, vocab_to_idx

category = "termination_for_convenience" # One of the categories which FastText had the highest soft doc accuracies with
num_docs = 40
subpart_size = 512 # 512 tokens per chunk
subpart_overlap = 26 # 26 tokens overlaps between each chunk

train_dataset, test_dataset, tokenizer, vocab_to_idx = get_train_test_dataset_for_category(category, num_docs, subpart_size, subpart_overlap)

Creating vocabulary: 100%|██████████| 40/40 [01:07<00:00,  1.68s/it]
Creating Token to Index Mapping:   4%|▍         | 15443/389616 [00:00<00:00, 2053080.50it/s]
Creating subparts and labels:   5%|▌         | 2/40 [00:00<00:02, 15.07it/s]

No existing file with same configuration, creating new file....


Creating subparts and labels: 100%|██████████| 40/40 [00:02<00:00, 14.64it/s]


Ratio: 0.4
Pos train: 18
Neg train: 22
Batch size: 13


Ratio: The ratio between negative and positive docs. This ratio is what is used for weighing the nr of positive and negative chunks in the training batches
Just ignore the pos train and neg train
Batch size: Nr of chunks per batch. There are *num_docs* batches generated with *batch_size* nr of chunks per batch. Resulting in a total number of chunks: *num_docs* * *batch_size*, which in this case is: 40 * 13 = 520 training chunks.

In [22]:
print("Nr of batches in the training dataset:", len(train_dataset))
print("Batchlen of batches in the training dataset:", len(train_dataset[0]))
print("Nr of tokens in a chunk:", len(train_dataset[0][0]["subpart"]))
print("Nr of labels per chunk:", train_dataset[0][3]["label"])

Nr of batches in the training dataset: 40
Batchlen of batches in the training dataset: 13
Nr of tokens in a chunk: 512
Nr of labels per chunk: 1


In [23]:
# Not sure if shuffling of the data is important for the Tsetlin Machine, but here it is atleast
for i in range(len(train_dataset)):
    random.shuffle(train_dataset[i])

# Structure train_dataset

So the structure of the train_dataset is:

`len(train_dataset) = nr of batches`

`len(train_dataset[0]) = nr of chunks in batch (batch_size)`

`train_dataset[0][0] = dict("subpart": list(tokens) , "label": int)`


For the test_dataset it is a little different because it is document based.

In [24]:
for idx, i in enumerate(test_dataset):
    print("'Batch size' in doc:", idx + 1, "is:", len(i["labels"]))

'Batch size' in doc: 1 is: 7
'Batch size' in doc: 2 is: 9
'Batch size' in doc: 3 is: 28
'Batch size' in doc: 4 is: 9
'Batch size' in doc: 5 is: 32
'Batch size' in doc: 6 is: 62
'Batch size' in doc: 7 is: 51
'Batch size' in doc: 8 is: 20
'Batch size' in doc: 9 is: 13
'Batch size' in doc: 10 is: 8
'Batch size' in doc: 11 is: 23
'Batch size' in doc: 12 is: 36
'Batch size' in doc: 13 is: 12


As we can see here, the structure is somewhat different. Instead of containing a list of dict("subpart": list(token), "label": int) it is rather a dict of lists.

test_dataset[0] = dict("labels": list(label), "subparts": list(list(token)))

Please ask if something is unclear!

Below I've just pasted in the method I'm using for getting the data ready for the Tsetlin Machine in regards of bitwise transformation and so on.

In [25]:
id_to_word = {value: key for key, value in vocab_to_idx.items()}

In [26]:
train_data = []
for document in train_dataset:
    for chunk in document:
        train_data.append((chunk["label"], chunk["subpart"]))

print(len(train_data))
pos = 0
neg = 0
for i in train_data:
    if i[0] == 1:
        pos += 1
    else:
        neg += 1
print(pos)
print(neg)

520
240
280


In [27]:
vocabulary = {}

MAX_NGRAM = 2

for i in train_data:
    terms = []
    for word_id in i[1]:
        terms.append(id_to_word[word_id])

    for N in range(1, MAX_NGRAM + 1):
        grams = [terms[j:j + N] for j in range(len(terms) - N + 1)]
        for gram in grams:
            phrase = " ".join(gram)

            if phrase in vocabulary:
                vocabulary[phrase] += 1
            else:
                vocabulary[phrase] = 1
                    
print(len(vocabulary))

51565


In [28]:
phrase_bit_nr = {}
bit_nr_phrase = {}
bit_nr = 0
for phrase in vocabulary.keys():

    phrase_bit_nr[phrase] = bit_nr
    bit_nr_phrase[bit_nr] = phrase
    bit_nr += 1
print(len(phrase_bit_nr))

51565


In [29]:
#######################################################################################
################################ TRAINING #############################################
#######################################################################################

X_train = np.zeros((len(train_data), len(phrase_bit_nr)), dtype=np.uint32)
Y_train = np.zeros(len(train_data), dtype=np.uint32)

for i, item in tqdm(enumerate(train_data), total=len(train_data), desc="Creating bit representation training data"):
    terms = []
    for word_id in item[1]:
        terms.append(id_to_word[word_id])
    
    for N in range(1, MAX_NGRAM + 1):
        grams = [terms[j:j + N] for j in range(len(terms) - N + 1)]
        for gram in grams:
            phrase = " ".join(gram)
            if phrase in phrase_bit_nr:
                X_train[i, phrase_bit_nr[phrase]] += 1

    Y_train[i] = item[0]

Creating bit representation training data: 100%|██████████| 520/520 [00:01<00:00, 300.19it/s]


In [30]:
#######################################################################################
################################ TESTING ##############################################
#######################################################################################


print("Creating bit representations for the test documents...")

test_chunks = []
test_labels = []

for _, document in tqdm(enumerate(test_dataset), total=len(test_dataset), desc="Creating bit representation test data"):
    terms = []
    X_test = np.zeros((len(document["labels"]), len(phrase_bit_nr)), dtype=np.uint32)
    for i in range(len(document["labels"])):
        for word_id in document["subparts"][i]:
            terms.append(id_to_word[word_id])
            
        for N in range(1, MAX_NGRAM + 1):
            grams = [terms[j:j + N] for j in range(len(terms) - N + 1)]
            for gram in grams:
                phrase = " ".join(gram)
                if phrase in phrase_bit_nr:
                    X_test[i, phrase_bit_nr[phrase]] += 1

    Y_test = np.asarray(document["labels"])
    test_chunks.append(X_test)
    test_labels.append(Y_test)
        
print("Finished creating bit representations for the test documents..")

Creating bit representation test data:  15%|█▌        | 2/13 [00:00<00:01, 10.92it/s]

Creating bit representations for the test documents...


Creating bit representation test data: 100%|██████████| 13/13 [00:18<00:00,  1.39s/it]

Finished creating bit representations for the test documents..





In [31]:
print("Selecting features...")

FEATURES = 20_000

SKB = SelectKBest(chi2, k=FEATURES)
SKB.fit(X_train, Y_train)

selected_features = SKB.get_support(indices=True)
FEATURES = len(selected_features)
X_train = SKB.transform(X_train)

for doc in test_chunks:
    doc = SKB.transform(doc)

Selecting features...
