In [1]:
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#!pip install pyTsetlinMachineCUDA spacy pycuda unidecode
#!python3 -m spacy download en_core_web_sm

import sys
import random

import pandas as pd

sys.path.append("../pyTsetlinMachineParallel/")
from pyTsetlinMachineParallel.tm import MultiClassTsetlinMachine
from time import time

import spacy
from tqdm import tqdm
import json

from metrics import calculate_metrics, get_metrics

# Put the parent directory of this notebook's directory on the import path so that
# the data_handling package imported below can be resolved.
import os, sys, inspect
# NOTE(review): inside a notebook the current frame has no real source file, so this
# presumably resolves relative to the working directory - confirm.
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
# Inserted at position 0 so it takes precedence over the other sys.path entries
sys.path.insert(0, parent_dir) 
from data_handling.data_tokenization import get_dataset_for_category

In [2]:
def get_train_test_dataset_for_category(category, num_examples=255, subpart_size=256, subpart_overlap=26):
    """
    Fetch the train/test split for one category via get_dataset_for_category.

    category: The category the dataset is built for
    num_examples: Number of examples to use (510 is max)
    subpart_size: Forwarded to get_dataset_for_category; part of the dataset file name
    subpart_overlap: Forwarded to get_dataset_for_category; part of the dataset file name

    Returns the tuple (train_dataset, test_dataset, tokenizer, vocab_to_idx).
    """
    # The destination file names encode the parameters the data was built with
    source_path = "../data/CUADv1.json"
    dataset_path = f"datasets/binary_dataset_{subpart_size}_{subpart_overlap}_{num_examples}.json"
    vocab_path = f"vocabs/vocab_{num_examples}.json"

    # Save everything the Spacy tokenizer gives us
    spacy_pipeline = spacy.load("en_core_web_sm")

    train_split, test_split, tokenizer, vocab_to_idx = get_dataset_for_category(
        category,
        source_path,
        dataset_path,
        vocab_path,
        num_examples,
        subpart_size,
        subpart_overlap,
        spacy_pipeline,
    )

    return train_split, test_split, tokenizer, vocab_to_idx


In [3]:
def get_random_local_view(X_test_example, y_test, num, local_clause_file, tm, clause_file, featureheaderset, labels_set):
    """
    Write the "local view" of one test document: the clauses reported for each of its chunks.

    X_test_example: Feature matrix of the document, one row per chunk
    y_test: Gold labels of the chunks, indexed by chunk number
    num: Unused; kept so that existing callers keep working
    local_clause_file: Path to where we are going to store the local clauses
    tm: The Tsetlin Machine; must provide predict_and_printlocal(X, path)
    clause_file: The path to the global clauses (tab separated, with the columns
        ClauseNum, Clause, p/n and class)
    featureheaderset: Feature names, indexed by feature (column) number
    labels_set: Class names, indexed by class number

    The local clause file is created with a header line and then handed to
    tm.predict_and_printlocal, which is assumed to append one
    'Example Class Clause Cl.Val' row per reported clause and to return the
    predicted class per chunk (TODO confirm against the library). The file is
    finally rewritten tab separated with the extra columns Example_BoW,
    ClassName, ClauseText and CorrectLabel. Returns nothing.
    """

    print("Going Local")
    temp_X_test = X_test_example
    print(temp_X_test.shape)
    temp_y_test = y_test
    print(len(temp_X_test))
    print("HCUNKS")

    # Translate every chunk into the names of the features that are present in it
    temp_X_test_document = []
    for chunk in temp_X_test:
        print(chunk[:-1])
        print()
        temp_X_test_document.append(
            [featureheaderset[feature_id] for feature_id, feature in enumerate(chunk) if feature >= 1]
        )

    print("TEMP X TEST DOCUMENTS")
    for i in temp_X_test_document[:3]:
        print(i)
        print()
        print()

    if os.path.exists(local_clause_file):
        print('overwriting previous local file'+local_clause_file)
        os.remove(local_clause_file)
    # The context manager guarantees the header is flushed and the handle closed
    # before the Tsetlin Machine touches the file
    with open(local_clause_file, 'w') as fo:
        fo.write('Example Class Clause Cl.Val\n')

    res = tm.predict_and_printlocal(temp_X_test, local_clause_file)

    local_clauses = pd.read_csv(local_clause_file, sep=' ')

    # Annotate every row with the bag of words of its chunk and its class name.
    # One pass over the rows is enough; rows whose example number does not refer
    # to a chunk of this document are left untouched.
    chunk_numbers = range(len(temp_X_test_document))
    for ind, row in local_clauses.iterrows():
        example = row['Example']
        if example not in chunk_numbers:
            continue
        ts = int(example)
        print("ts:", ts)
        print("temp_X_test_document[ts]:", temp_X_test_document[ts])
        print("len(temp_X_test_document[ts])", len(temp_X_test_document[ts]))
        local_clauses.loc[ind, 'Example_BoW'] = " ".join(temp_X_test_document[ts])
        local_clauses.loc[ind, 'ClassName'] = labels_set[int(row['Class'])]

    all_clauses = pd.read_csv(clause_file, sep='\t')

    for ind, row in local_clauses.iterrows():
        classname = row['ClassName']
        clauseid = int(row['Clause'])
        matches = all_clauses[(all_clauses['ClauseNum']==clauseid) & (all_clauses['class']==classname)]['Clause'].values
        # Store a scalar, never the raw array: a clause that is missing from the
        # global file (or has an empty text) yields an empty ClauseText instead
        # of an error
        if len(matches) and isinstance(matches[0], str):
            clausetext = matches[0]
        else:
            clausetext = ''
        local_clauses.loc[ind, 'ClauseText'] = clausetext

        # Mark whether the clause belongs to the gold and/or the predicted class
        star = ''
        if row['Class'] == temp_y_test[row['Example']]:
            star += 'Gold'
        if row['Class'] == res[row['Example']]:
            star += 'Predicted'
        local_clauses.loc[ind, 'CorrectLabel'] = star

    local_clauses = local_clauses.sort_values(by=['Example', 'Class'])

    local_clauses.to_csv(local_clause_file, sep='\t', index=False)
    print('Local Clauses written to:'+local_clause_file)
    
    
def wrote_clauses(clause_file, NUM_FEATURES, NUM_CLAUSES, tm, featureheaderset, labels_set):
    """
    Write the global clauses of the Tsetlin Machine to a tab separated file.

    clause file: The path for where the file is supposed to be stored
    NUM_FEATURES: The number of features that are within the Tsetlin Machine
    NUM_CLAUSES: The number of clauses that are within the Tsetlin Machine
    tm: The Tsetlin Machine; must provide ta_action(class, clause, ta)
    featureheaderset: Feature names, indexed by feature number
    labels_set: A set of the labels for the data used, indexed by class number

    The file gets the header 'ClauseNum', 'Clause', 'p/n', 'class' and one row
    per (class, clause) pair. The 'Clause' column lists every plain feature whose
    ta_action is 1, each followed by ';'. Negated literals are not exported, so
    they are not queried either.
    Returns nothing.
    """

    print("Writing the global clauses")
    # The context manager closes the file even when a lookup on tm fails
    with open(clause_file, 'w') as fout_c:
        fout_c.write('ClauseNum\tClause\tp/n\tclass\n')
        # For each label in the label_set
        for cur_cls in range(len(labels_set)):
            # For each clause in a range of the number of clauses
            for cur_clause in range(NUM_CLAUSES):
                # If the number of the current clause is even, then it is positive, and vice versa
                clause_type = 'positive' if cur_clause % 2 == 0 else 'negative'
                # Only TA indices below NUM_FEATURES are read.
                # NOTE(review): indices f+NUM_FEATURES presumably do not exist in a
                # machine built with append_negated=False - confirm against the library.
                included = ''.join(
                    featureheaderset[f] + ';'
                    for f in range(NUM_FEATURES)
                    if tm.ta_action(int(cur_cls), cur_clause, f) == 1
                )
                fout_c.write(
                    str(cur_clause) + '\t' + included + '\t' + clause_type + '\t' + str(labels_set[cur_cls]) + '\n'
                )

    print('Global Clauses written at :'+ clause_file)


In [4]:
def run(category, nr_of_examples):
    """
    Train and evaluate a Tsetlin Machine on one category and export its clauses.

    category: The category to train on; also used in the results file name
    nr_of_examples: Number of examples forwarded to get_train_test_dataset_for_category

    The function
      1. loads the train/test split and builds a vocabulary of 1..MAX_NGRAM grams
         from the training chunks,
      2. turns every training and test chunk into a vector of n-gram counts,
      3. keeps at most FEATURES columns, selected with chi2 on the training data,
      4. trains the machine incrementally and evaluates it after every episode,
      5. writes the global clauses and the local clauses of one test document.

    Appends to files under results/ and writes files under ./clauses/ (both
    directories are created when missing). Returns nothing.
    """

    train_dataset, test_dataset, tokenizer, vocab_to_idx = get_train_test_dataset_for_category(category, nr_of_examples)

    id_to_word = {value: key for key, value in vocab_to_idx.items()}

    # Show the first test document chunk by chunk as a sanity check
    for idx, d in enumerate(test_dataset[0]["subparts"]):
        text = []
        for word_id in d:
            text.append(id_to_word[word_id])
        print("CHUNK:", idx, "LABEL:", test_dataset[0]["labels"][idx])
        print(" ".join(text))
        print()
        print()

    # Flatten the training documents into (label, token ids) pairs, one per chunk
    train_data = []
    for document in train_dataset:
        for chunk in document:
            train_data.append((chunk["label"], chunk["subpart"]))

    print("Nr of training data chunks:", len(train_data))
    pos = 0
    neg = 0
    for i in train_data:
        if i[0] == 1:
            pos += 1
        else:
            neg += 1
    print("Positive training data chunks:", pos)
    print("Negative training data chunks:", neg)

    vocabulary = {}

    MAX_NGRAM = 2

    print(f"Creating the vocabulary with a NGRAM of {MAX_NGRAM}")

    # Count every 1..MAX_NGRAM gram that occurs in the training chunks
    for i in train_data:
        terms = []
        for word_id in i[1]:
            terms.append(id_to_word[word_id])

        for N in range(1, MAX_NGRAM + 1):
            grams = [terms[j:j + N] for j in range(len(terms) - N + 1)]
            for gram in grams:
                phrase = " ".join(gram)

                if phrase in vocabulary:
                    vocabulary[phrase] += 1
                else:
                    vocabulary[phrase] = 1

    print("Total number of word in the vocabulary:", len(vocabulary))
    # An index -> phrase mapping has exactly one entry per vocabulary phrase
    print("len(vocab_idx_to_word):", len(vocabulary))

    # Assign every phrase a column number (and keep the reverse mapping)
    phrase_bit_nr = {}
    bit_nr_phrase = {}
    bit_nr = 0

    for phrase in vocabulary.keys():

        phrase_bit_nr[phrase] = bit_nr
        bit_nr_phrase[bit_nr] = phrase
        bit_nr += 1

    print(list(phrase_bit_nr.items())[:10])

    #######################################################################################
    ################################ TRAINING #############################################
    #######################################################################################

    print("Creating bit representations for the training chunks")

    # Despite the name these are n-gram counts, not single bits
    X_train = np.zeros((len(train_data), len(phrase_bit_nr)), dtype=np.uint32)
    Y_train = np.zeros(len(train_data), dtype=np.uint32)

    for i, item in tqdm(enumerate(train_data), total=len(train_data), desc="Creating bit representation training data"):
        terms = []
        for word_id in item[1]:
            terms.append(id_to_word[word_id])

        for N in range(1, MAX_NGRAM + 1):
            grams = [terms[j:j + N] for j in range(len(terms) - N + 1)]
            for gram in grams:
                phrase = " ".join(gram)
                if phrase in phrase_bit_nr:
                    X_train[i, phrase_bit_nr[phrase]] += 1

        Y_train[i] = item[0]

    print("Finished creating bit representations for the training chunks..")

    #######################################################################################
    ################################ TESTING ##############################################
    #######################################################################################

    print("Creating bit representations for the test documents...")

    # One count matrix and one label vector per test document
    test_chunks = []
    test_labels = []

    for _, document in tqdm(enumerate(test_dataset), total=len(test_dataset), desc="Creating bit representation test data"):
        X_test = np.zeros((len(document["labels"]), len(phrase_bit_nr)), dtype=np.uint32)
        for i in range(len(document["labels"])):
            terms = []
            for word_id in document["subparts"][i]:
                terms.append(id_to_word[word_id])

            for N in range(1, MAX_NGRAM + 1):
                grams = [terms[j:j + N] for j in range(len(terms) - N + 1)]
                for gram in grams:
                    phrase = " ".join(gram)
                    # Phrases that never occurred in the training data are ignored
                    if phrase in phrase_bit_nr:
                        X_test[i, phrase_bit_nr[phrase]] += 1

        Y_test = np.asarray(document["labels"])
        test_chunks.append(X_test)
        test_labels.append(Y_test)

    print("Finished creating bit representations for the test documents..")

    # Total number of counted n-grams per chunk of the first test document
    for chunk_idx, d in enumerate(test_chunks[0]):
        text_c = 0
        for bit in d:
            if bit != 0:
                text_c += bit
        print("CHUNK:", chunk_idx, "-", text_c)

    # SelectKBest cannot select more columns than the vocabulary has
    FEATURES = min(10_000, X_train.shape[1])

    print(f"Selecting {FEATURES} features...")

    SKB = SelectKBest(chi2, k=FEATURES)
    SKB.fit(X_train, Y_train)

    selected_features = SKB.get_support(indices=True)
    FEATURES = len(selected_features)
    X_train = SKB.transform(X_train)
    print(X_train.shape)

    print("Selected features:", selected_features)
    print("len(selected_features):", len(selected_features))

    # Get the features
    print("Nr of ids to words:", len(id_to_word))
    features = [bit_nr_phrase[bit] for bit in selected_features]
    print("features:", features[:20])
    print("Number of unique features:", len(set(features)))
    labels_set = [0, 1]

    # Apply the fitted selection to every test chunk. A document with a single
    # chunk keeps its extra leading axis here and is squeezed where it is used.
    tmp_X_test = []
    for idx, doc in enumerate(test_chunks):
        tmp_tmp_X_test = []
        for chunk in doc:
            tmp_tmp_X_test.append(SKB.transform(chunk.reshape(1, -1)))
        tmp_tmp_X_test = np.asarray(tmp_tmp_X_test, dtype="object")
        if tmp_tmp_X_test.shape[0] == 1:
            tmp_X_test.append(tmp_tmp_X_test)
        else:
            tmp_X_test.append(tmp_tmp_X_test.squeeze())
    X_test = np.asarray(tmp_X_test, dtype="object")

    print("SANITY CHECK FOR DIMENSIONS IN TRAINING AND TEST")
    print("TRAINING DIMENSIONS:", X_train.shape)
    print("Number of testing documents:", len(X_test))
    print("Number of chunks for the first document:", len(X_test[0]))
    print("Number of features for the chunks in the documents:", len(X_test[0][0]))

    best_hyperparameters = {"n_clauses": 0, "t": 0, "s": 0, "soft_doc_acc": 0.0}

    n_clauses = 30
    t = 320
    s = 25

    tm = MultiClassTsetlinMachine(n_clauses, t, s, append_negated=False)

    episodes = 90
    train_epochs = 10
    # Evaluate on the test documents after every eval_every-th episode
    eval_every = 1

    # The result files are opened in append mode; make sure their folder exists
    os.makedirs("results", exist_ok=True)

    for ep in range(episodes):
        start_training = time()
        tm.fit(X_train, Y_train, epochs=train_epochs, incremental=True)
        stop_training = time()

        if ep % eval_every == 0:
            start_testing = time()
            acc = []

            soft_doc_preds = []

            metrics = {"tp": 0, "fn": 0, "tn": 0, "fp": 0}
            for i, doc in enumerate(X_test):
                if len(doc.shape) > 2:
                    doc = doc.squeeze(0)
                preds = tm.predict(doc)
                tmp_metrics = calculate_metrics(preds, test_labels[i])
                for key, value in tmp_metrics.items():
                    metrics[key] += value

                corr_pred = 0
                ucorr_pred = 0

                for idx, chunk in enumerate(doc):
                    pred = preds[idx]
                    if pred == test_labels[i][idx]:
                        acc.append(1)
                    else:
                        acc.append(0)
                    if test_labels[i][idx] == 1 and pred == 1:
                        corr_pred += 1
                    elif test_labels[i][idx] == 1 and pred == 0:
                        ucorr_pred += 1

                # A document counts as found when at least as many of its
                # positive chunks are predicted positive as are missed
                pos_doc_pred = corr_pred >= ucorr_pred

                # Only documents that contain a positive chunk enter the soft accuracy
                if sum(test_labels[i]) >= 1:
                    soft_doc_preds.append(1 if pos_doc_pred else 0)

            stop_testing = time()
            # Guard against empty lists (no test chunks / no positive documents)
            accuracy = sum(acc) / len(acc) if acc else 0.0
            soft_doc_acc = sum(soft_doc_preds) / len(soft_doc_preds) if soft_doc_preds else 0.0
            string = "#%d Accuracy: %.2f%%, Soft Doc Acc: %.2f%%, Training: %.2fs Testing: %.2fs" % (ep, round(accuracy, 5) * 100, soft_doc_acc * 100, stop_training-start_training, stop_testing-start_testing)
            metrics = get_metrics(metrics)
            print(string)
            print(metrics)
            metrics_string = str(string) + json.dumps(metrics)

            with open(f"results/{n_clauses}_{t}_{s}_{category}_non_negated_test.txt", "a") as f:
                f.write(metrics_string + "\n")

            if soft_doc_acc > best_hyperparameters["soft_doc_acc"]:
                best_hyperparameters.update({"s": s, "n_clauses": n_clauses, "t": t, "soft_doc_acc": soft_doc_acc})
                print("BEST HYPERPARAMETERS:", best_hyperparameters)

    with open(f"results/BEST_HP_{n_clauses}_{t}_{s}.txt", "a") as f:
        f.write(json.dumps(best_hyperparameters) + "\n")

    os.makedirs("./clauses", exist_ok=True)
    wrote_clauses("./clauses/global_clauses.txt", len(features), n_clauses, tm, features, labels_set)

    # Explain the first test document that already is a 2-D (chunks x features)
    # matrix; when there is none, fall back to the first document without its
    # extra leading axis
    example_idx = next((i for i, doc in enumerate(X_test) if len(doc.shape) <= 2), 0)
    test_example_document = X_test[example_idx]
    test_example_labels = test_labels[example_idx]
    if len(test_example_document.shape) > 2:
        test_example_document = test_example_document.squeeze(0)
    get_random_local_view(test_example_document, test_example_labels, 1, "./clauses/local_clause_file.txt", tm, "./clauses/global_clauses.txt", features, labels_set)

In [5]:
# Categories to process; run() is called once per entry
categories = ["expiration_date"]

# Number of examples handed to run() for every category
n_examples = 256

# Train, evaluate and export the clauses for each configured category
for category in categories:
    run(category, n_examples)

Found existing file, loading....
Finished loading file
Found existing file, loading....
Finished loading file
Ratio: 0.7
Pos train: 209
Neg train: 47
Batch size: 30
Randomizing training and test data..
CHUNK: 0 LABEL: 0
Exhibit 10.1 COLOGUARD ® PROMOTION AGREEMENT BY AND BETWEEN EXACT SCIENCES CORPORATION AND PFIZER INC . August 21 , 2018 Source : EXACT SCIENCES CORP , 8 - K , 8/22/2018 TABLE OF CONTENTS Page 1 . DEFINITIONS 1 2 . GOVERNANCE 10 2.1 Joint Steering Committee 10 2.2 Joint Operations Committee 13 2.3 Joint Review Committee 14 2.4 Finance Representative 15 2.5 Alliance Managers 15 2.6 Compliance Managers 16 3 . APPOINTMENT ; PRODUCT OWNERSHIP ; MARKETING AND SALES 17 3.1 Appointment 17 3.2 Responsibility for Product 19 3.3 Annual Marketing Plan 26 3.4 Sales Promotion , Detailing Efforts and IDN Promotion 27 3.5 Pfizer Investment and Support 28 3.6 Exact Investment and Support 29 3.7 Changes in Shared M&P Expenses 30 4 . ACCOUNTING 30 4.1 Responsibility for Shared M&P Expens

Creating bit representation training data:   1%|          | 90/7680 [00:00<00:08, 896.18it/s]

Total number of word in the vocabulary: 149798
len(vocab_idx_to_word): 149798
[('.', 0), ('However', 1), (',', 2), ('any', 3), ('Party', 4), ('may', 5), ('terminate', 6), ('its', 7), ('participation', 8), ('in', 9)]
Creating bit representations for the training chunks


Creating bit representation training data: 100%|██████████| 7680/7680 [00:09<00:00, 821.97it/s]
Creating bit representation test data:   1%|▏         | 1/78 [00:00<00:11,  6.56it/s]

Finished creating bit representations for the training chunks..
Creating bit representations for the test documents...


Creating bit representation test data: 100%|██████████| 78/78 [00:04<00:00, 17.43it/s]


Finished creating bit representations for the test documents..
CHUNK: 0 - 326
CHUNK: 1 - 324
CHUNK: 2 - 423
CHUNK: 3 - 462
CHUNK: 4 - 456
CHUNK: 5 - 473
CHUNK: 6 - 479
CHUNK: 7 - 437
CHUNK: 8 - 413
CHUNK: 9 - 420
CHUNK: 10 - 431
CHUNK: 11 - 442
CHUNK: 12 - 469
CHUNK: 13 - 453
CHUNK: 14 - 421
CHUNK: 15 - 425
CHUNK: 16 - 415
CHUNK: 17 - 451
CHUNK: 18 - 423
CHUNK: 19 - 408
CHUNK: 20 - 428
CHUNK: 21 - 454
CHUNK: 22 - 444
CHUNK: 23 - 442
CHUNK: 24 - 420
CHUNK: 25 - 464
CHUNK: 26 - 463
CHUNK: 27 - 409
CHUNK: 28 - 438
CHUNK: 29 - 437
CHUNK: 30 - 446
CHUNK: 31 - 451
CHUNK: 32 - 422
CHUNK: 33 - 438
CHUNK: 34 - 421
CHUNK: 35 - 423
CHUNK: 36 - 436
CHUNK: 37 - 457
CHUNK: 38 - 458
CHUNK: 39 - 443
CHUNK: 40 - 429
CHUNK: 41 - 436
CHUNK: 42 - 425
CHUNK: 43 - 397
CHUNK: 44 - 445
CHUNK: 45 - 455
CHUNK: 46 - 432
CHUNK: 47 - 427
CHUNK: 48 - 454
CHUNK: 49 - 412
CHUNK: 50 - 428
CHUNK: 51 - 385
CHUNK: 52 - 431
CHUNK: 53 - 403
CHUNK: 54 - 423
CHUNK: 55 - 421
CHUNK: 56 - 431
CHUNK: 57 - 453
CHUNK: 58 - 449
CHU

# Steps

1. First get the vocabulary
2. Create a feature matrix with one row per chunk and one column per vocabulary entry
3. Then, for each chunk, count the occurrences of each vocabulary entry and increment the corresponding column

# TODO

1. Ensure that both the training and the test dataset are reduced to the same K best features (SelectKBest) - DONE
2. Fix the soft document accuracy for all models - DONE
3. Run all models - IN PROGRESS
4. Maybe set up the convolutional Tsetlin Machine again? Requires quite some work on the data processing - MEH
5. Write in the overleaf document - SOON
6. 