In [1]:
# Imports: dataset construction, metrics, fastText training, and evaluation utilities
import pandas as pd
import numpy as np

from data_handling.fasttext_data_tokenization import get_dataset_for_category_fasttext
from metrics.metrics_fasttext import calculate_metrics_ft, calculate_soft_document_accuracy_ft, get_metrics_ft, calculate_chunk_and_doc_accuracy, get_preds_and_labels
from data_handling.data import DocumentData, BinaryCUADDataset

import matplotlib.pyplot as plt
import pandas as pd
import torch

import spacy

from tqdm import tqdm

import fasttext
import os
import csv
import itertools
import math
import random
import json

# Models

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

ModuleNotFoundError: No module named 'spacy'

In [None]:
def run_with_categories(categories):
    """Call ``run`` once for every entry of ``categories``, in iteration order.

    Args:
        categories: iterable of category names; each one is handed to
            ``run`` unchanged.

    Any exception raised by ``run`` propagates and stops the remaining
    categories from being processed.
    """
    for category_name in categories:
        run(category_name)

def _write_fasttext_training_file(train_dataset, path):
    """Write every training chunk to ``path`` in fastText's supervised format.

    Each chunk becomes one line: the chunk text followed by a
    ``__label__<label>`` tag.

    Args:
        train_dataset: iterable of documents, each an iterable of chunk
            mappings providing the "subpart" and "label" entries.
        path: destination file; it is overwritten if it already exists.

    Returns:
        The number of examples (lines) written.
    """
    num_lines = 0
    with open(path, "w", encoding="utf-8") as train_file:
        for doc in train_dataset:
            for chunk in doc:
                # fastText reads one example per line, so an embedded line
                # break would cut the text off from its label.
                text = str(chunk["subpart"]).replace("\r", " ").replace("\n", " ")
                label = "__label__" + str(chunk["label"])
                train_file.write(f"{text} {label}\n")
                num_lines += 1
    return num_lines


def run(category):
    """Train, evaluate and persist a fastText classifier for one category.

    Steps:
      1. Build the train/test datasets for ``category`` from
         ``data/CUADv1.json`` via ``get_dataset_for_category_fasttext``.
      2. Dump the training chunks to ``train.txt`` and train a supervised
         fastText model on that file.
      3. Compute chunk, document and soft document accuracy plus the
         ``get_metrics_ft`` summary on the test dataset.
      4. Save the results as JSON under ``./results/fasttext`` and the model
         under ``./models/fasttext``, then print the results.

    Args:
        category: category name; forwarded to the dataset builder and used
            in the names of the output files.

    Returns:
        None. The outputs are the files written and the printed report.
    """
    data_source = "data/CUADv1.json"
    num_examples = 255  # 510 is max
    subpart_size = 512
    subpart_overlap = 26
    data_destination = f"data/binary_dataset_fasttext_{subpart_size}_{subpart_overlap}_{num_examples}.json"
    vocab_destination = f"data/vocab_fasttext_{num_examples}.json"
    train_file = "train.txt"
    results_dir = "./results/fasttext"
    models_dir = "./models/fasttext"
    num_epochs = 1_000
    tokenize = spacy.load("en_core_web_sm")

    # The tokenizer and vocabulary returned by the builder are not needed here.
    train_dataset, test_dataset, _tokenizer, _vocab_to_idx = get_dataset_for_category_fasttext(
        category,
        data_source,
        data_destination,
        vocab_destination,
        num_examples,
        subpart_size,
        subpart_overlap,
        tokenize,
    )

    # Written directly instead of growing a DataFrame row by row:
    # DataFrame.append is quadratic and no longer exists in pandas >= 2.0.
    _write_fasttext_training_file(train_dataset, train_file)

    # Create the output directories up front so a missing folder is reported
    # before the long training run rather than after it.
    os.makedirs(results_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)

    # Training the fastText classifier
    print("Training for category:", category)
    model = fasttext.train_supervised(train_file, wordNgrams=2, epoch=num_epochs, lr=0.9)

    preds, labels = get_preds_and_labels(test_dataset, model)

    chunk_accuracy, document_accuracy = calculate_chunk_and_doc_accuracy(preds, labels)
    soft_document_accuracy = calculate_soft_document_accuracy_ft(preds, labels)
    metrics = get_metrics_ft(calculate_metrics_ft(preds, labels))

    # Key order matches the layout of previously written result files.
    results = {
        "soft_doc_acc": soft_document_accuracy,
        "metrics": metrics,
        "chunk_acc": chunk_accuracy,
        "doc_acc": document_accuracy,
    }

    with open(f"{results_dir}/{category}_{num_epochs}.json", "w") as fp:
        json.dump(results, fp)

    model.save_model(f"{models_dir}/{category}_{num_epochs}.bin")

    print("Chunk accuracy:", chunk_accuracy)
    print("Document accuracy:", document_accuracy)
    print("Soft document accuracy:", soft_document_accuracy)
    print("Metrics:", metrics)
    print()
    print("---------------------------------------------------")
    print()

In [None]:
# Category names handed to run_with_categories, declared as two batches that
# are merged into one list before the run.
categories = [
    "expiration_date",
    "anti-assignment",
    "cap_on_liability",
    "license_grant",
    "effective_date",
    "audit_rights",
    "termination_for_convenience",
]
categories_2 = [
    "exclusivity",
    "renewal_term",
    "insurance",
    "revenueprofit_sharing",
    "volume_restriction",
]

# Rebind `categories` to a new list holding the first batch, then the second.
categories = [*categories, *categories_2]

run_with_categories(categories)