In [1]:
import numpy as np
import pandas as pd
import re
from transformers import pipeline, DebertaV2Tokenizer, DebertaV2ForQuestionAnswering
import torch

In [2]:
# ls local_data/unlabeled_contracts/2020

In [3]:
filepath = 'local_data/unlabeled_contracts/2020/000019119.txt'

In [4]:
with open(filepath, 'r', encoding="utf-8") as file:
    contract = file.read().replace('\n', '')

In [5]:
contract

'Exhibit 10.22\xa0CERTAIN CONFIDENTIAL INFORMATION CONTAINED IN THIS DOCUMENT, MARKED BY [***],HAS BEEN OMITTED BECAUSE IT IS BOTH (I) NOT MATERIAL AND (II)\xa0WOULD LIKELY CAUSECOMPETITIVE HARM IF PUBLICLY DISCLOSED.\xa0AMENDMENT NO. 1 TO THE COMMON STOCK ISSUANCE AGREEMENTThis Amendment No. 1 to the Common Stock Issuance Agreement (“Amendment”) ismade and entered into, effective as of December 17, 2019 (“Amendment EffectiveDate”), by and between Vir Biotechnology, Inc., a Delaware corporation withoffices at with an office at 499 Illinois Street, San Francisco, California94158 (“Vir”), and Alnylam Pharmaceuticals, Inc., a Delaware corporation locatedat 300 Third Street, Cambridge, Massachusetts 02142 (“Alnylam”). Each of Vir andAlnylam are referred to in this Amendment as a “Party” and together, the“Parties”.BackgroundWHEREAS, the Parties have entered into that certain Collaboration and LicenseAgreement effective as of October 16, 2017 (as amended by Letter Agreement datedNovember 13,

In [6]:
# Clean up and pre-process the text.
def pre_process_text(text):
    # Simple replacement for "\n"
    text = text.replace("\n", " ")     
    
    # Simple replacement for "\xa0"
    text = text.replace("\xa0", " ")  
    
    # Simple replacement for "\x0c"
    text = text.replace("\x0c", " ")
    
    # Get rid of multiple dots
    regex = "\ \.\ "
    subst = "."
    text = re.sub(regex, subst, text, 0)
    
    # Get rid of underscores
    regex = "_"
    subst = " "
    text = re.sub(regex, subst, text, 0)
    
    # Get rid of multiple dashes
    regex = "--+"
    subst = " "
    text = re.sub(regex, subst, text, 0)
    
    # Get rid of multiple stars
    regex = "\*+"
    subst = "*"
    text = re.sub(regex, subst, text, 0)
    
    # Get rid of multiple whitespace
    regex = "\ +"
    subst = " "
    text = re.sub(regex, subst, text, 0)
    
    #Strip leading and trailing whitespace
    text = text.strip()
    
    return text


In [7]:
def get_questions_from_csv():
    df = pd.read_csv("./data/category_descriptions.csv")
    q_dict = {}
    for i in range(df.shape[0]):
        category = df.iloc[i, 0].split("Category: ")[1]
        description = df.iloc[i, 1].split("Description: ")[1]
        q_dict[category.title()] = description
    return q_dict

In [8]:
qtype_dict = get_questions_from_csv()
labels = [l for l in qtype_dict.keys()]
questions = [q for q in qtype_dict.values()]

In [9]:
# labels, questions

In [10]:
context = pre_process_text(contract)

In [11]:
# qapipe = pipeline('question-answering', model='./models/deberta-v2-xlarge', tokenizer='./models/deberta-v2-xlarge')

In [13]:
'''
for idx, question in enumerate(questions):
    answer = qapipe(question=question, context=context)
    print(f'{labels[idx]}: {answer["answer"]}')
'''

'\nfor idx, question in enumerate(questions):\n    answer = qapipe(question=question, context=context)\n    print(f\'{labels[idx]}: {answer["answer"]}\')\n'

In [14]:
tokenizer = DebertaV2Tokenizer.from_pretrained('./models/deberta-v2-xlarge')
model = DebertaV2ForQuestionAnswering.from_pretrained('./models/deberta-v2-xlarge')

In [15]:
def get_answers(questions, context):

    answers = []

    for question in questions:
        print(question)
        inputs = tokenizer(question, context, padding='max_length', truncation='only_second', return_tensors='pt')
        input_ids = inputs['input_ids'].tolist()[0]
        outputs = model(**inputs)
        answer_start_scores = outputs.start_logits
        answer_end_scores = outputs.end_logits
        # Get the most likely beginning of answer with the argmax of the score
        answer_start = torch.argmax(answer_start_scores)
        # Get the most likely end of answer with the argmax of the score
        answer_end = torch.argmax(answer_end_scores) + 1
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
        answers.append(answer)
        print(answer)
    return answers

In [16]:
# answers = get_answers(questions, context)

The name of the contract
AMENDMENT NO. 1 TO THE COMMON STOCK ISSUANCE AGREEMENT
The two or more parties who signed the contract
