# Task - Generate Missing Clauses from Legal Contracts

Import the necessary libraries

In [33]:
import os
import numpy as np
import pandas as pd
from docx import Document
import win32com.client as win32
import PyPDF2
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import torch
from torch import nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import warnings
warnings.filterwarnings('ignore')

Define the required paths

In [2]:
input_path = '../SampleDocs'
testing_path = 'Testing'
output_path = '../PDFs'

Function to save the .docx files to .pdf

In [3]:
def save_to_pdf(docx_path, pdf_path):
    doc = Document(docx_path)
    temp_doc_path = "temp.doc"
    doc.save(temp_doc_path)
    word = win32.Dispatch("Word.Application")
    try:
        word.Visible = False
        doc = word.Documents.Open(os.path.abspath(temp_doc_path))
        doc.SaveAs(os.path.abspath(pdf_path), FileFormat=17)  # FileFormat 17 represents PDF
    finally:
        doc.Close()
        word.Quit()
    os.remove(temp_doc_path)

Function to read the text from .pdf files

In [4]:
def read_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        extracted_text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            extracted_text += page.extract_text()
    return extracted_text

Saving all the .docx files to .pdf

In [5]:
for index, file in enumerate(os.listdir(input_path)):
    if file.endswith('.docx'):
        name = file[:-5]
        name = f'{name}.pdf'
        save_to_pdf(docx_path=f'{input_path}/{file}', pdf_path=f'{output_path}/{name}')

Initializing a blank dataframe

In [8]:
df = pd.DataFrame(columns=['Heading', 'Content'])
test = df

Define the headings of the legal documents in order

In [9]:
headings = [
    "Employment Agreement", 
    "Investment Commitment Agreement", 
    "Consulting Agreement", 
    "Contract for the sale of goods", 
    "Joint Venture Agreement",
    "Shareholder's Agreement",
    "Shareholder's Agreement",
    "Founders Agreement", 
    "Limited Liability Operating Agreement",
    "Offer Letter Agreement",
    "Collaboration Agreement",
    "Rental Agreement",
    "Sale Agreement",
    "Agreement"
]

test_headings = [
    "Joint Venture Agreement",
    "Shareholder's Agreement",
    "Founders Agreement",
    "Sale Agreement"
]

Extract text from all the files and store it in the dataframe

In [None]:
for index, file in enumerate(os.listdir(output_path)):
    if file.endswith('.pdf'):
        row = []
        content = read_text_from_pdf(f'{output_path}/{file}')
        row = [headings[index], content]
        df.loc[index] = row

In [None]:
for index, file in enumerate(os.listdir(f'../{testing_path}')):
    if file.endswith('.pdf'):
        row = []
        content = read_text_from_pdf(f'../{testing_path}/{file}')
        row = [headings[index], content]
        test.loc[index] = row

Save the dataframes to a .csv file

In [None]:
df

In [None]:
test

In [13]:
df = pd.read_csv('dataset.csv')
test = pd.read_csv('test.csv')

Load the column 'contents' to a variable

In [14]:
content = df['Content']
test_content = test['Content']

The clauses are defined in the form of (x) or x.y.z. where x, y, z are numbers.
Therefore, the delimiters are defined as such with the required regular expressions

In [15]:
delimiters = [
    r'\(\d+\)',     
    r'\d+\.\d+\.\d+' 
]

Function to extract the clauses from content

In [10]:
def get_clauses(content):
    clauses = []
    current_clause = ''
    for row in content:
        for line in row.split('\n'):
            line = line.strip()
            if any(re.match(delimiter, line) for delimiter in delimiters):
                if current_clause:
                    clauses.append(current_clause.strip())
                    current_clause = ''
            current_clause += line + ' '
        if current_clause:
            clauses.append(current_clause.strip())
    return clauses

In [16]:
clauses = get_clauses(content)
test_clauses = get_clauses(test_content)

Doc2Vec embeddings requires tagged documents

In [17]:
tagged = {}
for index, doc in enumerate(clauses):
    key = f"DOC_{index+1}"
    tagged[key] = TaggedDocument(words=doc, tags=[f"DOC_{index+1}"])

In [18]:
docs = []
for key in tagged:
    docs.append(tagged[key])

Initializing a Doc2Vec model

In [19]:
dim = 300
doc2vec = Doc2Vec(documents=docs, vector_size=dim, window=10, min_count=1)

Saving the embeddings and creating an embedding layer

In [20]:
embeddings = doc2vec.dv
embeddings.save('embeddings')
emb = embeddings.vectors
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(emb), freeze=True)

Initializing the GPT2-XL transformers model

In [35]:
model_name = "gpt2-xl"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

Model structure before adding the embedding layer

In [22]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

Model Structure after adding the embedding layer

In [23]:
model.embeddings = embedding_layer
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
  (embeddings): Embedding(665, 300)
)

Initialize a custom dataset class

In [24]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        input_text = self.data[idx]
        input_ids = self.tokenizer.encode(input_text, add_special_tokens=True)
        return torch.tensor(input_ids[:-1]), torch.tensor(input_ids[1:])

Create train and validation datasets

In [76]:
train_dataset = CustomDataset(clauses[:1024], tokenizer)
valid_dataset = CustomDataset(test_clauses[:1024], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1)

Set the device to GPU

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Define the Optimizer

In [27]:
optimizer = AdamW(model.parameters(), lr=1e-5)

In [77]:
model.train()
for epoch in range(5):
    for batch in train_loader:
        input_ids, labels = batch[:1024]
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    model.eval()
    with torch.no_grad():
        total_loss = 0
        total_samples = 0
        for batch in valid_loader:
            input_ids, labels = batch[:1024]
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            total_loss += loss.item() * input_ids.size(0)
            total_samples += input_ids.size(0)
        avg_loss = total_loss / total_samples
        print(f"Validation Loss: {avg_loss}")
    model.train()

Token indices sequence length is longer than the specified maximum sequence length for this model (6329 > 1024). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self

In [None]:
output_dir = "./clause-model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)