# Task - Generate Missing Clauses from Legal Contracts

Import the necessary libraries

In [20]:
import os
import numpy as np
import pandas as pd
from docx import Document
import win32com.client as win32
import PyPDF2
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import torch
from torch import nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import warnings
warnings.filterwarnings('ignore')

Define the required paths

In [2]:
input_path = '../SampleDocs'
output_path = '../PDFs'

Function to save the .docx files to .pdf

In [3]:
def save_to_pdf(docx_path, pdf_path):
    doc = Document(docx_path)
    temp_doc_path = "temp.doc"
    doc.save(temp_doc_path)
    word = win32.Dispatch("Word.Application")
    try:
        word.Visible = False
        doc = word.Documents.Open(os.path.abspath(temp_doc_path))
        doc.SaveAs(os.path.abspath(pdf_path), FileFormat=17)  # FileFormat 17 represents PDF
    finally:
        doc.Close()
        word.Quit()
    os.remove(temp_doc_path)

Function to read the text from .pdf files

In [4]:
def read_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        extracted_text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            extracted_text += page.extract_text()
    return extracted_text

Saving all the .docx files to .pdf

In [5]:
for index, file in enumerate(os.listdir(input_path)):
    if file.endswith('.docx'):
        name = file[:-5]
        name = f'{name}.pdf'
        save_to_pdf(docx_path=f'{input_path}/{file}', pdf_path=f'{output_path}/{name}')

Initializing a blank dataframe

In [6]:
df = pd.DataFrame(columns=['Heading', 'Content'])

Define the headings of the legal documents in order

In [7]:
headings = [
    "Employment Agreement", 
    "Investment Commitment Agreement", 
    "Consulting Agreement", 
    "Contract for the sale of goods", 
    "Joint Venture Agreement",
    "Shareholder's Agreement",
    "Shareholder's Agreement",
    "Founders Agreement", 
    "Limited Liability Operating Agreement",
    "Offer Letter Agreement",
    "Collaboration Agreement",
    "Rental Agreement",
    "Sale Agreement",
    "Agreement"
]

Extract text from all the files and store it in the dataframe

In [8]:
for index, file in enumerate(os.listdir(output_path)):
    if file.endswith('.pdf'):
        row = []
        content = read_text_from_pdf(f'{output_path}/{file}')
        row = [headings[index], content]
        df.loc[index] = row

Save the dataframe to a .csv file

In [9]:
df.to_csv('dataset.csv')

Load the column 'contents' to a variable

In [10]:
content = df['Content']

The clauses are defined in the form of (x) or x.y.z. where x, y, z are numbers.
Therefore, the delimiters are defined as such with the required regular expressions

In [11]:
delimiters = [
    r'\(\d+\)',     
    r'\d+\.\d+\.\d+' 
]

Function to extract the clauses from content

In [12]:
def get_clauses(content):
    clauses = []
    current_clause = ''
    for row in content:
        for line in row.split('\n'):
            line = line.strip()
            if any(re.match(delimiter, line) for delimiter in delimiters):
                if current_clause:
                    clauses.append(current_clause.strip())
                    current_clause = ''
            current_clause += line + ' '
        if current_clause:
            clauses.append(current_clause.strip())
    return clauses

In [23]:
clauses = get_clauses(content)

Doc2Vec embeddings requires tagged documents

In [27]:
tagged = {}
for index, doc in enumerate(clauses):
    key = f"DOC_{index+1}"
    tagged[key] = TaggedDocument(words=doc, tags=[f"DOC_{index+1}"])

In [28]:
docs = []
for key in tagged:
    docs.append(tagged[key])

Initializing a Doc2Vec model

In [29]:
dim = 300
doc2vec = Doc2Vec(documents=docs, vector_size=dim, window=10, min_count=1)

Saving the embeddings and creating an embedding layer

In [30]:
embeddings = doc2vec.dv
embeddings.save('embeddings')
emb = embeddings.vectors
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(emb), freeze=True)

Initializing the GPT2-XL transformers model

In [83]:
model_name = "gpt2-xl"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

Model structure before adding the embedding layer

In [82]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

Model Structure after adding the embedding layer

In [84]:
model.embeddings = embedding_layer
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
  (embeddings): Embedding(420, 300)
)

In [89]:
sample = clauses[0]
max_new_tokens = 512
sample = sample[:10]

In [72]:
input_ids = tokenizer.encode(sample, return_tensors='pt')

In [88]:
output = model.generate(input_ids, max_new_tokens=max_new_tokens, num_return_sequences=1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [78]:
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Singhania & Partners LLP     Draft for Discussion 29th October, 2020   Privileged and confidential Employment  Agreement   Attorney work product Page 1 of 21   EMPLOYMENT AGREEMENT  Singhania & Partners LLP     Draft for Discussion 29th October, 2020   Privileged and confidential Employment  Agreement   Attorney work product Page 2 of 21 INDEX  1. DEFINITIONS AND INTERPRETATION .........................................................................................  4 2. EMPLOYMENT.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................