# Task - Generate Missing Clauses from Legal Contracts

Import the necessary libraries

In [None]:
import os
import numpy as np
import pandas as pd
from docx import Document
import win32com.client as win32
import PyPDF2
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from gensim.models import Doc2Vec
from torch import nn
from transformers import LongformerModel
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models.doc2vec import TaggedDocument

In [None]:
# Directory scanned for the source .docx contracts.
input_path = "SampleDocs"
# Directory the converted PDFs are written to and later read back from.
output_path = "PDFs"

In [None]:
def save_to_pdf(docx_path, pdf_path):
    """Convert a .docx file to PDF by driving Microsoft Word over COM.

    The document is first re-saved through python-docx to a temporary file
    in the current working directory, which Word then opens and exports.

    Args:
        docx_path: Path of the .docx file to convert.
        pdf_path: Destination path of the PDF (made absolute before being
            handed to Word).

    Raises:
        Whatever python-docx or the Word COM automation raises; the temporary
        file is removed and Word is shut down before the error propagates.
    """
    temp_doc_path = "temp.doc"
    # NOTE(review): python-docx writes .docx content regardless of the ".doc"
    # extension used here; the name is kept unchanged from the original code.
    Document(docx_path).save(temp_doc_path)
    try:
        word = win32.Dispatch("Word.Application")
        try:
            word.Visible = False
            # A separate name is used for the COM document so a failed Open()
            # can never lead to Close() being called on the python-docx object.
            word_doc = word.Documents.Open(os.path.abspath(temp_doc_path))
            try:
                word_doc.SaveAs(os.path.abspath(pdf_path), FileFormat=17)  # FileFormat 17 represents PDF
            finally:
                word_doc.Close()
        finally:
            # Always shut Word down, even when opening or exporting failed.
            word.Quit()
    finally:
        # Clean up the temporary copy on success and on failure alike.
        os.remove(temp_doc_path)

In [None]:
def read_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string made of each page's extracted text, with no
        separator inserted between pages.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open; joining once
        # avoids rebuilding the string for every page.
        return "".join(page.extract_text() for page in pdf_reader.pages)

In [None]:
# Convert every .docx in `input_path` to a PDF of the same base name in
# `output_path`.
# The output directory is created up front because saving into a folder
# that does not exist fails.
os.makedirs(output_path, exist_ok=True)
for file in os.listdir(input_path):
    if file.endswith('.docx'):
        # Swap the extension: "contract.docx" -> "contract.pdf".
        final = f'{os.path.splitext(file)[0]}.pdf'
        save_to_pdf(docx_path=f'{input_path}/{file}', pdf_path=f'{output_path}/{final}')

In [None]:
# Empty table filled further down: one row per contract PDF, holding its
# agreement-type label ('Heading') and its extracted text ('Content').
df = pd.DataFrame(columns=['Heading', 'Content'])

In [None]:
# Agreement-type labels. They are looked up by position when the dataset
# rows are built, so the order of the entries (including the repeated
# "Shareholder's Agreement") is significant.
headings = [
    "Employment Agreement",
    "Investment Commitment Agreement",
    "Consulting Agreement",
    "Contract for the sale of goods",
    "Joint Venture Agreement",
    "Shareholder's Agreement",
    "Shareholder's Agreement",
    "Founders Agreement",
    "Limited Liability Operating Agreement",
    "Offer Letter Agreement",
    "Collaboration Agreement",
    "Rental Agreement",
    "Sale Agreement",
    "Agreement",
]

In [None]:
# Fill `df` with one row per PDF: the heading at the same position in
# `headings`, plus the text extracted from the file.
# Only the PDFs are numbered, so a stray non-PDF entry in the folder can
# neither shift the heading lookup nor leave gaps in the row labels.
# NOTE(review): the pairing still relies on the order os.listdir() returns
# and on `headings` having at least as many entries as there are PDFs.
pdf_files = [name for name in os.listdir(output_path) if name.endswith('.pdf')]
for index, file in enumerate(pdf_files):
    content = read_text_from_pdf(f'{output_path}/{file}')
    df.loc[index] = [headings[index], content]

In [None]:
# Persist the assembled dataset to 'dataset.csv' in the working directory.
df.to_csv('dataset.csv')

In [None]:
# Contract texts only; this column is the corpus used by the cells below.
X = df['Content']

In [None]:
# Regular expressions that mark the start of a new clause when they match
# at the beginning of a line.
clause_delimiters = [
    r'\(\d+\)',        # parenthesised number, e.g. "(3)"
    r'\d+\.\d+\.\d+',  # three dot-separated numbers, e.g. "2.1.4"
]

In [None]:
def get_clauses(documents, delimiters=None):
    """Split documents into clauses at lines that begin with a clause marker.

    Each document is processed line by line. A line whose start matches one
    of the delimiter patterns opens a new clause; every other line is
    appended (space-separated) to the clause currently being built. Text
    that precedes the first marker of a document forms a clause of its own.

    Args:
        documents: Iterable of document strings.
        delimiters: Optional iterable of regular-expression strings that
            identify the start of a clause. Defaults to the module-level
            `clause_delimiters`.

    Returns:
        A flat list of clause strings, in document order. Clauses never
        span two documents, and empty clauses are omitted.
    """
    if delimiters is None:
        delimiters = clause_delimiters
    # Compile once instead of re-resolving every pattern for every line.
    patterns = [re.compile(delimiter) for delimiter in delimiters]

    clauses = []
    for row in documents:
        # Start every document with a fresh buffer so its first clause
        # cannot inherit the trailing clause of the previous document.
        current_lines = []
        for line in row.split('\n'):
            line = line.strip()
            if current_lines and any(pattern.match(line) for pattern in patterns):
                clause = ' '.join(current_lines).strip()
                if clause:
                    clauses.append(clause)
                current_lines = []
            current_lines.append(line)
        clause = ' '.join(current_lines).strip()
        if clause:
            clauses.append(clause)
    return clauses

In [None]:
# Length of the longest entry in X (0 when X is empty).
# NOTE(review): at this point the entries look like raw strings, so this is
# a character count rather than a word count despite the name - confirm.
num_words = max((len(text) for text in X), default=0)

In [None]:
# Replace every contract text with the list of its sentences.
# apply() builds a new Series instead of writing into X while iterating
# over it, and it does not depend on the row labels being exactly 0..n-1
# (the previous loop used a positional counter as a .loc label).
X = X.apply(sent_tokenize)

In [None]:
# Wrap each tokenised document in a TaggedDocument, keyed and tagged by a
# 1-based identifier: "DOC_1", "DOC_2", ...
tagged = {
    f"DOC_{position}": TaggedDocument(words=tokens, tags=[f"DOC_{position}"])
    for position, tokens in enumerate(X, start=1)
}

In [None]:
# Training corpus: the tagged documents in insertion order.
docs = list(tagged.values())

In [None]:
# Size of the vectors learned for each document.
dim = 200
# Train Doc2Vec on the tagged documents. min_count=1 is passed so that
# tokens occurring only once are presumably kept - confirm against the
# gensim documentation.
doc2vec = Doc2Vec(documents=docs, vector_size=dim, window=10, min_count=1)

In [None]:
# Take the `dv` attribute of the trained model (presumably the learned
# document vectors) and save it under the name 'embeddings'.
embeddings = doc2vec.dv
embeddings.save('embeddings')

In [None]:
# These names are used below but were never imported, so the cell failed
# with NameError before doing anything.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 language model and its tokenizer.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer_ = GPT2Tokenizer.from_pretrained(model_name)

# Wrap the Doc2Vec vectors in a frozen (non-trainable) embedding layer.
emb = embeddings.vectors
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(emb), freeze=True)

# NOTE(review): this only attaches the layer under a new attribute name.
# GPT-2's forward pass does not appear to read `model.embeddings`, so
# generation is probably unaffected - confirm.
# NOTE(review): the layer has one row per document and `dim` columns, which
# presumably matches neither GPT-2's vocabulary size nor its hidden size,
# so it could not replace the token embeddings as-is - confirm the intent.
model.embeddings = embedding_layer