# Step 2: Dataset Generation

In [1]:
import pandas as pd
from faker import Faker

In [2]:
classes = {
    0: "Services Provided", 
    1: "Payment", 
    2: "Term", 
    3: "Confidentiality", 
    4: "Termination", 
    5: "Governing Law", 
    6: "Signatures"
}

In [3]:
fake: Faker = Faker()

# Creating Contract Text using Faker 
def generate_contract_text() -> str:
    # generate random details
    service_provider_name = fake.company()
    client_name = fake.company()
    amount = fake.random_number(digits=5)
    start_date = fake.date_this_year()
    end_date = fake.date_this_year()
    state = fake.state()
    notice_days = fake.random_int(min=30, max=90)

    # Contract text template
    datasets =[
        [f"{service_provider_name} agrees to provide the following services to {client_name}. services are service1 service2, service3.", 0],
        [f"{client_name} agrees to pay {service_provider_name} the amount of ${amount} for the services described above. Payment shall be made within {notice_days} days of receiving an invoice from {service_provider_name}.", 1],
        [f"This contract will commence on {start_date} and will continue until {end_date} unless terminated earlier in accordance with the Termination clause.", 2],
        [f"Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the term of this contract. This obligation will continue beyond the termination of this contract.", 3],
        [f"Either party may terminate this contract with {notice_days} days written notice to the other party. In the event of termination, {service_provider_name} will be compensated for all services performed up to the date of termination.", 4],
        [f"This contract shall be governed by and construed in accordance with the laws of the State of {state}.", 5],
        [f"{service_provider_name}", 6],
        [f"{client_name}", 6]
    ]

    return pd.DataFrame(datasets)

In [4]:
df = generate_contract_text()

In [6]:
df.head()

Unnamed: 0,0,1
0,Parker-Clarke agrees to provide the following ...,0
1,Williams-Lindsey agrees to pay Parker-Clarke t...,1
2,This contract will commence on 2024-04-01 and ...,2
3,Both parties agree to maintain the confidentia...,3
4,Either party may terminate this contract with ...,4


In [15]:
df1 = generate_contract_text()

In [30]:
all_df = []
for i in range(10):
    all_df.append(generate_contract_text())
df_concat = pd.concat(all_df)
df_concat.columns = ["features", "label"]
df_concat.to_csv("test.csv", index=False)

# Step 3: Fine Tune The Bert Model to classify Clauses

In [17]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Read the dataset
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,features,label
0,Lozano-Ellis agrees to provide the following s...,0
1,"Day, Morrison and Vega agrees to pay Lozano-El...",1
2,This contract will commence on 2024-06-08 and ...,2
3,Both parties agree to maintain the confidentia...,3
4,Either party may terminate this contract with ...,4


In [2]:
df.shape

(400, 2)

In [3]:
# Train and Test data split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df.label)
# Check the dataset
train_df.head()

Unnamed: 0,features,label
331,Both parties agree to maintain the confidentia...,3
212,Either party may terminate this contract with ...,4
301,This contract shall be governed by and constru...,5
235,Both parties agree to maintain the confidentia...,3
19,Both parties agree to maintain the confidentia...,3


In [4]:
train_df['label'].value_counts()

label
6    80
3    40
4    40
5    40
1    40
0    40
2    40
Name: count, dtype: int64

In [5]:
val_df['label'].value_counts()

label
6    20
2    10
4    10
3    10
1    10
5    10
0    10
Name: count, dtype: int64

### Tokenize the Text

In [6]:
from transformers import BertTokenizer

# Load Bert Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize  the input text 
def tokenize_function(texts):
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

In [7]:
train_texts = train_df['features'].values
val_texts = val_df['features'].values

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

In [8]:
train_encodings['input_ids'][0]

tensor([  101,  2119,  4243,  5993,  2000,  5441,  1996, 18777,  3012,  1997,
         2151, 16350,  2030, 18777,  2592, 21362,  2076,  1996,  2744,  1997,
         2023,  3206,  1012,  2023, 14987,  2097,  3613,  3458,  1996, 18287,
         1997,  2023,  3206,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class ContractDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key:val[idx] for key, val in self.encodings.items()}
        item['label'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_labels = train_df['label'].values
val_labels = val_df['label'].values

train_dataset = ContractDataset(train_encodings, train_labels)
val_dataset = ContractDataset(val_encodings, val_labels)

In [10]:
# Example Dataset
example = iter(train_dataset)
example_item = next(example)
example_item

{'input_ids': tensor([  101,  2119,  4243,  5993,  2000,  5441,  1996, 18777,  3012,  1997,
          2151, 16350,  2030, 18777,  2592, 21362,  2076,  1996,  2744,  1997,
          2023,  3206,  1012,  2023, 14987,  2097,  3613,  3458,  1996, 18287,
          1997,  2023,  3206,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'label': tensor(3)}

### Fine-Tune the BERT Model 

In [11]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler, AdamW

# Load the Bert model of sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

In [13]:
# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
# Training Loop
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    scheduler.step()
    avg_train_loss = total_loss/len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")
        

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1, Loss: 1.0072647355496884
Epoch 2, Loss: 0.21494141407310963
Epoch 3, Loss: 0.17988903261721134
Epoch 4, Loss: 0.17756499648094176
Epoch 5, Loss: 0.17544155176728965


In [20]:
# Validation Loop
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(output.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = correct / total
print(f"Validation Accuracy:{accuracy:.4f}")
    
    

Validation Accuracy:1.0000


In [21]:
# Save the fine-tuned model and tokenizer
model_save_path = './fine_tuned_bert'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to ./fine_tuned_bert


# Step 4: Use the Fine Tuned Bert model for clause prediction

In [1]:
# Import Dependencies 
from transformers import BertForSequenceClassification, BertTokenizer
import torch

In [2]:
# Load the fine tuned model 
model_load_path = "./fine_tuned_bert"
model = BertForSequenceClassification.from_pretrained(model_load_path)
tokenizer = BertTokenizer.from_pretrained(model_load_path)

In [3]:
# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
def predict(text):
    # Tokenize the input text
    inputs = tokenizer(text, truncation=True, max_length=128, return_tensors='pt', padding=True)

    # Move inputs to GPU if available
    inputs = {key:val.to(device) for key, val in inputs.items()}

    # Perform inference 
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1)
        return predicted_class.item()

In [15]:

# Example text for inference
example_text = "Cole LLC agrees to provide the following services to Hines, Munoz and Dennis. services are service1 service2, service3."
predicted_class = predict(example_text)
print(f"Predicted class: {predicted_class}")

Predicted class: 0


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [53]:
random_sample = df.sample(1)
print(random_sample['features'].iloc[0])
print(random_sample['label'].iloc[0])

Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the term of this contract. This obligation will continue beyond the termination of this contract.
3


In [47]:
for i in range(5):
    random_sample = df.sample(1)
    example_text_2 = random_sample['features'].iloc[0]
    example_class_2 = random_sample['label'].iloc[0]
    predicted_class = predict(example_text_2)
    print(f"Actual :{classes[example_class_2]}\nPredicted : {classes[predicted_class]}\n\n")
    

Actual :Governing Law
Predicted : Governing Law


Actual :Governing Law
Predicted : Governing Law


Actual :Termination
Predicted : Termination


Actual :Term
Predicted : Term


Actual :Signatures
Predicted : Signatures




In [63]:
predict("__________________________")

6

# Step 5: Extract Data from PDF

In [65]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path:str) -> str:
    # Open the pdf
    doc = fitz.open(pdf_path)
    text = ""

    # Iterate over each page 
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num) # Load the page
        text += page.get_text() # Extract text from the page
    
    return text

In [92]:
pdf_path = "docs/Contract_2.pdf" # Replace with your PDF file path
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)

Business Contract
1. Services Provided:
Chandler Inc agrees to provide the following services to Weaver Group. services are service1, service2, service3.
2. Payment:
Weaver Group agrees to pay Chandler Inc the amount of $56753 for the services described above. Payment shall be
made within 76 days of receiving an invoice from Chandler Inc.
3. Term:
This contract will commence on 2024-05-27 and will continue until 2024-05-16 unless terminated earlier in
accordance with the Termination clause.
4. Confidentiality:
Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the
term of this contract. This obligation will continue beyond the termination of this contract.
5. Termination:
Either party may terminate this contract with 76 days written notice to the other party. In the event of termination,
Chandler Inc will be compensated for all services performed up to the date of termination.
6. Governing Law:
This contract shall be gover

# Step 6: Classifying the main entities in each Clauses

In [81]:
import spacy

# Load spaCy's English model 
nlp = spacy.load("en_core_web_sm")


def extract_details(text):
    doc = nlp(text)

    # # Initialize dictionaries to store extracted details
    # details = {
    #     'services_provided': [],
    #     'payment': [],
    #     'term': [],
    #     'confidentiality': [],
    #     'termination': [],
    #     'governing_law': [],
    #     'parties': []
    # }

    # Iterate through entities recognized by spaCy
    # for ent in doc.ents:
    #     if ent.label_ in ('ORG', 'PERSON'):
    #         details['parties'].append(ent.text)
    #     elif ent.label_ == 'DATE':
    #         details['term'].append(ent.text)
    #     elif ent.label_ == 'MONEY':
    #         details['payment'].append(ent.text)
    #     elif 'service' in ent.text.lower():
    #         details['services_provided'].append(ent.text)
    #     elif 'confidential' in ent.text.lower() or 'proprietary' in ent.text.lower():
    #         details['confidentiality'].append(ent.text)
    #     elif 'termination' in ent.text.lower() or 'terminate' in ent.text.lower():
    #         details['termination'].append(ent.text)
    #     elif 'law' in ent.text.lower() or 'state' in ent.text.lower():
    #         details['governing_law'].append(ent.text)
    
    # return details
    entities = []
    for ent in doc.ents:
        entities.append([ent.text, ent.label_])
    return entities


In [83]:
 # Example text for extraction
example_text = """
Cole LLC agrees to provide the following services to Hines, Munoz and Dennis. Services are service1, service2, service3.
Hines, Munoz and Dennis agrees to pay Cole LLC the amount of $36777 for the services described above. Payment shall be made within 66 days of receiving an invoice from Cole LLC.
This contract will commence on 2024-03-01 and will continue until 2024-01-17 unless terminated earlier in accordance with the Termination clause.
Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the term of this contract. This obligation will continue beyond the termination of this contract.
Either party may terminate this contract with 66 days written notice to the other party. In the event of termination, Cole LLC will be compensated for all services performed up to the date of termination.
This contract shall be governed by and construed in accordance with the laws of the State of Nebraska.
Cole LLC
Hines, Munoz and Dennis
"""
for line in example_text.splitlines():
    print(line)
    extracted_details = extract_details(line)
    print(extracted_details)
    print()


[]

Cole LLC agrees to provide the following services to Hines, Munoz and Dennis. Services are service1, service2, service3.
[['Cole', 'PRODUCT'], ['Hines', 'ORG'], ['Munoz', 'GPE'], ['Dennis', 'PERSON'], ['service1', 'ORG'], ['service2', 'ORG'], ['service3', 'PRODUCT']]

Hines, Munoz and Dennis agrees to pay Cole LLC the amount of $36777 for the services described above. Payment shall be made within 66 days of receiving an invoice from Cole LLC.
[['Munoz', 'GPE'], ['Dennis', 'PERSON'], ['Cole', 'PRODUCT'], ['36777', 'MONEY'], ['66 days', 'DATE'], ['Cole', 'PRODUCT']]

This contract will commence on 2024-03-01 and will continue until 2024-01-17 unless terminated earlier in accordance with the Termination clause.
[['2024-03-01', 'DATE'], ['2024-01-17', 'DATE']]

Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the term of this contract. This obligation will continue beyond the termination of this contract.
[]

Either par

In [93]:
for line in extracted_text.splitlines():
    print(extract_details(line))

[]
[['1', 'CARDINAL']]
[['Chandler Inc', 'ORG'], ['Weaver Group', 'ORG'], ['service1', 'ORG'], ['service2', 'ORG'], ['service3', 'PRODUCT']]
[['2', 'CARDINAL']]
[['Weaver Group', 'ORG'], ['Chandler Inc', 'ORG'], ['56753', 'MONEY']]
[['76 days', 'DATE'], ['Chandler Inc.', 'ORG']]
[['3', 'CARDINAL']]
[['2024-05-27', 'DATE'], ['2024-05-16', 'DATE']]
[]
[['4', 'CARDINAL']]
[]
[]
[['5', 'CARDINAL']]
[['76 days', 'DATE']]
[['Chandler Inc', 'ORG']]
[['6', 'CARDINAL']]
[['Arizona', 'GPE']]
[['7', 'CARDINAL']]
[['Chandler Inc', 'ORG']]
[]
[['Weaver Group', 'ORG']]
[]


In [11]:
k = {"a":[2, 243, 2],"b":[43, 23, 1]}

In [12]:
def summa(a, b):
    return a, b

In [13]:
summa(**k)

([2, 243, 2], [43, 23, 1])

# 5. Train NER Model for BS Document

# Step 1: Prepare the Data

In [1]:
from faker import Faker
import pandas as pd
# import spacy
# from spacy.util import minibatch, compounding
# from spacy.training.example import Example
# import random

In [2]:
fake: Faker = Faker()

# Creating Contract Text using Faker 
def generate_spacy_dataset() -> pd.DataFrame:
    # generate random details
    service_provider_name = fake.company()
    client_name = fake.company()
    amount = fake.random_number(digits=5)
    start_date = fake.date_this_year()
    end_date = fake.date_this_year()
    state = fake.state()
    notice_days = fake.random_int(min=30, max=90)

    # Contract text template
    data = {
        0:f"{service_provider_name} agrees to provide the following services to {client_name}. services are service1 service2, service3.",
        1:f"{client_name} agrees to pay {service_provider_name} the amount of ${amount} for the services described above. Payment shall be made within {notice_days} days of receiving an invoice from {service_provider_name}.",
        2:f"This contract will commence on {start_date} and will continue until {end_date} unless terminated earlier in accordance with the Termination clause.",
        3:f"Both parties agree to maintain the confidentiality of any proprietary or confidential information disclosed during the term of this contract. This obligation will continue beyond the termination of this contract.",
        4:f"Either party may terminate this contract with {notice_days} days written notice to the other party. In the event of termination, {service_provider_name} will be compensated for all services performed up to the date of termination.",
        5:f"This contract shall be governed by and construed in accordance with the laws of the State of {state}."
        
    }
    
    datasets =[
        (data[0], [(data[0].find(service_provider_name), data[0].find(service_provider_name)+len(service_provider_name), "PARTY_A"), (data[0].find(client_name), data[0].find(client_name)+len(client_name), "PARTY_B")]),
        (data[1], [(data[1].find(client_name), data[1].find(client_name)+len(client_name), "PARTY_B"), (data[1].find(service_provider_name), data[1].find(service_provider_name)+len(service_provider_name), "PARTY_A"), (data[1].find(str(amount)), data[1].find(str(amount))+len(str(amount)), "MONEY"), (data[1].find(str(notice_days)), data[1].find(str(notice_days))+len(str(notice_days)), "NOTICE_DAYS")]),
        (data[2], [(data[2].find(str(start_date)), data[2].find(str(start_date))+len(str(start_date)), "START_DATE"), (data[2].find(str(end_date)), data[2].find(str(end_date))+len(str(end_date)), "END_DATE")]),
        (data[3], [(data[3].find("confidentiality"), data[3].find("confidentiality")+len("confidentiality"), "CONFIDENTIALITY")]),
        (data[4], [(data[4].find(str(notice_days)), data[4].find(str(notice_days))+len(str(notice_days)), "NOTICE_DAYS"), (data[4].find(service_provider_name), data[4].find(service_provider_name)+len(service_provider_name), "PARTY_A")]),
        (data[5], [(data[5].find(state), data[5].find(state)+len(state), "STATE")]),
    ]

    return pd.DataFrame(datasets)

In [3]:
all_dfs = []
for i in range(100):
    all_dfs.append(generate_spacy_dataset())

df = pd.concat(all_dfs)
df

Unnamed: 0,0,1
0,Nguyen-Medina agrees to provide the following ...,"[(0, 13, PARTY_A), (58, 67, PARTY_B)]"
1,Burch Ltd agrees to pay Nguyen-Medina the amou...,"[(0, 9, PARTY_B), (24, 37, PARTY_A), (53, 58, ..."
2,This contract will commence on 2024-05-10 and ...,"[(31, 41, START_DATE), (66, 76, END_DATE)]"
3,Both parties agree to maintain the confidentia...,"[(35, 50, CONFIDENTIALITY)]"
4,Either party may terminate this contract with ...,"[(46, 48, NOTICE_DAYS), (118, 131, PARTY_A)]"
...,...,...
1,Stephens and Sons agrees to pay Wallace-Finley...,"[(0, 17, PARTY_B), (32, 46, PARTY_A), (62, 67,..."
2,This contract will commence on 2024-03-14 and ...,"[(31, 41, START_DATE), (66, 76, END_DATE)]"
3,Both parties agree to maintain the confidentia...,"[(35, 50, CONFIDENTIALITY)]"
4,Either party may terminate this contract with ...,"[(46, 48, NOTICE_DAYS), (118, 132, PARTY_A)]"


In [4]:
df.columns = ["features", "labels"]
df.head()

Unnamed: 0,features,labels
0,Nguyen-Medina agrees to provide the following ...,"[(0, 13, PARTY_A), (58, 67, PARTY_B)]"
1,Burch Ltd agrees to pay Nguyen-Medina the amou...,"[(0, 9, PARTY_B), (24, 37, PARTY_A), (53, 58, ..."
2,This contract will commence on 2024-05-10 and ...,"[(31, 41, START_DATE), (66, 76, END_DATE)]"
3,Both parties agree to maintain the confidentia...,"[(35, 50, CONFIDENTIALITY)]"
4,Either party may terminate this contract with ...,"[(46, 48, NOTICE_DAYS), (118, 131, PARTY_A)]"


In [6]:
# Convert Data
# def convert_data(data:pd.DataFrame):
#     TRAIN_DATA = []
#     for idx, row in data.iterrows():
#         TRAIN_DATA.append((row["features"], {"entities":row["labels"]}))
#     return TRAIN_DATA

In [7]:
# # text, entities = df.iloc[0]
# for index, row in df.iterrows():
#     print(row["features"])
#     print(row["labels"])
#     break

In [8]:
# train_data = convert_data(df)
# train_data[0]

In [9]:
# # Create a blank English model
# nlp = spacy.blank("en")

In [10]:
# # Creating the NER component
# if 'ner' not in nlp.pipe_names:
#     ner = nlp.add_pipe('ner', last=True)
# else:
#     ner = nlp.get_pipe('ner')

In [11]:
# # Add custom entity labels to the NER component
# for _, annotations in train_data:
#     for entity in annotations.get("entities"):
#         ner.add_label(entity[2])

In [12]:
# # Start training
# nlp.begin_training()

In [13]:
# from spacy.training import offsets_to_biluo_tags

# def check_entity_alignment(nlp, text, entities):
#     doc = nlp.make_doc(text)
#     try:
#         tags = offsets_to_biluo_tags(doc, entities)
#         print(f"Entities align correctly for text: '{text}'")
#     except ValueError as e:
#         print(f"Error in text: '{text}'")
#         print(str(e))


In [14]:
# NUM_EPOCHS = 10

# # Training Loop
# for epoch in range(NUM_EPOCHS):
#     random.shuffle(train_data)
#     losses = {}
#     for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
#         examples = []
#         for text, annotations in batch:
#         #     check_entity_alignment(nlp, text, annotations)
#         # break  
#             doc = nlp.make_doc(text)
#             examples.append(Example.from_dict(doc, annotations))
            
#         nlp.update(examples, drop=0.5, losses=losses)
        
#     print(f"Epoch [{epoch+1}/{NUM_EPOCHS}]; Losses: {losses}")
    

In [15]:
!pip install nltk sklearn-crfsuite

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.10-cp312-cp312-win_amd64.whl.metadata (4.3 kB)
Collecting tabulate>=0.4.2 (from sklearn-crfsuite)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.10-cp312-cp312-win_amd64.whl (154 kB)
   ---------------------------------------- 0.0/154.7 kB ? eta -:--:--
   ---------------------------------------- 154.7/154.7 kB 4.7 MB/s eta 0:00:00
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: python-crfsuite, tabulate, nltk, sklearn-crfsuite
Successfully installed nltk-3.8.1 python-crfsuite-0.9.10 sklearn-crfsuite-0.5.0 tabulate-0.9.0


In [27]:
import nltk
# import sk

In [58]:
def convert_to_conll(df):
    conll_format = []
    for _, row in df.iterrows():
        text = row['features']
        entities = row['labels']
        tokens = nltk.word_tokenize(text)
        token_start_indices = []
        current_index = 0

        for token in tokens:
            current_index = text.find(token, current_index)
            token_start_indices.append(current_index)
            current_index += len(token)
        token_end_indices = [start+len(token) for start, token in zip(token_start_indices, tokens)]
        labels = ['O']*len(tokens)

        for start_char, end_char, label in entities:
            # print(start_char, end_char, label)
            for i, (start, end) in enumerate(zip(token_start_indices, token_end_indices)):
                if start_char <= start < end_char or start_char < end <= end_char:
                    labels[i] = label

        for token, label in zip(tokens, labels):
            conll_format.append((token, label))
        conll_format.append(("", "")) # Add an empty line to separate sentences
        # print("Tokens:", tokens)
        # print("Token Start Indices:", token_start_indices)
        # print("Token End Indices:", token_end_indices)
        # print("Labels:", labels)

    return conll_format
        

In [59]:
train_data_conll = convert_to_conll(df)

In [61]:
train_data_conll[:5]

[('Nguyen-Medina', 'PARTY_A'),
 ('agrees', 'O'),
 ('to', 'O'),
 ('provide', 'O'),
 ('the', 'O')]

In [62]:
# Save to file (if needed )
with open("train_data.conll", "w") as f :
    for token, label in train_data_conll:
        if token:
            f.write(f"{token} {label}\n")
        else:
            f.write("\n")

# Step 2: Create Training and Testing Data

In [72]:
# Read the data 
def read_conll(file_path):
    data = []
    sentence = []
    with open(file_path, "r") as f:
        for line in f:
            if line.strip():
                token, label = line.strip().split()
                sentence.append((token, label))
            else:
                if sentence:
                    data.append(sentence)
                    sentence = []
    if sentence:
        data.append(sentence)
    return data

In [74]:
train_sents = read_conll("train_data.conll")
train_sents

[[('Nguyen-Medina', 'PARTY_A'),
  ('agrees', 'O'),
  ('to', 'O'),
  ('provide', 'O'),
  ('the', 'O'),
  ('following', 'O'),
  ('services', 'O'),
  ('to', 'O'),
  ('Burch', 'PARTY_B'),
  ('Ltd.', 'PARTY_B'),
  ('services', 'O'),
  ('are', 'O'),
  ('service1', 'O'),
  ('service2', 'O'),
  (',', 'O'),
  ('service3', 'O'),
  ('.', 'O')],
 [('Burch', 'PARTY_B'),
  ('Ltd', 'PARTY_B'),
  ('agrees', 'O'),
  ('to', 'O'),
  ('pay', 'O'),
  ('Nguyen-Medina', 'PARTY_A'),
  ('the', 'O'),
  ('amount', 'O'),
  ('of', 'O'),
  ('$', 'O'),
  ('79876', 'MONEY'),
  ('for', 'O'),
  ('the', 'O'),
  ('services', 'O'),
  ('described', 'O'),
  ('above', 'O'),
  ('.', 'O'),
  ('Payment', 'O'),
  ('shall', 'O'),
  ('be', 'O'),
  ('made', 'O'),
  ('within', 'O'),
  ('68', 'NOTICE_DAYS'),
  ('days', 'O'),
  ('of', 'O'),
  ('receiving', 'O'),
  ('an', 'O'),
  ('invoice', 'O'),
  ('from', 'O'),
  ('Nguyen-Medina', 'O'),
  ('.', 'O')],
 [('This', 'O'),
  ('contract', 'O'),
  ('will', 'O'),
  ('commence', 'O'),
  ('on

In [None]:
def word2features(sent, i):
    word:str = sent[i][0]
    features = {
        'bias':1.0,
        'word.lower()':word.lower(),
        'word[-3:]':word[-3:],
        'word[-2:]':word[-2:],
        'word.isupper()':word.isupper(),
        'word.istitle()':word.istitle(),
        'word.isdigit()':word.isdigit()
    }
    if i>0:
        word1:str = sent[i-1][0]
        features.update({
            '-1:word.lower()':word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True


    if i < len(sent)-1:
        word1:str = sent[i][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

     return features