In [None]:
import os
import random

import cv2
import pytesseract
from docx import Document
from pytesseract import Output

def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file as one space-joined string.

    Args:
        file_path: Path of the Word document, passed straight to ``Document``.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, in document order,
        joined with a single space. Empty paragraphs are kept as empty
        strings, so consecutive spaces can appear in the result.
    """
    paragraphs = Document(file_path).paragraphs
    return ' '.join(paragraph.text for paragraph in paragraphs)

def extract_text_from_image(file_path):
    """Run Tesseract OCR on an image file and return the recognised words.

    Args:
        file_path: Path of the image, loaded with ``cv2.imread``.

    Returns:
        The non-blank entries of the OCR ``text`` column joined with single
        spaces.

    Raises:
        OSError: If ``cv2.imread`` could not load the file.
    """
    image = cv2.imread(file_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than deep inside pytesseract.
    if image is None:
        raise OSError(f"Could not read image file: {file_path}")

    ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT)
    # The 'text' column holds an entry per layout element (page, block, line,
    # word); the non-word rows are empty strings and would only add stray
    # spaces to the joined result, so drop them.
    return ' '.join(word for word in ocr_data['text'] if word.strip())

def preprocess_data(data_folder, file_type):
    """Extract text from every file in a folder that has the given extension.

    Args:
        data_folder: Directory to scan (non-recursive).
        file_type: File extension to process; ``'.docx'`` or ``'.png'``.

    Returns:
        A list with one extracted text string per matching file, ordered by
        file name.

    Raises:
        ValueError: If ``file_type`` is not one of the supported extensions.
    """
    # Reject unsupported types up front: previously such a value left `text`
    # unassigned and failed with UnboundLocalError on the first matching file.
    if file_type not in ('.docx', '.png'):
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected '.docx' or '.png'"
        )

    texts = []
    # os.listdir order is arbitrary; sort so repeated runs give the same order.
    for file_name in sorted(os.listdir(data_folder)):
        if not file_name.endswith(file_type):
            continue
        file_path = os.path.join(data_folder, file_name)
        if file_type == '.docx':
            texts.append(extract_text_from_docx(file_path))
        else:
            texts.append(extract_text_from_image(file_path))
    return texts


In [None]:
import spacy
from spacy.training import Example

# Start from a blank English pipeline (tokenizer only, no trained components).
nlp = spacy.blank("en")

# spaCy v3 (required by `spacy.training.Example`) takes the registered
# component name in add_pipe and returns the created component. The v2 pattern
# of create_pipe() followed by add_pipe(component_object) raises in v3.
ner = nlp.add_pipe("ner", last=True)

# Register the entity labels the model should learn to predict.
for label in (
    "AGREEMENT_VALUE",
    "AGREEMENT_START_DATE",
    "AGREEMENT_END_DATE",
    "RENEWAL_NOTICE",
    "PARTY_ONE",
    "PARTY_TWO",
):
    ner.add_label(label)

# NOTE(review): load_training_data is not defined in the visible cells. The
# loop below assumes it returns a mutable list of (text, annotations) pairs
# where annotations is a dict accepted by Example.from_dict -- confirm.
train_data = load_training_data()

# initialize() replaces the deprecated begin_training(); it sets up the model
# weights and returns the optimizer used for the updates below.
optimizer = nlp.initialize()
for itn in range(10):
    # Shuffle in place each epoch so the update order differs between epochs.
    random.shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
    # Accumulated per-component losses for this epoch.
    print(losses)


In [None]:
def calculate_recall(true_positives, false_negatives):
    """Compute recall = TP / (TP + FN).

    Args:
        true_positives: Number of gold items that were predicted.
        false_negatives: Number of gold items that were missed.

    Returns:
        The recall as a float. When there are no gold items at all
        (``true_positives + false_negatives == 0``) recall is undefined and
        0.0 is returned instead of raising ZeroDivisionError.
    """
    total_actual = true_positives + false_negatives
    if total_actual == 0:
        return 0.0
    return true_positives / total_actual

# NOTE(review): load_test_data is not defined in the visible cells. It is
# assumed to return (text, annotations) pairs whose annotations["entities"]
# are (start_char, end_char, label) triples -- confirm against the loader.
test_data = load_test_data()

# Count gold entities the model reproduced (TP) and those it missed (FN).
true_positives = 0
false_negatives = 0
for text, annotations in test_data:
    # Convert predicted spaCy Span objects to the same (start, end, label)
    # form as the gold annotations. Comparing raw Span objects with tuples
    # never matches, which made every gold entity count as a false negative.
    predicted_spans = {
        (ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents
    }
    for entity in annotations.get("entities", []):
        # tuple() lets list-form annotations (e.g. loaded from JSON) match too.
        if tuple(entity) in predicted_spans:
            true_positives += 1
        else:
            false_negatives += 1

recall = calculate_recall(true_positives, false_negatives)
print(f"Recall: {recall}")
