In [None]:
import json
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random
import os
from sklearn.model_selection import train_test_split
 
# Start from an empty English pipeline and make sure it carries an NER
# component: reuse the existing "ner" pipe if there is one, otherwise append a
# new one at the end of the pipeline.
nlp = spacy.blank("en")
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)
 
# Read the annotated examples: the whole file is read as UTF-8 text and then
# parsed as a single JSON document.
with open("/content/train_data.json", "r", encoding="utf-8") as file:
    training_data = json.loads(file.read())
 
def clean_overlapping_entities(entities):
    """Drop entity spans that overlap an earlier-starting span.

    Spans are visited in ascending order of start offset. A span is kept only
    if it starts at or after the end of the last kept span. Spans that share a
    start offset are visited in their input order, so the first one listed wins.

    Args:
        entities: Iterable of ``(start, end, label)`` triples (tuples or
            lists). It is not modified.

    Returns:
        A new list of ``(start, end, label)`` tuples ordered by start offset,
        in which every span starts at or after the end of the previous one.

    Raises:
        ValueError: If an item does not unpack into exactly three values.
    """
    cleaned_entities = []
    last_end = -1

    # sorted() builds a new list, so the caller's annotations keep their
    # original order (an in-place .sort() would silently reorder them).
    for start, end, label in sorted(entities, key=lambda span: span[0]):
        if start < last_end:
            # Starts inside the previously kept span: discard it.
            continue
        cleaned_entities.append((start, end, label))
        last_end = end

    return cleaned_entities
 
 
# Build (text, {"entities": [...]}) pairs from the loaded JSON, with overlapping
# spans removed, skipping entries that do not have the expected shape.
all_data = []
for entry in training_data:
    try:
        text, annotations = entry
        entities = annotations["entities"]
        cleaned_entities = clean_overlapping_entities(entities)
        all_data.append((text, {"entities": cleaned_entities}))
    except (ValueError, KeyError, TypeError) as e:
        # TypeError covers entries that cannot be unpacked or indexed at all
        # (e.g. None, a number, or annotations stored as a list). Without it a
        # single such entry aborts the whole loop instead of being reported.
        print(f"Data entry is incorrectly formatted: {entry}, Error: {e}")


# Hold out 15% of the valid entries for evaluation; the fixed random_state
# keeps the split identical between runs.
train_data, test_data = train_test_split(all_data, test_size=0.15, random_state=42)


print(f"Total valid training entries: {len(train_data)}")
print(f"Total valid testing entries: {len(test_data)}")

In [None]:
import spacy
from pypdf import PdfReader
from docx import Document
 
# Load the trained NER model from disk. This rebinds the module-level `nlp`,
# replacing the blank pipeline created earlier in the notebook, so the
# functions below that call `nlp(...)` run the saved model.
model_output_path = "/content/drive/MyDrive/saved ner"
nlp = spacy.load(model_output_path)
 
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The file is opened in binary mode and handed to ``PdfReader``. Each page's
    extracted text is followed by a newline, in page order. A document with no
    pages yields an empty string.
    """
    with open(pdf_path, "rb") as f:
        pages = PdfReader(f).pages
        # Joined while the file is still open, since pages are read from it.
        return "".join(page.extract_text() + "\n" for page in pages)
 
def extract_text_from_docx(docx_path):
    """Return the paragraph text of the DOCX file at ``docx_path``.

    The text of each item in ``Document(docx_path).paragraphs`` is emitted in
    order, each followed by a newline. No paragraphs yields an empty string.
    """
    paragraphs = Document(docx_path).paragraphs
    return "".join(paragraph.text + "\n" for paragraph in paragraphs)
 
def extract_entities_from_text(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and list its entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in the
        processed document's ``ents``, in the order the pipeline reports them.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found
 
file_path = "/content/SHOAIB.pdf"

# Choose the extractor from the file extension. The comparison is made on a
# lower-cased copy so that names such as "CV.PDF" or "cv.Docx" are accepted
# too; file_path itself is passed to the extractors unchanged.
normalized_path = file_path.lower()
if normalized_path.endswith('.pdf'):
    extracted_text = extract_text_from_pdf(file_path)
elif normalized_path.endswith('.docx'):
    extracted_text = extract_text_from_docx(file_path)
else:
    raise ValueError("Unsupported file type. Please provide a PDF or DOCX file.")

# Debugging aid: uncomment to inspect the raw text passed to the NER model.
#print("Extracted Text:")
#print(extracted_text)

# Run the loaded model over the extracted text and list each
# (entity text, label) pair on its own line.
entities = extract_entities_from_text(extracted_text)
print("Extracted Entities:")
for entity in entities:
    print(entity)

In [None]:
import spacy
from sklearn.metrics import precision_recall_fscore_support
 
# Load the trained model
model_output_path = "/content/drive/MyDrive/saved ner"  # Adjust to your model path
nlp = spacy.load(model_output_path)

# Placeholder label meaning "no entity on this span" for one side of a pair.
NO_ENTITY = "O"

# Parallel label lists: y_true[i] and y_pred[i] always describe the SAME
# character span. sklearn compares the two lists element by element, so simply
# concatenating the gold labels and the predicted labels separately would pair
# up unrelated entities (and usually produce lists of different lengths).
y_true = []
y_pred = []


for text, annotations in test_data:
    doc = nlp(text)

    # Get predicted entities
    predicted_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    # Prepare true entities
    true_entities = [(ent[0], ent[1], ent[2]) for ent in annotations['entities']]

    # Align gold and predicted entities on their exact (start, end) span.
    # A span present on only one side gets NO_ENTITY on the other side, which
    # turns it into a missed entity (gold only) or a spurious one (predicted only).
    true_by_span = {(start, end): label for start, end, label in true_entities}
    pred_by_span = {(start, end): label for start, end, label in predicted_entities}
    for span in sorted(true_by_span.keys() | pred_by_span.keys()):
        y_true.append(true_by_span.get(span, NO_ENTITY))
        y_pred.append(pred_by_span.get(span, NO_ENTITY))

    print(f"Text: {text}")
    print(f"Predicted Entities: {predicted_entities}")
    print(f"True Entities: {true_entities}\n")

# Count real entities only; the placeholder rows are alignment padding.
true_count = sum(label != NO_ENTITY for label in y_true)
pred_count = sum(label != NO_ENTITY for label in y_pred)
print(f"true labels: {true_count}")
print(f"predicted labels: {pred_count}")

if pred_count == 0:
    print("No predicted labels were found. Check the model output.")

# Score the real entity labels only: passing `labels` without NO_ENTITY keeps
# the placeholder class out of the averaged precision/recall/F1.
entity_labels = sorted({label for label in y_true + y_pred if label != NO_ENTITY})
if entity_labels:
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=entity_labels, average='weighted', zero_division=0
    )
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
else:
    print("No entities in the test data or in the predictions; metrics are undefined.")