In [None]:
#Extracting text from around 800 resumes (PDF and DOCX files)
import os
from pypdf import PdfReader
from docx import Document


# Folder holding the source resumes and the single text file that receives
# the text extracted from all of them.
directory = 'all resumes'

output_file_path = 'extracted_resumes.txt'

with open(output_file_path, 'w', encoding='utf-8') as output_file:

    # os.listdir returns entries in arbitrary order; sorting keeps the output
    # file identical from one run to the next.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Compare extensions case-insensitively so "CV.PDF" / "Resume.DOCX"
        # are not silently skipped.
        lowered_name = filename.lower()

        if lowered_name.endswith('.pdf'):
            pdf_reader = PdfReader(file_path)

            for pdf_page in pdf_reader.pages:
                # Guard against a page yielding no text, and end every page
                # with a newline so consecutive pages (and consecutive
                # resumes) do not run into each other.
                output_file.write((pdf_page.extract_text() or '') + '\n')

        elif lowered_name.endswith('.docx'):
            doc = Document(file_path)
            for paragraph in doc.paragraphs:
                # paragraph.text carries no trailing newline; without one the
                # last word of a paragraph fuses with the first word of the next.
                output_file.write(paragraph.text + '\n')

        # Files with any other extension are ignored.

print(output_file_path)
print(f"Extracted content saved to {output_file_path}")

In [None]:
#Training Model using data that we extracted from pdfs and docx 

import json
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random
import os
from sklearn.model_selection import train_test_split
 
# Start from an empty English pipeline and obtain its "ner" component,
# adding the component at the end of the pipeline when it is not there yet.
nlp = spacy.blank("en")
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)

# Read the annotated examples that the rest of this cell trains on.
with open("/content/train_data.json", mode="r", encoding="utf-8") as file:
    training_data = json.load(file)
 
def clean_overlapping_entities(entities):
    """Drop entity spans that overlap an earlier-starting span.

    Spans are visited in order of their start offset (ties keep their input
    order). A span is kept only if it starts at or after the end of the last
    span that was kept; otherwise it is discarded.

    Args:
        entities: Iterable of ``(start, end, label)`` triples (lists or
            tuples). The input is not modified.

    Returns:
        A new list of non-overlapping ``(start, end, label)`` tuples ordered
        by start offset.
    """
    cleaned_entities = []
    last_end = -1

    # sorted() builds a new list, so the caller's annotation data keeps its
    # original order (list.sort() would reorder it in place) and any
    # iterable, not just a list, is accepted.
    for start, end, label in sorted(entities, key=lambda span: span[0]):
        if start < last_end:
            # Overlaps the previously kept span: skip it.
            continue
        cleaned_entities.append((start, end, label))
        last_end = end

    return cleaned_entities
 
 
# Normalise every raw entry into (text, {"entities": [...]}) form with
# overlapping spans removed. Malformed entries are reported and skipped.
all_data = []
for entry in training_data:
    try:
        text, annotations = entry
        entities = annotations["entities"]
        cleaned_entities = clean_overlapping_entities(entities)
        all_data.append((text, {"entities": cleaned_entities}))
    except (ValueError, KeyError, TypeError) as e:
        # TypeError covers entries that cannot be unpacked or whose
        # annotations object is not subscriptable by key.
        print(f"Data entry is incorrectly formatted: {entry}, Error: {e}")


# Hold out 15% of the examples for evaluation; the fixed random_state keeps
# the split identical between runs.
train_data, test_data = train_test_split(all_data, test_size=0.15, random_state=42)


print(f"Total valid training entries: {len(train_data)}")
print(f"Total valid testing entries: {len(test_data)}")


# Register every label that occurs in the training split with the NER component.
for _, annotations in train_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

# Seed the shuffle below so the order in which examples are presented is
# repeatable, matching the seeded train/test split.
random.seed(42)

# Training the model
n_iter = 750
# select_pipes / initialize are the spaCy v3 names for the deprecated
# disable_pipes / begin_training aliases.
with nlp.select_pipes(disable=unaffected_pipes):
    optimizer = nlp.initialize()
    for itn in range(n_iter):
        losses = {}

        random.shuffle(train_data)

        # Batch size grows from 4 towards 32 within each pass over the data.
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in batch]
            # Pass the optimizer explicitly instead of relying on the
            # pipeline's internally stored one.
            nlp.update(examples, sgd=optimizer, drop=0.3, losses=losses)
        # .get avoids a KeyError when no batch was processed (empty train_data).
        print(f"Iteration {itn + 1}/{n_iter}, Loss: {losses.get('ner', 0.0)}")

# Specify the path in your Google Drive
model_output_path = "/content/drive/MyDrive/saved ner"
os.makedirs(model_output_path, exist_ok=True)

# Persist the trained pipeline: the evaluation and inference cells load it
# from this path, so skipping the save makes them fail on a fresh run or
# silently use a previously saved model.
nlp.to_disk(model_output_path)
print(f"Model training complete and saved to {model_output_path}")

In [None]:
#Testing the trained model
import spacy
from sklearn.metrics import precision_recall_fscore_support
 
# Load the trained model
model_output_path = "/content/drive/MyDrive/saved ner"  # Adjust to your model path
nlp = spacy.load(model_output_path)

# Gold and predicted (start_char, end_char, label) spans gathered over the
# whole test set; only their counts are reported below.
y_true = []
y_pred = []
# Number of predicted spans that exactly match a gold span (same offsets and
# same label) within the same text.
true_positives = 0

# NOTE: test_data comes from the training cell; run that cell first.
for text, annotations in test_data:
    doc = nlp(text)

    predicted_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    true_entities = [(ent[0], ent[1], ent[2]) for ent in annotations['entities']]

    y_true.extend(true_entities)
    y_pred.extend(predicted_entities)
    # Match per document: the same offsets in two different texts are
    # unrelated spans, so the intersection must not be taken globally.
    true_positives += len(set(true_entities) & set(predicted_entities))

    print(f"Text: {text}")
    print(f"Predicted Entities: {predicted_entities}")
    print(f"True Entities: {true_entities}\n")

print(f"true labels: {len(y_true)}")
print(f"predicted labels: {len(y_pred)}")

if not y_pred:
    print("No predicted labels were found. Check the model output.")

# Entity-level (exact-match) scores. Pairing gold and predicted labels by
# list position, as a flat label-vs-label comparison does, is only defined
# when both lists have the same length and even then compares unrelated
# entities; counting matched spans works for any number of predictions.
precision = true_positives / len(y_pred) if y_pred else 0.0
recall = true_positives / len(y_true) if y_true else 0.0
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

In [None]:
#Finally, check how well the trained model works on a single resume file
import spacy
from pypdf import PdfReader
from docx import Document
 
# Load your trained NER model
# The pipeline is read from the directory below and bound to the module-level
# name `nlp`, which extract_entities_from_text() uses to process text.
model_output_path = "/content/drive/MyDrive/saved ner"
nlp = spacy.load(model_output_path)
 
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path``.

    The text of each page is followed by a newline, and the pages are
    concatenated in document order.
    """
    with open(pdf_path, "rb") as pdf_stream:
        # Extraction happens while the file is still open.
        pages = PdfReader(pdf_stream).pages
        return "".join(page.extract_text() + "\n" for page in pages)
 
def extract_text_from_docx(docx_path):
    """Return the text of the DOCX file at ``docx_path``.

    The text of each paragraph is followed by a newline, and the paragraphs
    are concatenated in document order.
    """
    paragraphs = Document(docx_path).paragraphs
    return "".join(para.text + "\n" for para in paragraphs)
 
def extract_entities_from_text(text):
    """Run the module-level ``nlp`` pipeline on ``text``.

    Returns a list of ``(entity_text, entity_label)`` tuples, one per entity
    found in the processed document.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]
 
file_path = "/content/SHOAIB.pdf" # change to a PDF or DOCX path accordingly


# Pick the extractor from the file extension. The comparison is
# case-insensitive so names such as "resume.PDF" or "CV.Docx" are accepted
# instead of being rejected as unsupported.
lowered_path = file_path.lower()
if lowered_path.endswith('.pdf'):
    extracted_text = extract_text_from_pdf(file_path)
elif lowered_path.endswith('.docx'):
    extracted_text = extract_text_from_docx(file_path)
else:
    raise ValueError("Unsupported file type. Please provide a PDF or DOCX file.")

# Uncomment to inspect the raw text handed to the model.
#print("Extracted Text:")
#print(extracted_text)

# Run the NER model over the extracted text and list every (text, label) pair.
entities = extract_entities_from_text(extracted_text)
print("Extracted Entities:")
for entity in entities:
    print(entity)