# Initialization

In [None]:
# Mount Google Drive at /content/drive; every project path used below lives
# under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import random
import json
import subprocess
import spacy
import shutil

from spacy.tokens import DocBin, Doc, Span
from spacy.util import filter_spans
from google.colab import drive
from pathlib import Path

# Project folders on the mounted Drive.
# dataset_path: hand-annotated resume JSON files (globbed for *.json below).
dataset_path = '/content/drive/MyDrive/resume-shortlister/dataset/'
# augmented_data_path: synthetic JSON samples written by augment_and_save_data.
augmented_data_path = '/content/drive/MyDrive/resume-shortlister/augmented_data/'
# spacy_dataset_path: .spacy files produced from both the annotated and the augmented JSON.
spacy_dataset_path = '/content/drive/MyDrive/resume-shortlister/spacy_dataset/'

# Destinations used by split_data when it moves .spacy files out of spacy_dataset_path.
training_data_path = '/content/drive/MyDrive/resume-shortlister/training_data/'
testing_data_path = '/content/drive/MyDrive/resume-shortlister/testing_data/'

# Blank English pipeline; used for tokenisation while building the training data.
nlp = spacy.blank("en")

# Convert annotated JSON data into spaCy's binary (.spacy) format

In [None]:
def convert_json_to_spacy(json_path, output_path):
    """Convert one annotated JSON file into a spaCy ``DocBin`` file.

    The JSON document must have an ``annotations`` list whose items unpack as
    ``(text, {"entities": [(start, end, label), ...]})``.

    Args:
        json_path: Path of the JSON file to read (UTF-8).
        output_path: Path of the ``.spacy`` file to write.

    Entities whose character offsets do not line up with token boundaries are
    dropped (``Doc.char_span`` returns ``None`` for them) and counted in a
    one-line summary. Overlapping entities are resolved with ``filter_spans``
    because assigning overlapping spans to ``doc.ents`` raises an error.
    """
    with open(json_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    doc_bin = DocBin()  # Collects one Doc per annotated text
    misaligned = 0

    for text, ann in data['annotations']:
        doc = nlp.make_doc(text)  # Tokenise only; no pipeline components run
        spans = []
        for start, end, label in ann['entities']:
            span = doc.char_span(start, end, label=label)
            if span is None:
                misaligned += 1
                continue
            spans.append(span)
        # doc.ents rejects overlapping spans, so resolve overlaps first.
        doc.ents = filter_spans(spans)
        doc_bin.add(doc)

    if misaligned:
        print(f"Skipped {misaligned} misaligned entities in {json_path}")

    # Save the DocBin to disk
    doc_bin.to_disk(output_path)

Path(spacy_dataset_path).mkdir(parents=True, exist_ok=True)  # Ensure the output directory exists
# Convert every annotated JSON file in dataset_path; each output keeps the
# source file's stem and gets a .spacy extension inside spacy_dataset_path.
for json_file in Path(dataset_path).glob('*.json'):
    output_file = Path(spacy_dataset_path) / f'{json_file.stem}.spacy'
    convert_json_to_spacy(json_file, output_file)
    print(f"Converted {json_file.name} to .spacy format at {output_file}")

Converted Maleesha_Thalagala.json to .spacy format at /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Maleesha_Thalagala.spacy
Converted Sachith_Priyashantha.json to .spacy format at /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Sachith_Priyashantha.spacy
Converted Milinda_Senaka.json to .spacy format at /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Milinda_Senaka.spacy
Converted Dinooli_Uduwarage_SE.json to .spacy format at /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Dinooli_Uduwarage_SE.spacy
Converted Ayesha_Ekanayake.json to .spacy format at /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Ayesha_Ekanayake.spacy
Converted Kavindi_Gunasekara.json to .spacy format at /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Kavindi_Gunasekara.spacy
Converted Widuranga_Dilruksha.json to .spacy format at /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Widuranga_Dilruksha.spacy
Converted Hasanga_Lakdinu.json to .spacy format at /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Hasang

# Data Augmentation

In [None]:
# Substitution pools for synthetic resume generation.
# entity_map is keyed by entity label; augment_and_save_data draws one value
# (random.choice) or several values (random.sample) from each list.
entity_map = {
    "TECHNICAL SKILLS": [
        "Python", "JavaScript", "C++", "Angular", "Java", "Spring Boot", "Rust", "Node.js", "Docker",
        "AWS", "TensorFlow", "React.js", "Kubernetes", "SQL", "NoSQL", "MongoDB", "Azure", "Machine Learning",
        "Deep Learning", "PyTorch", "Pandas", "Data Visualization", "Cybersecurity", "ERP Solutions", "Salesforce"
    ],
    "NON TECHNICAL SKILLS": [
        "leadership", "communication", "teamwork", "problem solving", "time management",
        "project management", "strategic planning", "customer service", "agile methodologies",
        "public speaking", "critical thinking", "negotiation", "adaptability", "emotional intelligence"
    ],
    "EXPERIENCES": [
        "internship at Google", "internship at Amazon", "software development at a startup",
        "research assistantship in machine learning", "part-time developer at a local tech firm",
        "software engineering at Virtusa", "IT consultancy at WSO2", "data analysis at Pearson Lanka",
        "cloud engineering at Sysco LABS", "project management at MillenniumIT ESP"
    ],
    "DEGREE": [
        "B.Sc. Computer Science", "M.Sc. Data Science", "Ph.D. in Artificial Intelligence",
        "B.Eng. Software Engineering", "B.Sc. in Information Technology",
        "B.Sc. in Network Technology", "M.Sc. in Cybersecurity", "B.Sc. in Interactive Media",
        "M.Sc. in Big Data Analytics", "Ph.D. in Computational Biology"
    ],
    # NOTE(review): some stream names ("Information Technology", "Biology") are
    # also substrings of DEGREE values above, so both can occur in one sentence.
    "AL STREAM": [
        "Biology", "Information Technology", "Physical Science", "Commerce", "Arts",
        "Mathematics", "Engineering Technology", "Bio-system Technology", "Health Science"
    ],
    "PROJECTS": [
        "machine learning model for predicting stock prices", "web development for a local business",
        "mobile app for campus navigation", "blockchain-based voting system", "IoT home automation system",
        "AI-driven chatbot for customer service", "e-commerce platform optimization using data analytics",
        "virtual reality application for real estate", "augmented reality educational tools",
        "smart city traffic management system"
    ],
    "OTHER EXPERIENCES": [
        "volunteering at an NGO for teaching coding to kids", "teaching assistant in a computer science course",
        "freelance graphic designer for digital marketing", "tech blog writer", "open source contributor",
        "member of the IEEE Student Branch", "organizer of local hackathons", "mentor in coding bootcamps",
        "participant in National IT competitions", "volunteer in disaster recovery initiatives"
    ],
}

# Name pools: a candidate name is one random first name plus one random last name.
first_names = [
    "Thilini", "Avishka", "Asini", "Dulani", "Harini", "Yasas", "Mihindu", "Ashani", "Navindu", "Anuda",
    "Lahiru", "Chamari", "Isuru", "Madushani", "Nipuna", "Sanjaya", "Maneesha", "Roshan", "Kavindya", "Sachith",
    "Nirmal", "Pradeep", "Sahan", "Sanduni", "Pavithra", "Chathura", "Danushka", "Gayathri", "Hiruni", "Janith",
    "Kusal", "Malith", "Nadeesha", "Oshada", "Prashani", "Ruvini", "Sajith", "Tharindu", "Udara", "Vihanga",
    "Yohan", "Zahra", "Bimsara", "Chamath", "Dinuka", "Eranga", "Fathima", "Gihan", "Hansika", "Indeewari",
    "Jeewan", "Kasun", "Lasith", "Mahesh", "Nimasha", "Omal", "Piumi", "Rashmi", "Suresh", "Thushara"
]
last_names = [
    "Samarasinghe", "Fernando", "De Silva", "Ranasinghe", "Perera", "Amarasinghe", "Gunarathna", "Senaka", "Senarath", "Liyanage",
    "Jayawardena", "Rajapaksa", "Bandara", "Wickramasinghe", "Herath", "Silva", "Kumara", "Dissanayake", "Rathnayake", "Jayasuriya",
    "Karunaratne", "Weerasinghe", "Rodrigo", "Abeysekara", "Balasooriya", "Chandrasekara", "Dharmaratne", "Ekanayake", "Fonseka", "Gamage",
    "Hewavitharana", "Ilangaratne", "Jayakody", "Kotalawala", "Lakshman", "Mendis", "Nanayakkara", "Pathirana", "Ranatunga", "Seneviratne",
    "Thalagala", "Udawatte", "Vithanage", "Wijesinghe", "Yapa", "Zoysa", "Athukorala", "Bandaranaike", "Coomaraswamy", "Deshapriya"
]


In [None]:
def generate_text(new_data):
    """
    Render one synthetic resume paragraph from a dict of entity values.

    List-valued fields (OTHER EXPERIENCES, TECHNICAL SKILLS, NON TECHNICAL
    SKILLS) are joined with ', '; every other field is interpolated as-is.

    NOTE(review): the template already ends with '.', and one more '.' is
    appended after stripping, so the returned text ends with '..'. The
    original author states the extra period helps token boundary detection;
    keep it unless tokenisation of the final URL has been re-checked.
    """
    joined = {
        key: ', '.join(new_data[key])
        for key in ('OTHER EXPERIENCES', 'TECHNICAL SKILLS', 'NON TECHNICAL SKILLS')
    }

    fragments = [
        f"{new_data['CANDIDATE NAME']} recently graduated with a {new_data['DEGREE']} ",
        f"from the {new_data['AL STREAM']} stream, achieving a GPA of {new_data['GPA']}. ",
        f"Worked on the project titled '{new_data['PROJECTS']}', gained experience as ",
        f"{new_data['EXPERIENCES']}, and was involved in {joined['OTHER EXPERIENCES']}. "
        f"Known for expertise in {joined['TECHNICAL SKILLS']} ",
        f"and also possesses skills in {joined['NON TECHNICAL SKILLS']}. ",
        f"More on their professional profile can be seen at {new_data['LINKEDIN URL']} ",
        f"or {new_data['GITHUB URL']}.",
    ]

    paragraph = ''.join(fragments).strip()
    return paragraph + '.'

def create_spacy_training_data(new_data, nlp):
    """Build one ``(text, {'entities': [...]})`` training example.

    The text comes from ``generate_text(new_data)``. Each field of
    ``new_data`` is then located in that text and recorded as
    ``(start_char, end_char, label)``. List values are joined with ', '
    exactly as ``generate_text`` does, so a whole list becomes one entity.

    Args:
        new_data: Mapping of entity label -> string or list of strings.
        nlp: Callable returning a Doc for the text; only ``Doc.char_span`` is used.

    Returns:
        Tuple ``(text, {'entities': entities})`` with entities listed in the
        key order of ``new_data``.

    A value can also occur inside another, longer value (for example an AL
    STREAM of "Information Technology" inside the DEGREE "B.Sc. in Information
    Technology"). Taking the first ``str.find`` hit would label the wrong
    mention, so values are placed longest-first and each one takes the first
    occurrence that does not overlap an already placed span.
    """
    text = generate_text(new_data)
    doc = nlp(text)

    # Normalise every value to the exact string that generate_text embedded.
    values = {
        label: ', '.join(value) if isinstance(value, list) else str(value)
        for label, value in new_data.items()
    }

    claimed = []   # (start_char, end_char) of spans already assigned
    located = {}   # label -> (start_char, end_char)

    for label in sorted(values, key=lambda name: len(values[name]), reverse=True):
        value = values[label]
        start = text.find(value) if value else -1
        if start == -1:
            print(f"Value for {label} not found in text: {value}")
            continue

        chosen = None
        while start != -1:
            end = start + len(value)
            # 'contract' snaps the span inward to token boundaries (or yields None).
            span = doc.char_span(start, end, alignment_mode='contract')
            if span is not None and all(
                span.end_char <= taken_start or span.start_char >= taken_end
                for taken_start, taken_end in claimed
            ):
                chosen = (span.start_char, span.end_char)
                break
            start = text.find(value, start + 1)

        if chosen is None:
            print(f"Skipping entity due to misalignment or overlap: {label} ({value})")
            continue

        claimed.append(chosen)
        located[label] = chosen

    # Report entities in the caller's key order, not in placement order.
    entities = [(*located[label], label) for label in new_data if label in located]
    return (text, {'entities': entities})

def augment_and_save_data(nlp):
    """Generate one synthetic resume sample and save it as JSON.

    A random candidate name, GPA and entity values are drawn from
    ``first_names``, ``last_names`` and ``entity_map``. They are turned into a
    training example by ``create_spacy_training_data`` and written into
    ``augmented_data_path``.

    Args:
        nlp: Pipeline forwarded to ``create_spacy_training_data``.

    The file is named after the candidate. Names repeat across many draws, so
    a numeric suffix (``_1``, ``_2``, ...) is added when the name is already
    taken; otherwise a later sample would overwrite an earlier one. As a
    consequence, re-running the generation adds files instead of replacing them.
    """
    first_name = random.choice(first_names)
    last_name = random.choice(last_names)
    candidate_name = f"{first_name} {last_name}"
    gpa = round(random.uniform(2.0, 4.0), 2)

    new_data = {
        "CANDIDATE NAME": candidate_name,
        "TECHNICAL SKILLS": random.sample(entity_map["TECHNICAL SKILLS"], random.randint(3, 5)),
        "NON TECHNICAL SKILLS": random.sample(entity_map["NON TECHNICAL SKILLS"], random.randint(3, 5)),
        "EXPERIENCES": random.choice(entity_map["EXPERIENCES"]),
        "DEGREE": random.choice(entity_map["DEGREE"]),
        "GPA": str(gpa),
        "AL STREAM": random.choice(entity_map["AL STREAM"]),
        "PROJECTS": random.choice(entity_map["PROJECTS"]),
        "OTHER EXPERIENCES": random.sample(entity_map["OTHER EXPERIENCES"], random.randint(1, 3)),
        "LINKEDIN URL": f"https://www.linkedin.com/in/{candidate_name.replace(' ', '.').lower()}/",
        "GITHUB URL": f"https://github.com/{candidate_name.replace(' ', '.').lower()}/"
    }

    training_example = create_spacy_training_data(new_data, nlp)

    # Pick a file name that is not in use yet so no earlier sample is lost.
    output_dir = Path(augmented_data_path)
    file_stem = candidate_name.replace(' ', '_')
    new_file_path = output_dir / f"{file_stem}.json"
    duplicate_index = 1
    while new_file_path.exists():
        new_file_path = output_dir / f"{file_stem}_{duplicate_index}.json"
        duplicate_index += 1

    # The converter reads these files back as UTF-8, so write them the same way.
    with open(new_file_path, 'w', encoding='utf-8') as file:
        json.dump(training_example, file)

# Create the directory if it does not exist
Path(augmented_data_path).mkdir(parents=True, exist_ok=True)
# Generate and save multiple augmented data files.
# NOTE(review): no random seed is set in the visible cells, so each run draws
# different samples — confirm whether reproducibility is required.
for _ in range(500):  # Number of synthetic samples to generate; adjust as needed
    augment_and_save_data(nlp)

print("Data augmentation complete and files saved.")

Data augmentation complete and files saved.


In [None]:
def convert_json_to_spacy(json_path, output_path):
    """Convert one augmented sample (``[text, {'entities': [...]}]``) to ``.spacy``.

    Entities are visited in order of their start offset. An entity is skipped
    when any of its character positions is already covered by an accepted
    entity, or when ``Doc.char_span`` (alignment mode 'contract') cannot map
    it onto tokens. The resulting single-document DocBin is written to
    ``output_path``.

    Note: this definition replaces the earlier function of the same name,
    which expects a different JSON layout.
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        payload = json.load(file)  # [text, annotations]

    text, annotations = payload[0], payload[1]
    doc = nlp.make_doc(text)

    accepted = []
    taken_chars = set()  # character offsets owned by accepted entities
    by_start = sorted(annotations['entities'], key=lambda item: item[0])

    for start, end, label in by_start:
        char_range = range(start, end)
        if not taken_chars.isdisjoint(char_range):
            print(f"Skipping overlapping entity {label} at {start}:{end}.")
            continue

        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        if not span:
            print(f"Skipping entity {label} at {start}:{end} due to misalignment.")
            continue

        accepted.append(span)
        taken_chars.update(char_range)

    doc.ents = accepted

    doc_bin = DocBin()  # Holds the single converted document
    doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Saved converted file to {output_path}")

def convert_directory(json_dir, spacy_dir):
    """Convert every ``*.json`` file in ``json_dir`` to a ``.spacy`` file in ``spacy_dir``.

    ``spacy_dir`` is created (including parents) when missing. Each output
    keeps the stem of its source file. Conversion is delegated to
    ``convert_json_to_spacy``.
    """
    target_root = Path(spacy_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    for source in Path(json_dir).glob('*.json'):
        convert_json_to_spacy(source, target_root / f'{source.stem}.spacy')

# Convert the augmented JSON samples; the outputs land in the same folder as
# the .spacy files produced from the annotated resumes.
convert_directory(augmented_data_path, spacy_dataset_path)

Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Eranga_Hewavitharana.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Lahiru_Dharmaratne.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Oshada_Abeysekara.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Kusal_Senarath.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Kavindya_Senarath.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Isuru_Dharmaratne.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Sahan_Wickramasinghe.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Harini_Vithanage.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Asini_Kumara.spacy
Saved converted file to /content/drive/MyDrive/RashmikaFYP/spacy_dataset/Eranga_Dharmaratne.spacy
Saved converted file to /conten

# Train Model

In [None]:
!python -m spacy init config --lang en --pipeline ner --optimize efficiency /content/drive/MyDrive/resume-shortlister/config/base_config.cfg --gpu-id 0


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



[paths]
train = "/content/drive/MyDrive/RashmikaFYP/spacy_dataset/"
dev = "/content/drive/MyDrive/RashmikaFYP/spacy_dataset/"
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.2
accumulate_gradient = 1
patience = 500
max_epochs = 100
max_steps = 20000
eval_frequency = 100
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 10
stop = 50
compound = 1.001

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.1
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]

In [None]:
# Finalize config
!python -m spacy init fill-config /content/drive/MyDrive/resume-shortlister/config/base_config.cfg /content/drive/MyDrive/resume-shortlister/config/final_config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/RashmikaFYP/config/final_config.cfg
You can now add your data and train your pipeline:
python -m spacy train final_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Inspect the entities stored in one converted resume.
# The train/test split is random and moves files out of spacy_dataset_path, so
# the sample may sit in any of the three folders depending on what has run.
sample_name = 'Maleesha_Thalagala.spacy'
candidate_paths = [
    Path(folder) / sample_name
    for folder in (training_data_path, testing_data_path, spacy_dataset_path)
]
spacy_file_path = next((path for path in candidate_paths if path.exists()), None)

if spacy_file_path is None:
    print(f"{sample_name} not found in the training, testing or spaCy dataset folders.")
else:
    # Load the .spacy file and print every entity with its label
    doc_bin = DocBin().from_disk(spacy_file_path)
    docs = list(doc_bin.get_docs(nlp.vocab))

    for doc in docs:
        for ent in doc.ents:
            print(ent.text, ent.label_)

MALEESHA THALAGALA CANDIDATE NAME
Level-4 Project: Identifying the Prevalence of Depression Using Twitter        
(Ongoing Research Project) PROJECTS
Internship Project at ISA: AeroCONNECT PROJECTS
B.SC(Hon’s)Information Technology and Management DEGREE
Java TECHNICAL SKILLS
Python TECHNICAL SKILLS
HTML TECHNICAL SKILLS
CSS TECHNICAL SKILLS
Bootstrap TECHNICAL SKILLS
MySQL TECHNICAL SKILLS
IntelliJ IDEA TECHNICAL SKILLS
Visual Studio TECHNICAL SKILLS
GIT TECHNICAL SKILLS
Bitbucket TECHNICAL SKILLS
Windows TECHNICAL SKILLS
Linux TECHNICAL SKILLS
https://www.linkedin.com/in/ma
leesha-thalagala/ LINKEDIN URL
https://github.com/MaleeshaTha
lagala96 GITHUB URL
Software Engineering Intern at Information Systems Associates (Pvt)Ltd: 
 August 2019 – January 2020 EXPERIENCES
Blogger at Medium: https://maleesha-16.medium.com/ OTHER EXPERIENCES
Writer at LinkIT -Information Technology Society (INTECS) - 
Faculty of Information Technology OTHER EXPERIENCES
Company coordinator – “FIT Future Ca

In [None]:
def split_data(dataset_path, training_data_path, testing_data_path, train_ratio=0.8, seed=None):
    """Randomly move the ``.spacy`` files of a folder into train and test folders.

    Args:
        dataset_path: Folder holding the ``*.spacy`` files (str or Path).
        training_data_path: Destination for the training share (created if missing).
        testing_data_path: Destination for the testing share (created if missing).
        train_ratio: Fraction of files that go to training; must be within [0, 1].
        seed: Optional seed. When given, the same set of files is always split
            the same way; when ``None`` the global ``random`` state is used.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].

    Files are moved, not copied, so the source folder no longer contains them
    afterwards and a second call only splits files added since.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    dataset_path = Path(dataset_path)
    training_data_path = Path(training_data_path)
    testing_data_path = Path(testing_data_path)

    # Create directories for train and test datasets if they don't exist
    training_data_path.mkdir(parents=True, exist_ok=True)
    testing_data_path.mkdir(parents=True, exist_ok=True)

    # Sort first: glob order is not guaranteed, and a seeded shuffle is only
    # reproducible when it starts from the same ordering.
    all_files = sorted(dataset_path.glob('*.spacy'))
    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(all_files)

    # Calculate split index
    split_idx = int(len(all_files) * train_ratio)

    # Split files into training and testing
    train_files = all_files[:split_idx]
    test_files = all_files[split_idx:]

    # Move files to respective directories
    for file in train_files:
        shutil.move(str(file), str(training_data_path / file.name))

    for file in test_files:
        shutil.move(str(file), str(testing_data_path / file.name))

    print(f"Training files: {len(train_files)} moved to {training_data_path}")
    print(f"Testing files: {len(test_files)} moved to {testing_data_path}")

# Move the converted files (annotated and augmented) into the training and
# testing folders using the default train ratio.
split_data(Path(spacy_dataset_path),Path(training_data_path), Path(testing_data_path))

Training files: 424 moved to /content/drive/MyDrive/RashmikaFYP/training_data
Testing files: 106 moved to /content/drive/MyDrive/RashmikaFYP/testing_data


In [None]:
!python -m spacy debug data /content/drive/MyDrive/resume-shortlister/config/final_config.cfg

[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: tok2vec, ner
424 training docs
106 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[38;5;3m⚠ Low number of examples to train a new pipeline (424)[0m
[1m
[38;5;4mℹ 84314 total word(s) in the data (7689 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 11 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities crossing sentence boundaries[0m
[1m
[38;5;2m✔ 7 checks passed[0m


In [None]:
!python -m spacy train /content/drive/MyDrive/resume-shortlister/config/final_config.cfg --output /content/drive/MyDrive/resume-shortlister/outputs/ --gpu-id 0

[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/RashmikaFYP/outputs[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     54.30    0.00    0.00    0.00    0.00
  0     100       2574.46   6951.02   19.39   34.08   13.55    0.19
  0     200       3842.57   4774.85   35.73   46.06   29.19    0.36
  0     300       4556.45   3249.31   82.28   86.03   78.85    0.82
  0     400       3172.78   2567.83   85.44   87.03   83.92    0.85
  1     500       1364.54   1437.53   88.96   92.73   85.48    0.89
  1     600       1557.72   1376.10   87.08   86.82   87.34    0.87
  1     700        785.37    834.03   89.61   90.32   88.91    0.90
  1     800       8399.47   1303.93   89.46   90.11   88.83    0.89
  2    

# Using the model

In [None]:
!pip install spacy pdfplumber



In [None]:
import pdfplumber
import re
from datetime import datetime

# Path to your trained model
model_path = "/content/drive/MyDrive/resume-shortlister/outputs/model-best"  # or model-last
# Directory containing PDFs
pdf_directory = "/content/drive/MyDrive/resume-shortlister/resumes/"
# Rebinds `nlp`: from here on it is the trained pipeline, not the blank English
# pipeline created in the initialization cell. Re-running earlier cells after
# this one would therefore use the trained model.
nlp = spacy.load(model_path)  # Load your trained spaCy model

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages that yielded text, joined with a newline. Pages
        for which ``extract_text()`` returns nothing are skipped; the result
        is an empty string when no page has text.

    Pages are joined with a newline, not concatenated directly, so the last
    word of one page is not fused with the first word of the next.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    return "\n".join(page_texts)

def extract_gpa(text):
    """Extract the first GPA/CGPA value mentioned in ``text``.

    The keyword (case-insensitive) may be followed by whitespace, ':', '=' or
    a dash, and optionally the word "of", before the number — e.g.
    "GPA: 3.5", "CGPA - 3.75", "GPA of 3.42".

    Args:
        text: Text to search.

    Returns:
        The value as a float, or ``None`` when no GPA is found.

    The number must be a well-formed decimal (digits with at most one decimal
    point), so a string such as "3.5.1" yields 3.5 and no longer makes
    ``float()`` raise.
    """
    gpa_pattern = r"\b(?:GPA|CGPA)[\s:=\-–]*(?:of\s*)?(\d+(?:\.\d+)?|\.\d+)\b"
    match = re.search(gpa_pattern, text, re.IGNORECASE)
    if match:
        return float(match.group(1))
    return None

def score_skills(entity_text, skills_dict):
    """Sum the points of every skill that occurs in ``entity_text``.

    Args:
        entity_text: Text of one recognised skill entity.
        skills_dict: Mapping of skill name -> points.

    Returns:
        Total points of the skills found (0 when none match).

    Matching is case-insensitive and on whole terms: the skill must not be
    directly preceded or followed by a word character. This stops "java" from
    scoring on "JavaScript". Lookarounds are used in place of ``\\b`` so skill
    names ending in punctuation, such as "c++", still match.
    NOTE(review): as a consequence "sql" no longer scores on "MySQL"; add such
    variants to ``skills_dict`` explicitly if they should count.
    """
    entity_text_lower = entity_text.lower()
    total = 0
    for skill, points in skills_dict.items():
        pattern = r"(?<!\w)" + re.escape(skill.lower()) + r"(?!\w)"
        if re.search(pattern, entity_text_lower):
            total += points
    return total

def calculate_experience_years(text):
    """Estimate experience from the span of years mentioned in ``text``.

    Every standalone four-digit year starting with "20" (2000-2099) is
    collected. The result is ``latest - earliest + 1``, or 0 when no such
    year occurs.

    NOTE(review): all years in the text count, whatever they refer to
    (education, projects, ...), so this is a rough upper bound, not measured
    work experience.
    """
    years = [int(year) for year in re.findall(r'\b(20\d{2})\b', text)]  # e.g. '2020'
    if not years:
        return 0
    return max(years) - min(years) + 1

In [None]:
# Preview the entities the trained model finds in one sample resume.
# Sorted so the same index refers to the same file on every run.
pdf_paths = sorted(Path(pdf_directory).glob("*.pdf"))
sample_index = 23  # Which PDF to preview; clamped below when fewer files exist

if pdf_paths:  # Check if there's at least one PDF
    pdf_path = pdf_paths[min(sample_index, len(pdf_paths) - 1)]
    # extract_text_from_pdf opens the file itself; no extra pdfplumber.open needed.
    text = extract_text_from_pdf(str(pdf_path))
    doc = nlp(text)
    # Print each detected entity and its label
    for ent in doc.ents:
        print(f"Label: {ent.label_} - Entity: {ent.text}")
else:
    print("No PDF files found in the directory.")

Label: CANDIDATE NAME - Entity: YASIRU WITHANAGE
Label: DEGREE - Entity: B.SC (HONS.) IN INFORMATION TECHNOLOGY
Label: DEGREE - Entity: B.Sc (Hons) Degree in Information Technology
Label: GITHUB URL - Entity: https://github.com/withanageyasiru
Label: NON TECHNICAL SKILLS - Entity: Leadership skill
Label: NON TECHNICAL SKILLS - Entity: Quick learner
Label: PROJECTS - Entity: SMART TRAINER - INTERN PROJECT
DYNAMIC OBJECT-ORIENTED VIDEO AND
PROGRAMMING PROGRAMMING GRAPHIC DESIGN In this project, data was taken by Multi Sensored Arduino T-shirt and passed to
HOBBIES the server by using a connected mobile device
Label: TECHNICAL SKILLS - Entity: Visual Studio
Label: PROJECTS - Entity: AUGMENTED REALITY-BASED INDOOR NAVIGATION SYSTEM
(cid:44)(cid:81)(cid:71)(cid:82)(cid:82)(cid:85)(cid:3)(cid:81)(cid:68)(cid:89)(cid:76)(cid:74)(cid:68)(cid:87)(cid:76)(cid:82)(cid:81)(cid:3)(cid:86)(cid:92)(cid:4235)(cid:72)(cid:80)(cid:3)(cid:90)(cid:75)(cid:76)(cid:70)(cid:75)(cid:3)(cid:68)(cid:79)(cid:79)

In [None]:
def shortlist_pdfs(pdf_directory, nlp, gpa_range=(2.0, 4.0), tech_skills_points=None, non_tech_skills_points=None):
    """Score every ``*.pdf`` in ``pdf_directory`` and rank them by score.

    Score per resume:
      * +5 when a GPA is found and lies within ``gpa_range`` (inclusive);
      * the ``score_skills`` points of every "TECHNICAL SKILLS" and
        "NON TECHNICAL SKILLS" entity, using the matching points table;
      * the value returned by ``calculate_experience_years`` for the text.

    Args:
        pdf_directory: Folder to scan for PDF files.
        nlp: Pipeline applied to the extracted text; its ``doc.ents`` are scored.
        gpa_range: ``(low, high)`` bounds for the GPA bonus.
        tech_skills_points: Skill -> points table; a built-in default is used when ``None``.
        non_tech_skills_points: Same, for non-technical skills.

    Returns:
        List of ``(file_name, score)`` tuples for all PDFs, highest score first.
    """
    tech_table = tech_skills_points
    if tech_table is None:
        tech_table = {"python": 2, "machine learning": 3, "java": 5, "sql": 2}
    soft_table = non_tech_skills_points
    if soft_table is None:
        soft_table = {"leadership": 1, "teamwork": 1}

    # Entity label -> points table used to score that kind of entity.
    points_for_label = {
        "TECHNICAL SKILLS": tech_table,
        "NON TECHNICAL SKILLS": soft_table,
    }

    ranked = []
    for pdf_path in Path(pdf_directory).glob("*.pdf"):
        text = extract_text_from_pdf(str(pdf_path))
        doc = nlp(text)

        # GPA bonus
        gpa = extract_gpa(text)
        score = 5 if gpa and gpa_range[0] <= gpa <= gpa_range[1] else 0

        # Skill points; entities with any other label are ignored
        for ent in doc.ents:
            table = points_for_label.get(ent.label_)
            if table is not None:
                score += score_skills(ent.text, table)

        # One point per year returned by the experience estimate
        score += calculate_experience_years(text)

        ranked.append((pdf_path.name, score))

    return sorted(ranked, key=lambda item: item[1], reverse=True)

In [None]:
# Score and rank every resume in pdf_directory with the default GPA range and skill tables
shortlisted_pdfs = shortlist_pdfs(pdf_directory, nlp)

# Output the names of shortlisted PDFs in descending order of scores
print("Shortlisted PDFs in Descending Order of Scores:")
for pdf, score in shortlisted_pdfs[:20]:  # Show only the 20 highest-scoring resumes
    print(f"{pdf}: {score}")

Shortlisted PDFs in Descending Order of Scores:
Milinda_Senaka_SE - Milinda Nandasena.pdf: 43
Janakee_Herath_SE - janakee herath.pdf: 40
Gihan_Kadawathage.pdf: 39
Chanduka_Samarasinghe.pdf: 39
Sinthujan.P_SE - Sinthujan Sintha.pdf: 38
lihini_jinanjalie QA - Lihini Jinanjalie.pdf: 37
Manodya_Abeysinghe_SE - Manodya Abeysinghe.pdf: 36
Udara_Wanasinghe_SE - W.A.U. WANASINGHE.pdf: 35
Nawodani_Jayasooriya_SE - W.M.L.N.K. JAYASOORIYA.pdf: 35
Sachith_Priyashantha_SE - R.S.S.S PRIYASHANTHA.pdf: 35
Vidya_Wimalasooriya.pdf: 35
Kelum_Sampath_SE - kelum sampath.pdf: 34
vaishnavi_QA - V. VELLAIPANDIAN.pdf: 33
Sanduni_Tharaka_SE - sanduni tharaka.pdf: 33
vaishnavi_QA (2) - vaishnavi pandiyan.pdf: 33
Nimesha_Herath_SE - H.M.N.D. HERATH.pdf: 31
Mahesh_Sarathchandra_SE - mahesh sarathchandra.pdf: 31
Sadheera_Mahanama_SE - S.S. MAHANAMA.pdf: 31
Sanduni_Wickramathunga_SE - sanduni wickramathunga.pdf: 31
Sanka_Dilshan_SE - sanka dilshan.pdf: 31
