In [1]:
import warnings
import os
import pandas as pd
import numpy as np
import spacy
from nltk.corpus import stopwords
warnings.filterwarnings("ignore")
from io import BytesIO

# Import your DI container and components
from DependencyInjection.container import Container




In [2]:
# Initialize the dependency-injection container. The cells below obtain the
# PDF/Word extractors and the ML model helper from it.
container = Container()

def extract_resume_text(file):
    """Extract the text of a PDF/DOC/DOCX document.

    Args:
        file: Either a filesystem path (``str``) or a file-like object
            (``BytesIO`` or anything exposing ``read``) whose ``name``
            attribute carries the original file name.

    Returns:
        The extracted text on success. On any failure a message starting
        with ``"Error"`` is returned instead of raising, so callers should
        check ``result.startswith("Error")``.
    """
    try:
        is_path = isinstance(file, str)
        if is_path:
            if not os.path.exists(file):
                raise FileNotFoundError('File not found')
            file_name = file
        elif isinstance(file, BytesIO) or hasattr(file, "read"):
            # A plain BytesIO has no `name`; fall back to "" so the request is
            # reported as an unsupported format rather than an AttributeError.
            file_name = str(getattr(file, "name", "") or "")
        else:
            return f"Error: Unsupported input type '{type(file).__name__}'"

        # Match the extension case-insensitively for paths and uploads alike.
        lowered_name = file_name.lower()
        if lowered_name.endswith('.pdf'):
            extractor = container.pdf_extractor()
        elif lowered_name.endswith(('.doc', '.docx')):
            extractor = container.word_extractor()
        else:
            return "Error: Unsupported file format (only PDF/DOC/DOCX accepted)"

        if is_path:
            extracted_text = extractor.extract_text_from_path(file)
        else:
            # NOTE(review): the Word extractor is also called through
            # `extract_text_from_pdf_file`; confirm it exposes that method.
            extracted_text = extractor.extract_text_from_pdf_file(file)

        # `"".isspace()` is False, so test for empty/None as well as
        # whitespace-only output.
        if not extracted_text or not extracted_text.strip():
            return "Error: No Text could be Extracted from file"

        return extracted_text

    except Exception as e:
        # Deliberate best-effort: every failure is reported as an "Error: ..." string.
        return f'Error: {e}'

Why spaCy?
More accurate tokenization (handles hyphenated words, contractions, and punctuation better).
Built-in lemmatization (no need for additional libraries like NLTK's WordNet).
Faster (optimized Cython backend).
Scalable (supports custom pipelines for resume-specific terms).


In [3]:
# Load spaCy's small English pipeline once; process_text_with_spacy uses it
# for tokenization, lemmatization and named-entity recognition.
nlp = spacy.load('en_core_web_sm')

def process_text_with_spacy(text):
    """Run ``text`` through the spaCy pipeline and collect its features.

    Args:
        text: Raw text to analyze.

    Returns:
        A dict with three keys:
            "tokens":   lower-cased token texts, excluding stop words,
                        punctuation and whitespace tokens.
            "lemmas":   lower-cased lemmas of exactly the same tokens.
            "entities": mapping of entity text to entity label; when the
                        same text occurs more than once the last label wins.
    """
    doc = nlp(text)

    # Filter once so tokens and lemmas stay aligned and whitespace tokens
    # (e.g. "\n\n" separators) are kept out of both lists.
    kept = [
        token for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    tokens = [token.text.lower() for token in kept]
    lemmas = [token.lemma_.lower() for token in kept]

    # Named Entity Recognition
    entities = {ent.text: ent.label_ for ent in doc.ents}

    return {"tokens": tokens, "lemmas": lemmas, "entities": entities}

In [4]:
# Read the first 100 rows of the resume and job-posting CSVs.
# No cleaning happens in this cell; the frames are loaded as-is.
resumeDF = pd.read_csv('data/Resume.csv', nrows = 100)
jdDF = pd.read_csv('data/fake_job_postings.csv', nrows = 100)

In [5]:
# Preview the first rows of the resume DataFrame.
resumeDF.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [6]:
# Preview the first rows of the job-posting DataFrame.
jdDF.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [7]:
resumeDF.info()
# Observations:
# No null values are present in the 100 loaded rows.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           100 non-null    int64 
 1   Resume_str   100 non-null    object
 2   Resume_html  100 non-null    object
 3   Category     100 non-null    object
dtypes: int64(1), object(3)
memory usage: 3.3+ KB


In [8]:
# Check column dtypes and null counts of the job postings; unlike the resumes,
# several columns here have missing values (e.g. `requirements`).
jdDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               100 non-null    int64 
 1   title                100 non-null    object
 2   location             100 non-null    object
 3   department           27 non-null     object
 4   salary_range         13 non-null     object
 5   company_profile      88 non-null     object
 6   description          100 non-null    object
 7   requirements         89 non-null     object
 8   benefits             54 non-null     object
 9   telecommuting        100 non-null    int64 
 10  has_company_logo     100 non-null    int64 
 11  has_questions        100 non-null    int64 
 12  employment_type      83 non-null     object
 13  required_experience  61 non-null     object
 14  required_education   67 non-null     object
 15  industry             78 non-null     object
 16  function 

In [9]:
# `requirements` has missing values; replace them with '' so the string
# concatenation that builds the JD text does not fail on NaN.
jdDF['requirements'] = jdDF['requirements'].fillna('').astype(str)

In [10]:
# Build one text blob per resume and one per job posting.
# Every resume string gets a trailing blank line appended.
resume_text = [resume_body + '\n\n' for resume_body in resumeDF['Resume_str']]

# A posting's text is its title, description and requirements,
# separated by blank lines.
jd_text = [
    '\n\n'.join((title, description, requirements))
    for title, description, requirements in zip(
        jdDF['title'], jdDF['description'], jdDF['requirements']
    )
]

# Smoke test for extract_resume_text on a single path.
if __name__ == '__main__':
    # NOTE(review): "data" has no PDF/DOC/DOCX extension, so this call returns
    # the unsupported-format error; point it at an actual resume file.
    file_path = "data"

    # Process the file
    resume = extract_resume_text(file_path)

    # Print results
    if resume.startswith("Error"):
        print(resume)
    else:
        print("Successfully extracted text:\n")
        print(resume)

# To Tackle the Memory Error while processing the large texts we divide the text into chunks.

def process_text_in_chunks(texts, batch_size):
    """Process ``texts`` in consecutive groups of ``batch_size``.

    Each group is joined with single spaces into one string and handed to
    ``process_text_with_spacy``.

    Args:
        texts: Sequence of strings.
        batch_size: Number of texts per group (the last group may be smaller).

    Returns:
        A list with one ``process_text_with_spacy`` result per group.
    """
    batch_starts = range(0, len(texts), batch_size)
    return [
        process_text_with_spacy(" ".join(texts[start:start + batch_size]))
        for start in batch_starts
    ]


In [14]:
# Run every resume through spaCy individually (one result dict per resume).
resume_processed = [process_text_with_spacy(text) for text in resume_text]

In [15]:
# Run every job-description text through spaCy individually.
description_processed = [process_text_with_spacy(text) for text in jd_text]

In [18]:
# Obtain the ML model helper from the container; later cells use it to load
# S-BERT and to calculate the matching score.
model = container.ml_models()

In [19]:
# Collect the per-document token lists, then flatten them into one token list
# for all resumes and one for all job descriptions.
resume_tokens_list = [doc["tokens"] for doc in resume_processed]
jd_tokens_list = [doc["tokens"] for doc in description_processed]
all_resume_tokens = [token for tokens in resume_tokens_list for token in tokens]
all_jd_tokens = [token for tokens in jd_tokens_list for token in tokens]

# word_weights = model.load_TfIdfVectorizer(all_resume_tokens, all_jd_tokens)
# print(word_weights)

# NOTE(review): the triple-quoted string below is disabled code, not a comment;
# as the last expression of the cell its repr is echoed as cell output. It
# references `word_weights` (only assigned in the commented-out line above) and
# `sbert_model` (loaded in a later cell), so it cannot be re-enabled as-is.
'''
resume_embeddings = []
jd_embeddings = []
for tokens in resume_tokens_list:
    embed = model.get_weighted_embeddings_from_sbert(tokens, word_weights, sbert_model)
    resume_embeddings.append(embed)

for tokens in jd_tokens_list:
    embed = model.get_weighted_embeddings_from_sbert(tokens, word_weights, sbert_model)
    jd_embeddings.append(embed)

resume_embeddings = np.array(resume_embeddings)
jd_embeddings = np.array(jd_embeddings)
'''

In [20]:
# Load the S-BERT model; it is passed to the matching-score calculation below.
sbert_model = model.load_sbert_model()

'\nresume_embeddings = []\njd_embeddings = []\nfor tokens in resume_tokens_list:\n    embed = model.get_weighted_embeddings_from_sbert(tokens, word_weights, sbert_model)\n    resume_embeddings.append(embed)\n\nfor tokens in jd_tokens_list:\n    embed = model.get_weighted_embeddings_from_sbert(tokens, word_weights, sbert_model)\n    jd_embeddings.append(embed)\n\nresume_embeddings = np.array(resume_embeddings)\njd_embeddings = np.array(jd_embeddings)\n'

In [21]:
# Train the Resume Scoring Model
# rf_model = model.train_scoring_model(resume_embeddings, jd_embeddings)

# Testing the Prediction Capacity of the Model:
jd_new_path = "data/Machine_Learning_Engineer_Job_Description.pdf"
resume_new_path = "data/Rahul Victor Sunkara_UpdatedResume.pdf"

# Extract the Text
resume_text_new = extract_resume_text(resume_new_path)
jd_text_new = extract_resume_text(jd_new_path)  # TODO: try renaming the helper to extract_text_from_pdf

# Preprocess
resume_text_input = " ".join(process_text_with_spacy(resume_text_new)["lemmas"])
jd_text_input = " ".join(process_text_with_spacy(jd_text_new)["lemmas"])

# Calculate matching percentage
# NOTE(review): the file paths are passed to the scorer here, while the
# extracted and preprocessed texts above are never used. Confirm whether
# calculate_matching_score expects paths or text.
match_percent = model.calculate_matching_score(jd_new_path, resume_new_path, sbert_model)
print(f"\nMatching Percentage: {match_percent:.2f}%")


In [None]:
import streamlit as st
from io import BytesIO

# FrontEnd Webpage using Streamlit.
st.set_page_config(page_title="Resume and JD Matching", layout="wide")
st.title("Resume Screener")

# Get the Job Description Text
jd_text_new = st.text_area("Input Job Description", height=300)

# Upload Resume
resume_file = st.file_uploader('Upload your Resume File in PDF / Docx Format.', type=['pdf', 'doc', 'docx'])

if st.button("Match Resume"):
    # Validate what the user entered in this form. `jd_text` / `resume_text`
    # are the dataset lists built earlier in the notebook, not the form inputs.
    if not (jd_text_new or "").strip() or not resume_file:
        st.warning("Please upload your Resume File and Job Description")
    else:
        match_percent = None
        with st.spinner('Processing...'):
            resume_text_new = extract_resume_text(resume_file)

            # extract_resume_text reports failures as "Error..." strings;
            # do not score an error message as if it were a resume.
            if not resume_text_new.startswith("Error"):
                resume_text_input = " ".join(process_text_with_spacy(resume_text_new)["lemmas"])
                jd_text_input = " ".join(process_text_with_spacy(jd_text_new)["lemmas"])

                # Calculate matching percentage
                # NOTE(review): the raw texts are passed, mirroring the
                # original argument names; the lemmatized `*_input` strings
                # are still unused. Confirm which form the scorer expects.
                match_percent = model.calculate_matching_score(jd_text_new, resume_text_new, sbert_model)

        if match_percent is None:
            st.error(resume_text_new)
        else:
            st.success(" Matching Completed!")
            st.markdown(f" Matching Score: `{match_percent:.2f}%`")

            # Optional: show as progress bar (clamped to the 0.0-1.0 range
            # that st.progress accepts for floats).
            st.progress(float(max(0.0, min(match_percent / 100, 1.0))))