In [1]:
import warnings
import os
import pandas as pd
import numpy as np
import spacy
warnings.filterwarnings("ignore")

# Import your DI container and components
from DependencyInjection.container import Container

In [2]:
# Initialize the container
container = Container()

def extract_resume_text(file_path):
    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError('File not found')

        if file_path.split()[-1].endswith('.pdf'):
            extractor = container.pdf_extractor()
        elif file_path.split()[-1].endswith(('.doc', '.docx')):
            extractor = container.word_extractor()
        else:
            return "Error: Unsupported file format (only PDF/DOC/DOCX accepted)"

        extracted_text = extractor.extract_text(file_path)

        if not extracted_text:
            return "Error: No Text could be Extracted from file"

        return extracted_text

    except Exception as e:
        return f'Error: {e}'

In [3]:
"""
Why spaCy?
More accurate tokenization (handles hyphenated words, contractions, and punctuation better).
Built-in lemmatization (no need for additional libraries like NLTK's WordNet).
Faster (optimized Cython backend).
Scalable (supports custom pipelines for resume-specific terms).
"""
#  spaCy (better for resume parsing since it includes NER and POS out-of-the-box)
nlp = spacy.load('en_core_web_sm')

def process_text_with_spacy(text):
    doc = nlp(text)

    # Preprocessing of Text:
    #Tokenize the Text, Excluding Stop-words and Punctuations:
    tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]


    # Lemmatization:
    lemmas = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]

    # Named Entity Recognition
    entities = {ent.text: ent.label_ for ent in doc.ents}


    return {"tokens": tokens, "lemmas": lemmas, "entities": entities}

In [4]:
# Clean and Read the Resume and Description Data from the CSV's
resumeDF = pd.read_csv('data/Resume.csv', nrows = 100)
jdDF = pd.read_csv('data/fake_job_postings.csv', nrows = 100)

In [5]:
# Inspect the Values of both the DataFrames.
resumeDF.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [6]:
jdDF.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [7]:
resumeDF.info()
# Obbservations:
# No Null Values Present

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           100 non-null    int64 
 1   Resume_str   100 non-null    object
 2   Resume_html  100 non-null    object
 3   Category     100 non-null    object
dtypes: int64(1), object(3)
memory usage: 3.3+ KB


In [8]:
jdDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               100 non-null    int64 
 1   title                100 non-null    object
 2   location             100 non-null    object
 3   department           27 non-null     object
 4   salary_range         13 non-null     object
 5   company_profile      88 non-null     object
 6   description          100 non-null    object
 7   requirements         89 non-null     object
 8   benefits             54 non-null     object
 9   telecommuting        100 non-null    int64 
 10  has_company_logo     100 non-null    int64 
 11  has_questions        100 non-null    int64 
 12  employment_type      83 non-null     object
 13  required_experience  61 non-null     object
 14  required_education   67 non-null     object
 15  industry             78 non-null     object
 16  function 

In [9]:
jdDF['requirements'] = jdDF['requirements'].fillna('').astype(str)

In [10]:
resume_text = []
jd_text = []

for i, row in resumeDF.iterrows():
    text = row['Resume_str'] + '\n\n'
    resume_text.append(text)

for i , row in jdDF.iterrows():
    text = row['title'] + '\n\n' + row['description'] + '\n\n' + row['requirements']
    jd_text.append(text)

In [11]:
'''
if __name__ == '__main__':
    file_path = "data/AllResumes.pdf"

    # Process the file
    resume = extract_resume_text(file_path)

    # Print results
    if resume.startswith("Error"):
        print(resume)
    else:
        print("Successfully extracted text:\n")
        print(resume)
'''

'\nif __name__ == \'__main__\':\n    file_path = "data/AllResumes.pdf"\n\n    # Process the file\n    resume = extract_resume_text(file_path)\n\n    # Print results\n    if resume.startswith("Error"):\n        print(resume)\n    else:\n        print("Successfully extracted text:\n")\n        print(resume)\n'

In [12]:
'''
job_description_path = 'data/Machine_Learning_Engineer_Job_Description.pdf'
jd = extract_resume_text(job_description_path).split('Job Description:', 1)[-1].strip()
print(jd)
'''

"\njob_description_path = 'data/Machine_Learning_Engineer_Job_Description.pdf'\njd = extract_resume_text(job_description_path).split('Job Description:', 1)[-1].strip()\nprint(jd)\n"

In [13]:
# To Tackle the Memory Error while processing the large texts we divide the text into chunks.
'''
def process_text_in_chunks(texts, batch_size):
    results = []
    for i in range(0, len(texts), batch_size):
        batch_text = " ".join(texts[i : i + batch_size])
        results.append(process_text_with_spacy(batch_text))
    return results
'''

'\ndef process_text_in_chunks(texts, batch_size):\n    results = []\n    for i in range(0, len(texts), batch_size):\n        batch_text = " ".join(texts[i : i + batch_size])\n        results.append(process_text_with_spacy(batch_text))\n    return results\n'

In [14]:
resume_processed = [process_text_with_spacy(text) for text in resume_text]

In [15]:
description_processed = [process_text_with_spacy(text) for text in jd_text]

In [16]:
'''
print("Resume Tokens:",  resume_processed["tokens"])
print("Resume Lemmas:", resume_processed["lemmas"])
print("Resume Entities:", resume_processed["entities"])
'''

'\nprint("Resume Tokens:",  resume_processed["tokens"])\nprint("Resume Lemmas:", resume_processed["lemmas"])\nprint("Resume Entities:", resume_processed["entities"])\n'

In [17]:
'''
print("JD Tokens:",  description_processed["tokens"])
print("JD Lemmas:", description_processed["lemmas"])
print("JD Entities:", description_processed["entities"])
'''

'\nprint("JD Tokens:",  description_processed["tokens"])\nprint("JD Lemmas:", description_processed["lemmas"])\nprint("JD Entities:", description_processed["entities"])\n'

In [18]:
model = container.ml_models()

In [19]:
resume_tokens_list = [doc["tokens"] for doc in resume_processed]
jd_tokens_list = [doc["tokens"] for doc in description_processed]
all_resume_tokens = [token for tokens in resume_tokens_list for token in tokens]
all_jd_tokens = [token for tokens in jd_tokens_list for token in tokens]

word_weights = model.load_TfIdfVectorizer(all_resume_tokens, all_jd_tokens)
print(word_weights)



In [None]:
resume_embeddings = []
jd_embeddings = []

#Load the S-BERT Model to get the Weighted Embeddings.
sbert_model = model.load_sbert_model()

for tokens in resume_tokens_list:
    embed = model.get_weighted_embeddings_from_sbert(tokens, word_weights, sbert_model)
    resume_embeddings.append(embed)

for tokens in jd_tokens_list:
    embed = model.get_weighted_embeddings_from_sbert(tokens, word_weights, sbert_model)
    jd_embeddings.append(embed)

resume_embeddings = np.array(resume_embeddings)
jd_embeddings = np.array(jd_embeddings)

In [21]:
# Train the Resume Scoring Model
rf_model = model.train_scoring_model(resume_embeddings, jd_embeddings)

X shape: (50, 3)
Scores shape: (50,)
Train MSE: 0.00423231060405445
Test MSE: 0.006465293453156952
Train R2: 0.9889219979298057
Test R2: 0.9847092789323548


In [22]:
# Testing the Prediction Capacity of the Model:
# Extract text
resume_text_new = extract_resume_text("data/Rahul Victor Sunkara_UpdatedResume.pdf")
jd_text_new = extract_resume_text("data/FashionDesignerJD.pdf")

# Preprocess
resume_tokens_new = process_text_with_spacy(resume_text_new)["tokens"]
jd_tokens_new = process_text_with_spacy(jd_text_new)["tokens"]

# Generate embeddings
word_weights = model.load_TfIdfVectorizer(resume_tokens_new, jd_tokens_new)
resume_embed_new = model.get_weighted_embeddings_from_sbert(resume_tokens_new, word_weights)
jd_embed_new = model.get_weighted_embeddings_from_sbert(jd_tokens_new, word_weights)

# Predict score
score = model.predict_score(resume_embed_new, jd_embed_new, rf_model)
print(f"Predicted Match Score: {score:.1f}/100")

ValueError: Expected 2D array, got 1D array instead:
array=[-2.75613924e-02  2.16123672e-02 -1.77339060e-02  1.61657573e-02
 -2.59013305e-02 -1.52357459e-02  6.27015269e-02  1.60088971e-02
 -4.42703922e-03  7.29167533e-04  1.33913817e-02 -1.83818255e-02
  1.28576959e-02  4.90252333e-03 -2.15056141e-02  5.45038637e-03
  1.61617524e-03 -1.58126602e-02 -5.30738654e-02 -4.02332654e-02
 -4.64961330e-02  8.12994636e-05 -1.87387798e-02  9.42223861e-03
 -7.47445236e-03  3.52637773e-02 -1.49947834e-02  1.71869382e-02
  2.94694134e-02 -8.25217143e-02 -6.56720532e-03  1.94922627e-02
  3.35523176e-02  1.78561075e-03 -2.55452318e-02  1.26646188e-02
  6.74324958e-03 -7.08477020e-03  4.07882986e-03 -1.02708876e-02
 -2.75127888e-02 -5.68112652e-02 -7.08124799e-04  2.07724144e-03
  1.82567078e-02  2.20722997e-04  2.97187595e-03  1.22966265e-04
  1.48693964e-03  1.84336208e-02 -3.87698520e-02 -2.74498413e-02
 -2.95290473e-02  4.37495728e-03  1.15116140e-02  1.42185857e-02
 -8.23449428e-03  4.13096098e-03 -2.39688909e-03 -1.98678265e-02
  3.99904797e-02 -1.04006428e-02 -5.57972617e-02  4.11081087e-02
  4.43423146e-02 -1.35426252e-03 -5.15351051e-03  1.57285051e-02
 -1.20205540e-02 -2.60733785e-02 -2.81163095e-03  2.28365483e-03
 -1.93302109e-02  1.65678426e-02  3.74063484e-02 -1.87688585e-02
  2.54239295e-02 -1.77900142e-02  5.40380363e-02 -1.67795505e-02
  3.19098205e-03 -9.26673290e-03 -4.00272591e-02  2.17351723e-02
  4.38258412e-03 -3.78806714e-03  1.63684306e-02  1.45238488e-02
 -5.18024216e-03  1.01972075e-03 -9.98198176e-03 -5.31648911e-03
  2.63043146e-02  9.03742450e-04 -4.86017585e-02  1.11844819e-02
  9.75883681e-03 -4.76343776e-02 -2.07799550e-02  2.02436085e-01
  4.25451938e-03  2.56033591e-02 -5.99078772e-03  7.89582782e-04
 -1.87663240e-02 -2.40852789e-02 -4.87108088e-03  2.89859375e-02
  8.72853624e-03 -9.16679702e-04 -3.08683314e-02 -2.44662371e-03
 -3.53021491e-02  8.82367356e-04  1.90539736e-02  2.39863039e-03
 -5.74545326e-03  1.95697821e-02  2.15914160e-02  4.13490283e-03
  1.63907415e-02  1.80384306e-02 -2.05531187e-02 -4.48813211e-03
 -2.93759731e-02 -5.13657759e-02  3.62131085e-03 -4.06020532e-33
  1.46901529e-02 -2.02319437e-02  1.68837858e-02  3.05769183e-02
  1.80746643e-02  3.10536509e-05 -3.68167105e-03 -5.10408330e-03
 -3.56893306e-02  2.48383191e-02 -1.72228530e-02  3.41105498e-02
 -1.59330687e-02  1.18115312e-02  8.57220536e-02 -1.18896773e-02
  5.56576945e-03  4.65242998e-02 -1.22937569e-02 -1.53397573e-03
 -1.96939562e-02  5.24001515e-03  5.70252634e-03  2.41261365e-02
  9.63023901e-03 -9.63475443e-03  6.54438307e-03 -2.45525557e-02
  9.58625801e-03  1.36083251e-02  8.28452974e-03  1.91312476e-02
 -2.73659214e-02 -8.29977811e-03  6.97071757e-04  1.00480722e-03
 -3.78282107e-03 -4.81415564e-02  2.18125586e-02 -1.98332371e-02
 -1.68691950e-02  3.25558463e-03 -1.19723849e-02  1.94207353e-04
  7.21894612e-03  3.44730207e-02  3.54784396e-02  9.70312021e-03
 -1.51016732e-03  2.33223791e-02 -1.29044730e-02  2.89312870e-03
 -3.31769752e-02  2.49599959e-04  6.79305709e-03  3.01167489e-03
  8.89117059e-03 -2.01355941e-03  7.03940736e-03  9.32653825e-03
  3.08612747e-02  3.46458467e-02 -3.26191836e-02 -5.77891864e-03
 -2.34233898e-02 -1.04021156e-02  3.04915661e-03 -1.97863246e-02
  4.94978571e-02 -2.81101971e-03 -6.13248689e-02 -6.85646597e-03
  6.02358545e-02  5.62747303e-03 -1.12206841e-02 -7.59376018e-03
 -6.66514107e-03  2.93883412e-03 -2.17103380e-02  5.74774990e-03
 -3.71816541e-02 -3.72145101e-03 -1.59479410e-02  1.41717694e-02
  2.95744695e-02  9.99640924e-03 -2.96165657e-03 -4.32624643e-02
  4.91097270e-03  2.11642567e-02 -5.71607702e-02  3.25151378e-03
  2.70941574e-02  2.16133780e-02 -1.89224663e-02  2.88350633e-33
 -3.69792587e-02 -4.98922607e-03 -3.70023621e-02  6.22493572e-02
  1.76704710e-02 -5.75242635e-03  6.77245642e-03 -2.42767185e-03
 -1.37212310e-02  2.67270038e-02 -1.25775456e-02 -1.51717925e-02
  3.23228542e-02  1.22732994e-02 -3.52241474e-03  9.74075666e-03
  3.37214431e-02 -2.81543189e-02 -1.80441325e-02  1.90210709e-02
 -2.73487191e-02  9.08499704e-03 -1.37371402e-02 -7.42911771e-03
 -2.16479607e-02  3.96396179e-02  4.62534828e-03 -2.67647238e-03
 -2.86337179e-02 -1.47487321e-03  1.51245258e-02 -3.87280608e-02
 -3.93675174e-02  1.66842419e-02 -2.20947672e-02  3.56565416e-02
  4.40483343e-02  1.58564806e-02 -9.99469189e-03  2.80527182e-03
  5.42742832e-02 -6.12355630e-03 -8.97401184e-03  8.06993975e-02
 -6.90731506e-03 -1.19521545e-02 -9.88372212e-03  1.59149255e-02
  1.64599819e-02  1.56360994e-02 -3.25912909e-02  1.92400204e-03
 -8.49426818e-03 -3.53182945e-02 -8.68754906e-03 -1.02839577e-03
 -3.59757735e-03 -8.97723562e-03  5.81941072e-03  1.00738085e-02
 -6.47004681e-03  1.55356709e-02 -1.57077868e-02  3.55306587e-02
 -1.37002809e-02  2.43332496e-03 -1.31995345e-02  1.18926814e-02
 -1.48786215e-02  2.47676554e-03  3.13234610e-02  1.11496061e-02
 -2.80894303e-02 -2.76066727e-03 -2.66387221e-02 -2.41910543e-02
 -4.36446750e-02 -1.30480510e-03 -1.36665491e-02  1.42151840e-03
 -3.09794305e-02 -1.94525182e-02 -1.52110545e-03  3.28469505e-02
 -1.77381864e-02 -2.92955740e-03  4.31549506e-02 -5.23140547e-06
  3.65934262e-03 -1.74642878e-02 -1.46300056e-02  2.87583959e-02
 -1.07642566e-02  2.14101328e-03 -1.54135364e-02 -1.27996742e-08
 -9.14876003e-03  1.58394429e-03  5.36033296e-03 -9.00273559e-04
  3.48777699e-02  1.45848735e-02 -9.73329329e-03  1.86397584e-02
  7.45307896e-03  3.56514298e-02  1.29911609e-02 -6.19968378e-03
  7.81117009e-04  3.67639967e-02  4.13347820e-02 -8.57144485e-03
 -1.66723546e-02  2.97959439e-02 -3.55714015e-02 -1.00390009e-02
  1.37738541e-02  1.90390718e-02  1.32181078e-02  1.20031768e-02
  4.33255553e-03 -4.67766485e-03  7.01136884e-03  5.49923784e-02
  6.41281543e-03  2.60930521e-02  9.61823661e-03  4.04755340e-02
  9.25095309e-03 -2.76170676e-02  8.37602310e-03 -1.35388532e-02
  4.42825743e-03  1.68026899e-04  3.65851133e-03 -3.27362257e-03
 -1.67074284e-02  3.33054826e-03  2.34288893e-02 -5.57348362e-03
 -3.92768656e-03 -4.51950479e-03 -3.92036408e-03 -2.02578427e-02
 -3.34805575e-03 -2.49751800e-02 -1.51918817e-02 -2.11322243e-04
  2.34896067e-02  3.64894373e-02  3.95227303e-02  1.38519432e-02
  1.30315640e-02 -1.16833121e-02 -3.37147073e-02  3.83615443e-02
  7.70273376e-02  9.57892533e-03  2.28771951e-02  2.03595578e-04].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.