In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/resume-and-job-description/training_data.csv
/kaggle/input/resume-and-job-description/Resume.csv
/kaggle/input/resume-dataset/Resume/Resume.csv
/kaggle/input/resume-dataset/data/data/DESIGNER/22506245.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/13998435.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/67582956.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/34349255.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/26790545.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/12674307.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/11807040.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/44145704.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/27497542.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/11155153.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/17555081.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/41506705.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/54201930.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/2

In [2]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from pypdf import PdfReader
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
import re
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = "".join(page.extract_text() for page in reader.pages)
    return text

def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    sentences = sent_tokenize(text)
    features = {'feature': ""}
    stop_words = set(stopwords.words("english"))
    for sent in sentences:
        if any(criteria in sent for criteria in ['skills', 'education']):
            words = word_tokenize(sent)
            words = [word for word in words if word not in stop_words]
            tagged_words = pos_tag(words)
            filtered_words = [word for word, tag in tagged_words if tag not in ['DT', 'IN', 'TO', 'PRP', 'WP']]
            features['feature'] += " ".join(filtered_words)
    return features

def process_resume_data(df):
    id = df['ID']
    category = df['Category']
    text = extract_text_from_pdf(f"/kaggle/input/resume-dataset/data/data/{category}/{id}.pdf")
    features = preprocess_text(text)
    df['Feature'] = features['feature']
    return df

def get_embeddings(text, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(str(text), return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().to("cpu").numpy()
    return embeddings

def print_top_matching_resumes(result_group):
    for i in range(15):
        print("\nJob ID:", i)
        print("Cosine Similarity | Domain Resume | Domain Description")
        print(result_group.get_group(i)[['similarity', 'domainResume', 'domainDesc']])

def main():
    resume_data = pd.read_csv("/kaggle/input/resume-dataset/Resume/Resume.csv")
    resume_data = resume_data.drop(["Resume_html"], axis=1)
    resume_data = resume_data.apply(process_resume_data, axis=1)
    resume_data = resume_data.drop(columns=['Resume_str'])
    resume_data.to_csv("/kaggle/working/resume_data.csv", index=False)

    job_description = pd.read_csv("/kaggle/input/resume-and-job-description/training_data.csv")
    job_description = job_description[["job_description", "position_title"]][:15]
    job_description['Features'] = job_description['job_description'].apply(lambda x : preprocess_text(x)['feature'])

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)

    job_desc_embeddings = np.array([get_embeddings(desc, model_name) for desc in job_description['Features']]).squeeze()
    resume_embeddings = np.array([get_embeddings(text, model_name) for text in resume_data['Feature']]).squeeze()

    result_df = pd.DataFrame(columns=['jobId', 'resumeId', 'similarity', 'domainResume', 'domainDesc'])

    for i, job_desc_emb in enumerate(job_desc_embeddings):
        similarities = cosine_similarity([job_desc_emb], resume_embeddings)
        top_k_indices = np.argsort(similarities[0])[::-1][:5]
        for j in top_k_indices:
            result_df.loc[i+j] = [i, resume_data['ID'].iloc[j], similarities[0][j], resume_data['Category'].iloc[j], job_description['position_title'].iloc[i]]

    result_df = result_df.sort_values(by='similarity', ascending=False)
    result_group = result_df.groupby("jobId")
    print_top_matching_resumes(result_group)

if __name__ == "__main__":
    main()



Job ID: 0
Cosine Similarity | Domain Resume | Domain Description
      similarity            domainResume        domainDesc
629     0.938355    BUSINESS-DEVELOPMENT  Sales Specialist
299     0.930880  INFORMATION-TECHNOLOGY  Sales Specialist
577     0.930776    BUSINESS-DEVELOPMENT  Sales Specialist
1235    0.930246           DIGITAL-MEDIA  Sales Specialist
1045    0.929638                   SALES  Sales Specialist

Job ID: 1
Cosine Similarity | Domain Resume | Domain Description
      similarity   domainResume                  domainDesc
2311    0.908976           ARTS  Apple Solutions Consultant
1048    0.903436          SALES  Apple Solutions Consultant
2151    0.901145        BANKING  Apple Solutions Consultant
1300    0.899512  DIGITAL-MEDIA  Apple Solutions Consultant
1227    0.893605  DIGITAL-MEDIA  Apple Solutions Consultant

Job ID: 2
Cosine Similarity | Domain Resume | Domain Description
      similarity   domainResume                                 domainDesc
2289    0.956