In [1]:
import warnings

# Install a blanket "ignore" filter so warnings raised by later cells are not
# shown in the cell output.
warnings.filterwarnings(action="ignore")



In [2]:
import pandas as pd

# Load the job postings table. Later cells read its 'keywords', 'Company'
# and 'Job Title' columns.
df = pd.read_csv(filepath_or_buffer="data_science_job_cleaned.csv")

### Extracting resume words that are similar to keywords present in the job description

In [3]:
import PyPDF2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the tokenizer model and the stopword list used by
# extract_keywords_from_pdf (word_tokenize / stopwords.words).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def extract_keywords_from_pdf(pdf_path):
    """Count the alphabetic, non-stopword tokens found in a PDF's text.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF file to read; opened in binary mode.

    Returns
    -------
    dict
        Maps each lower-cased token to the number of times it occurs.
        Keys appear in the order the tokens are first seen in the text.

    Notes
    -----
    Tokens are kept only if ``str.isalpha()`` is true for them and their
    lower-cased form is not in NLTK's English stopword list.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against a falsy extraction result so one page with no
            # extractable text cannot break the rest of the document.
            page_texts.append(page.extract_text() or "")

    # Join with a newline rather than concatenating directly: otherwise the
    # last word of one page can fuse with the first word of the next and
    # produce a bogus token.
    text = "\n".join(page_texts)

    stop_words = set(stopwords.words('english'))
    words = [
        token.lower()
        for token in word_tokenize(text)
        if token.isalpha() and token.lower() not in stop_words
    ]

    # Single-pass tally; calling list.count() once per word is quadratic in
    # the number of tokens.
    keyword_count = {}
    for word in words:
        keyword_count[word] = keyword_count.get(word, 0) + 1

    return keyword_count

# Example usage: count the keywords in one resume and list them from most to
# least frequent (ties keep the order in which the words first appeared).
pdf_path = 'Pranjal_Rane_Resume.pdf'
user_keyword_count = dict(
    sorted(
        extract_keywords_from_pdf(pdf_path).items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
print("Keywords:", user_keyword_count)


Keywords: {'data': 13, 'cloud': 6, 'accuracy': 6, 'model': 6, 'using': 5, 'like': 5, 'face': 5, 'stock': 5, 'learning': 5, 'science': 4, 'techniques': 4, 'management': 4, 'implementing': 4, 'technologies': 4, 'deep': 4, 'achieving': 4, 'india': 3, 'rag': 3, 'application': 3, 'deployment': 3, 'api': 3, 'databricks': 3, 'enhancing': 3, 'user': 3, 'detection': 3, 'architecture': 3, 'credit': 3, 'classification': 3, 'sql': 3, 'programming': 3, 'design': 3, 'testing': 3, 'dynamodb': 3, 'aws': 3, 'computer': 2, 'gpa': 2, 'database': 2, 'technology': 2, 'pune': 2, 'engineering': 2, 'ai': 2, 'retrieval': 2, 'generation': 2, 'llama': 2, 'gpt': 2, 'chat': 2, 'bot': 2, 'processing': 2, 'time': 2, 'efficiency': 2, 'analysis': 2, 'models': 2, 'accessibility': 2, 'ml': 2, 'services': 2, 'retriever': 2, 'coverage': 2, 'designing': 2, 'hugging': 2, 'price': 2, 'prediction': 2, 'individuals': 2, 'tool': 2, 'classifier': 2, 'dataset': 2, 'employing': 2, 'portfolio': 2, 'mvc': 2, 'java': 2, 'scalable': 2

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pranjalrane/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pranjalrane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Flatten the first job posting's comma-separated 'keywords' cell into a list.
# Despite its name, `keywords_set` is a plain list (duplicates are kept), and
# the pieces are not whitespace-stripped after the split.
keywords_set = list(
    df['keywords']
    .iloc[0:1]
    .str.split(',')
    .apply(pd.Series)
    .stack()
    .reset_index(drop=True)
)
print(len(keywords_set))

6


In [5]:
import numpy as np
from Levenshtein import distance
from collections import Counter

def is_similar(word1, word2, threshold=3):
    """Tell whether two strings are within `threshold` edits of each other.

    Returns True when the Levenshtein distance between `word1` and `word2`
    is less than or equal to `threshold`, otherwise False.
    """
    edit_distance = distance(word1, word2)
    return edit_distance <= threshold


def find_matching_keywords_with_counts(job_keywords, resume_keywords, similarity_threshold=2):
    """Count, for each item of `job_keywords`, how many `resume_keywords` resemble it.

    Every (job keyword, resume keyword) pair is compared with `is_similar`
    using `similarity_threshold`; each similar pair adds one to the tally of
    the job keyword. There is no early exit after the first hit, so a keyword
    can score more than 1.

    Returns a plain dict mapping each matched item of `job_keywords` to its
    number of similar resume keywords; items with no match are absent.
    """
    tally = Counter(
        job_keyword
        for job_keyword in job_keywords
        for resume_keyword in resume_keywords
        if is_similar(job_keyword, resume_keyword, similarity_threshold)
    )
    return dict(tally)


# NOTE(review): the resume word counts fill the `job_keywords` slot and the
# job posting keywords fill `resume_keywords`, so the result is keyed by
# resume words -- confirm this argument order is intended.
matching_dict = find_matching_keywords_with_counts(
    job_keywords=user_keyword_count,
    resume_keywords=keywords_set,
)
matching_dict

{'rag': 1,
 'sql': 1,
 'aws': 1,
 'ai': 1,
 'statistics': 1,
 'sep': 1,
 'may': 1,
 'jan': 1,
 'mask': 1,
 'ds': 1,
 'css': 1,
 'rds': 1,
 'iam': 1,
 'mathematical': 1}

### Creating Candidate & Job Embeddings & Recommendation module

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def recommend_jobs(candidate_data, jobs_data, vectorizer_type='tfidf', num_recommendations=3):
    """Recommend the most similar jobs for every candidate.

    Job keyword strings and candidate skill strings are vectorised with the
    same vectorizer (fitted on the jobs), compared by cosine similarity, and
    the highest-scoring jobs are returned per candidate.

    Parameters
    ----------
    candidate_data : pandas.DataFrame
        Must contain 'Candidate_ID' and 'Skills' columns; no other column is
        read here.
    jobs_data : pandas.DataFrame
        Must contain 'keywords', 'Company' and 'Job Title' columns.
    vectorizer_type : str, default 'tfidf'
        'count' selects CountVectorizer, 'hashing' selects
        HashingVectorizer(n_features=1000, norm=None); any other value
        falls back to TfidfVectorizer.
    num_recommendations : int, default 3
        Maximum number of jobs returned per candidate.

    Returns
    -------
    pandas.DataFrame
        Columns 'Candidate_ID', 'Company', 'Job_Title', 'Similarity', 'Rank',
        one row per recommendation, ordered by descending similarity within
        each candidate. The frame is empty when there are no candidates, no
        jobs, or `num_recommendations` is not positive.
    """
    result_columns = ['Candidate_ID', 'Company', 'Job_Title', 'Similarity', 'Rank']

    # Nothing to recommend: return an empty, correctly-shaped frame up front.
    # Without this guard an empty candidate set fails further down, and
    # num_recommendations == 0 makes the `-0:` slice select every job.
    if num_recommendations <= 0 or len(candidate_data) == 0 or len(jobs_data) == 0:
        return pd.DataFrame(columns=result_columns)

    if vectorizer_type == 'count':
        vectorizer = CountVectorizer()
    elif vectorizer_type == 'hashing':
        vectorizer = HashingVectorizer(n_features=1000, norm=None)
    else:
        vectorizer = TfidfVectorizer()

    # Fit on the jobs so that candidates are projected into the job vocabulary.
    job_embeddings = vectorizer.fit_transform(jobs_data['keywords'])
    candidate_embeddings = vectorizer.transform(candidate_data['Skills'])

    # Rows are candidates, columns are jobs.
    similarity_scores = cosine_similarity(candidate_embeddings, job_embeddings)

    # argsort is ascending: keep the last `num_recommendations` columns and
    # reverse them so the best match comes first.
    top_jobs_indices = similarity_scores.argsort(axis=1)[:, -num_recommendations:][:, ::-1]

    companies = jobs_data['Company'].to_numpy()
    job_titles = jobs_data['Job Title'].to_numpy()

    # Collect plain records and build the frame once instead of creating and
    # concatenating a one-row DataFrame per recommendation.
    records = [
        {
            'Candidate_ID': candidate_id,
            'Company': companies[job_index],
            'Job_Title': job_titles[job_index],
            'Similarity': similarity_scores[row, job_index],
        }
        for row, candidate_id in enumerate(candidate_data['Candidate_ID'])
        for job_index in top_jobs_indices[row]
    ]
    recommendations_df = pd.DataFrame(records, columns=result_columns[:-1])

    # method='max' gives tied rows the largest rank in the tie (e.g. 1, 3, 3).
    # NOTE(review): 'min' or 'first' may read more naturally; kept as-is so
    # existing results do not change.
    recommendations_df['Rank'] = recommendations_df.groupby('Candidate_ID')['Similarity'].rank(
        ascending=False, method='max'
    )

    # Only has an effect when a Candidate_ID occurs more than once in the input.
    top_recommendations_df = recommendations_df[recommendations_df['Rank'] <= num_recommendations]

    return top_recommendations_df


In [7]:
# Candidate 1: join the distinct resume keywords into one comma-separated
# string (iterating the count dict yields its keys).
pdf_path_candidate_1 = 'Pranjal_Rane_Resume.pdf'
candidate_1_keywords = ', '.join(extract_keywords_from_pdf(pdf_path_candidate_1))

In [8]:
# Candidate 2: join the distinct resume keywords into one comma-separated
# string (iterating the count dict yields its keys).
pdf_path_candidate_2 = 'Harhsit_Pandey_Resume.pdf'
candidate_2_keywords = ', '.join(extract_keywords_from_pdf(pdf_path_candidate_2))

In [9]:
# One row per candidate. recommend_jobs reads only 'Candidate_ID' and
# 'Skills'; the remaining preference columns are carried along unused by
# that call.
candidates_data = pd.DataFrame(
    data={
        'Candidate_ID': [1, 2],
        'Skills': [candidate_1_keywords, candidate_2_keywords],
        'Wants_Remote': [True, False],
        'Experience_Level': ['entry', 'none'],
        'Salary_Range': [50.0, 40.0],
        'Facilities': ['none', 'career development'],
        'Job_Type': ['full time', 'full time'],
    }
)

recommendations = recommend_jobs(
    candidate_data=candidates_data,
    jobs_data=df,
    vectorizer_type='count',
)
recommendations

Unnamed: 0,Candidate_ID,Company,Job_Title,Similarity,Rank
0,1,atlassian,machine learning engineer,0.408248,1.0
1,1,docugami,ml engineer,0.387298,3.0
2,1,ethos,machine learning engineer,0.387298,3.0
3,2,atlassian,machine learning engineer,0.40161,1.0
4,2,plantix,senior geospatial machine learning engineer (f...,0.381,3.0
5,2,angi,"senior product manager, large language model",0.381,3.0
