In [1]:
#Import the necessary libraries
import re
import string
import nltk
from nltk.corpus import stopwords

In [5]:
# Load the English stopword list. The corpus is only downloaded when it is not
# already installed, so re-running the notebook does not call the downloader
# every time.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    # NLTK raises LookupError when the 'stopwords' resource cannot be found locally
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Normalize text: lowercase, drop special characters, collapse whitespace, remove stopwords
def clean_text(text):
    """Return a normalized copy of ``text``.

    Steps, in order:
      1. lowercase the input;
      2. replace every character that is not a-z, 0-9 or whitespace with a space;
      3. collapse whitespace runs to a single space and trim both ends;
      4. drop every token that appears in the module-level ``STOPWORDS`` set.

    text: a string
    returns: the remaining tokens joined by single spaces
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', alnum_only).strip()
    kept_tokens = filter(lambda token: token not in STOPWORDS, collapsed.split())
    return ' '.join(kept_tokens)

In [7]:
def preprocess_resume_corpus(resume_texts):
    """
    Apply clean_text to every resume, preserving input order.

    resume_texts: list of resume strings
    returns: list of cleaned resume strings
    """
    return list(map(clean_text, resume_texts))


In [11]:
def preprocess_jd_corpus(jd_df):
    """
    Build one cleaned text per job description row.

    For every row the 'Job Description' and 'skills' columns are joined with a
    single space (missing values are treated as empty strings) and the result
    is passed through clean_text.

    jd_df: pandas DataFrame containing the job descriptions and skills columns
    returns: list of cleaned JD strings
    """
    descriptions = jd_df['Job Description'].fillna('')
    skill_text = jd_df['skills'].fillna('')
    merged_rows = (descriptions + ' ' + skill_text).tolist()
    return list(map(clean_text, merged_rows))

In [8]:
# Location of the plain-text file holding every resume
file_path = "./resume_corpus.txt"

# Pull the whole file into memory as one string
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Resumes are delimited by a dashed rule: trim each piece and keep the non-empty ones
resume_corpus = [
    piece
    for piece in (raw.strip() for raw in text.split('----------------------------------------'))
    if piece
]

print(f"Loaded {len(resume_corpus)} resumes.")
print(resume_corpus[0][:500])  # Print part of the first resume


Loaded 962 resumes.
--- Resume 1 ---
Skills:
* Machine learning, Deep learning, scikit-learn, JavaScript/JQuery, SqlServer



INFORMATION GOVERNANCE

Organizations, Python, Java, HTML, JavaScript, Python Flask, JAVASCRIPT-, JavaScript/JQuery

Experience Sentences:
MULTIPLE DATA SCIENCE AND ANALYTIC PROJECTS (USA CLIENTS)

TEXT ANALYTICS - MOTOR VEHICLE CUSTOMER REVIEW DATA * Received customer feedback survey data for past one year.


In [12]:
import pandas as pd
# Read the job-description table from disk
jd_df = pd.read_csv('job_descriptions.csv')

# Run both corpora through the cleaning helpers defined above
cleaned_resumes = preprocess_resume_corpus(resume_corpus)
cleaned_jds = preprocess_jd_corpus(jd_df)

# Show the first cleaned resume, then the first cleaned JD, one per line
print(cleaned_resumes[0], cleaned_jds[0], sep='\n')

resume 1 skills machine learning deep learning scikit learn javascript jquery sqlserver information governance organizations python java html javascript python flask javascript javascript jquery experience sentences multiple data science analytic projects usa clients text analytics motor vehicle customer review data received customer feedback survey data past one year
social media managers oversee organizations social media presence create schedule content engage followers analyze social media metrics drive brand awareness engagement social media platforms e g facebook twitter instagram content creation scheduling social media analytics insights community engagement paid social advertising


<h1>Vectorize Texts using TF-IDF</h1>

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Resumes first, then JDs: the row order of the TF-IDF matrix follows this list
combined_corpus = [*cleaned_resumes, *cleaned_jds]

In [14]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # you can tune max_features

# Fit one shared vocabulary on resumes + JDs and transform both in a single pass
tfidf_matrix = vectorizer.fit_transform(combined_corpus)

# Split back into resumes and JDs.
# The boundary is taken from cleaned_resumes, the list that was actually
# vectorized, rather than from resume_corpus, which another cell may rebind.
n_resume_rows = len(cleaned_resumes)
resume_vectors = tfidf_matrix[:n_resume_rows]
jd_vectors = tfidf_matrix[n_resume_rows:]

# Guard against a stale combined_corpus left over from out-of-order execution
assert jd_vectors.shape[0] == len(cleaned_jds), "combined_corpus is out of sync with cleaned_resumes/cleaned_jds"

print("Resume Vectors Shape:", resume_vectors.shape)
print("JD Vectors Shape:", jd_vectors.shape)

Resume Vectors Shape: (962, 3846)
JD Vectors Shape: (1615940, 3846)


<h1>Compute Cosine Similarity</h1>

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

batch_size = 5000  # How many JDs to process at once (bounds the dense similarity block)
top_k = 5          # How many top JDs you want per resume

n_resumes = resume_vectors.shape[0]
n_jds = jd_vectors.shape[0]

# top_matches[r] holds up to top_k (jd_index, score) pairs for resume r, best first
top_matches = [[] for _ in range(n_resumes)]

for i in range(0, n_jds, batch_size):
    jd_batch = jd_vectors[i:i+batch_size]
    print(f"Processing JD batch {i} to {i + jd_batch.shape[0]}...")

    batch_similarity = cosine_similarity(resume_vectors, jd_batch)  # (n_resumes, len(jd_batch))

    # Rank every row of the batch in one vectorized call instead of building and
    # sorting a Python list of (index, score) tuples per resume. The sort is
    # stable on the negated scores, so ties keep the lower JD index first.
    batch_top_cols = np.argsort(-batch_similarity, axis=1, kind='stable')[:, :top_k]
    batch_top_scores = np.take_along_axis(batch_similarity, batch_top_cols, axis=1)
    batch_top_jds = batch_top_cols + i  # column within batch -> global JD index

    for resume_idx in range(n_resumes):
        # Only the batch's own top_k can enter the running top_k, so merging
        # at most 2 * top_k candidates gives the same result as sorting everything.
        candidates = top_matches[resume_idx] + list(
            zip(batch_top_jds[resume_idx], batch_top_scores[resume_idx])
        )

        # sorted() is stable, so earlier (lower-index) JDs still win ties
        top_matches[resume_idx] = sorted(candidates, key=lambda x: x[1], reverse=True)[:top_k]

print("Top matches computed.")

# Example of top matches
for idx, matches in enumerate(top_matches[:5]):
    print(f"\nResume {idx+1} top {top_k} matches:")
    for jd_idx, score in matches:
        print(f"   JD {jd_idx} with similarity score {score:.4f}")


Processing JD batch 0 to 5000...
Processing JD batch 5000 to 10000...
Processing JD batch 10000 to 15000...
Processing JD batch 15000 to 20000...
Processing JD batch 20000 to 25000...
Processing JD batch 25000 to 30000...
Processing JD batch 30000 to 35000...
Processing JD batch 35000 to 40000...
Processing JD batch 40000 to 45000...
Processing JD batch 45000 to 50000...
Processing JD batch 50000 to 55000...
Processing JD batch 55000 to 60000...
Processing JD batch 60000 to 65000...
Processing JD batch 65000 to 70000...
Processing JD batch 70000 to 75000...
Processing JD batch 75000 to 80000...
Processing JD batch 80000 to 85000...
Processing JD batch 85000 to 90000...
Processing JD batch 90000 to 95000...
Processing JD batch 95000 to 100000...
Processing JD batch 100000 to 105000...
Processing JD batch 105000 to 110000...
Processing JD batch 110000 to 115000...
Processing JD batch 115000 to 120000...
Processing JD batch 120000 to 125000...
Processing JD batch 125000 to 130000...
Proce

In [20]:
import json

# Build a JSON-friendly copy of top_matches: int()/float() coerce each stored
# index and score to built-in Python types before serialization
serializable_matches = [
    {
        "resume_index": resume_idx,
        "top_matches": [
            {"jd_index": int(jd_idx), "similarity_score": float(score)}
            for jd_idx, score in matches
        ],
    }
    for resume_idx, matches in enumerate(top_matches)
]

# Destination file for the exported matches
output_path = "top_matches.json"

with open(output_path, "w") as f:
    f.write(json.dumps(serializable_matches, indent=4))

print(f"Top matches saved to {output_path}")

Top matches saved to top_matches.json


<h1>Rank the matches</h1>

In [22]:
# Step 1: Rank JDs for each resume by similarity score
# Maps resume index -> that resume's matches ordered from highest to lowest score
ranked_matches = {
    entry["resume_index"]: sorted(
        entry["top_matches"],
        key=lambda candidate: candidate["similarity_score"],
        reverse=True,
    )
    for entry in serializable_matches
}

# Example: Display the ranked matches for a specific resume (e.g., Resume 1)
print("\nRanked matches for Resume 1:")
for match in ranked_matches[0]:
    print(f"JD {match['jd_index']} with similarity score {match['similarity_score']:.4f}")


Ranked matches for Resume 1:
JD 328 with similarity score 0.1802
JD 1193 with similarity score 0.1802
JD 2157 with similarity score 0.1802
JD 2226 with similarity score 0.1802
JD 2539 with similarity score 0.1802


In [23]:
# Step 2: Match the best JD (top-ranked match) to each resume
# Each ranked list is ordered best-first, so its first element is the best JD;
# only the index and score fields are copied into the result.
resume_to_best_jd = {
    resume_idx: {
        field: matches[0][field]
        for field in ("jd_index", "similarity_score")
    }
    for resume_idx, matches in ranked_matches.items()
}

# Example: Display the best match for a specific resume (e.g., Resume 1)
first_best = resume_to_best_jd[0]
print(f"\nBest JD for Resume 1: JD {first_best['jd_index']} with similarity score {first_best['similarity_score']:.4f}")


Best JD for Resume 1: JD 328 with similarity score 0.1802


In [None]:
import spacy
import pandas as pd

# spaCy pipeline used for named-entity recognition
nlp = spacy.load('en_core_web_sm')

# Plain-text file containing all resumes
resume_file_path = "./resume_corpus.txt"

# Read it in one go
with open(resume_file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Cut on the dashed separator, trim each piece, and discard the empty ones
resume_corpus = list(
    filter(None, map(str.strip, text.split('----------------------------------------')))
)

print(f"Loaded {len(resume_corpus)} resumes.")
print(resume_corpus[0][:500])  # Print part of the first resume

# Job descriptions table
jd_df = pd.read_csv('job_descriptions.csv')

# Function to extract skills using NER (via spaCy)
def extract_skills_with_ner(text):
    """Return the lowercase text of entities treated as skill candidates.

    The text is run through the module-level ``nlp`` pipeline and every entity
    labelled ORG, PRODUCT or WORK_OF_ART is kept. These labels are only a
    proxy for skills; adjust the tuple below as needed.

    text: string to analyse
    returns: set of lowercase entity strings
    """
    skill_like_labels = ('ORG', 'PRODUCT', 'WORK_OF_ART')
    return {
        entity.text.lower()
        for entity in nlp(text).ents
        if entity.label_ in skill_like_labels
    }

# Perform Skill Gap Analysis for each resume and JD
# NOTE(review): this is a full cross product (every resume against every JD row).
# With the corpus sizes printed earlier in this notebook that is on the order of
# 10^9 pairs. Consider restricting it to each resume's ranked matches. TODO confirm intent.

# Parse every JD's skill list once, up front, instead of once per (resume, JD) pair.
# Missing values become an empty set (calling .lower() on NaN would raise), and
# each skill is trimmed so " python" from "sql, python" compares equal to "python".
jd_skill_sets = [
    (
        jd_idx,
        {skill.strip() for skill in str(raw_skills).lower().split(',') if skill.strip()},
    )
    for jd_idx, raw_skills in jd_df['skills'].fillna('').items()
]

gap_analysis_results = []

for resume_idx, resume_text in enumerate(resume_corpus):
    # Extract skills from the resume (one NER pass per resume)
    resume_skills = extract_skills_with_ner(resume_text)

    for jd_idx, jd_skills in jd_skill_sets:
        # Store the results for this resume-JD pair
        gap_analysis_results.append({
            "resume_index": resume_idx,
            "jd_index": jd_idx,
            # Skills in the JD but not in the resume; sorted because set
            # iteration order is not stable across runs
            "missing_skills": sorted(jd_skills - resume_skills)
        })

# Display the results (for example, for the first resume and JD)
for result in gap_analysis_results[:5]:  # Displaying the first 5 results
    print(f"\nResume {result['resume_index'] + 1} vs JD {result['jd_index'] + 1}:")
    print(f"Missing Skills: {', '.join(result['missing_skills'])}")

Loaded 962 resumes.
--- Resume 1 ---
Skills:
* Machine learning, Deep learning, scikit-learn, JavaScript/JQuery, SqlServer



INFORMATION GOVERNANCE

Organizations, Python, Java, HTML, JavaScript, Python Flask, JAVASCRIPT-, JavaScript/JQuery

Experience Sentences:
MULTIPLE DATA SCIENCE AND ANALYTIC PROJECTS (USA CLIENTS)

TEXT ANALYTICS - MOTOR VEHICLE CUSTOMER REVIEW DATA * Received customer feedback survey data for past one year.
