## Import Libraries

In [1]:
import re
import os
import pandas as pd
import subprocess
import PyPDF2

import spacy
from spacy.pipeline import EntityRuler
from spacy import displacy

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Convert PDF to CSV

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The per-page results of ``extract_text()`` joined together with
        no separator.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must finish while the file handle is still open
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    return ''.join(page_texts)

# Folder that holds the PDF resumes; the generated CSV is written here as well
data_folder = 'data'

# Resume PDFs to process; IDs are assigned in this order, starting at 1
pdf_files = ['CV_Kanika.pdf', 'CV_Mythily.pdf', 'CV_Nikhil.pdf']

# Build each full path and extract its text in a single pass
pdf_paths = []
resumes_text = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(data_folder, pdf_file)
    pdf_paths.append(pdf_path)
    resumes_text.append(extract_text_from_pdf(pdf_path))

# One row per resume: a sequential 'ID' and the raw extracted 'resume_text'
data = pd.DataFrame({
    'ID': range(1, len(pdf_files) + 1),
    'resume_text': resumes_text,
})

# Persist the table next to the PDFs, without the DataFrame index
output_csv_path = os.path.join(data_folder, 'resumes.csv')
data.to_csv(output_csv_path, index=False)

print(f"Resumes saved to {output_csv_path}")

Resumes saved to data/resumes.csv


## Loading the Data and NER model

In [3]:
# Read the resumes back from the CSV written by the PDF-conversion cell
data = pd.read_csv(os.path.join('data', 'resumes.csv'))

## Download and Load NER model

In [4]:
def suppress_output(command):
    """Run ``command`` as a subprocess and discard its stdout and stderr.

    Args:
        command: The argument list passed straight to ``subprocess.run``.

    Returns:
        None. The exit status of the command is not inspected.
    """
    subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# Download the small English spaCy model without showing the CLI output
spacy_download_command = ['python', '-m', 'spacy', 'download', 'en_core_web_sm']
suppress_output(spacy_download_command)

In [5]:
# Run the spaCy CLI download for en_core_web_sm with its output shown
# (the previous cell runs the same command with the output suppressed)
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Load the small English spaCy pipeline downloaded above; later cells use
# `nlp` to attach the entity ruler and to process the resume text
nlp = spacy.load('en_core_web_sm')

## Entity Ruler

* An entity ruler pipeline is added to the spaCy model
* The ruler's labels and patterns (skills and candidate names) are defined inline as Python dictionaries

In [7]:
# Make the cell safe to re-run: add_pipe raises if a component with this name
# already exists, and re-adding patterns to it would register them twice
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")

# Add the entity ruler to the spaCy pipeline, placed before the statistical NER
entity_ruler = nlp.add_pipe("entity_ruler", before="ner")

# Token patterns: each inner dict describes exactly ONE token, so a
# multi-word name needs one dict per word. A single {"LOWER": "first last"}
# spec can never match because no token contains a space.
patterns = [
    {"label": "SKILL", "pattern": [{"LOWER": "matplotlib"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "python"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "pandas"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "seaborn"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "sql"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "mysql"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "kanika"}, {"LOWER": "kaushik"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "mythily"}, {"LOWER": "ramanathan"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "nikhil"}, {"LOWER": "thota"}]}
]

# Register the patterns with the entity ruler
entity_ruler.add_patterns(patterns)

In [8]:
# Display the DataFrame loaded from the CSV (ID and resume_text columns)
data

Unnamed: 0,ID,resume_text
0,1,\n \n \n \n \n \n \n \n \n \nCONTACT \nKanik...
1,2,\n \n \n \nSUMMARY Mythily Ramanathan \nDa...
2,3,"Nikhil\nThota \nSan\nJose,\nCA\n95126\n•\nnikh..."


## Text Cleaning

In [9]:
# Fetch the NLTK resources used below: the 'punkt' tokenizer models, the
# stop-word lists and the WordNet data used by the lemmatizer
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared WordNet lemmatizer instance used by clean_text
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize raw text into a space-separated string of cleaned tokens.

    The steps are, in order: remove URLs, remove ``<...>`` tag-like spans,
    remove every character that is neither a word character nor whitespace,
    lowercase, tokenize with NLTK's ``word_tokenize``, lemmatize each token
    with the module-level ``lemmatizer``, and drop English stop words.

    Args:
        text: The raw text. Any non-string value (for example the NaN that
            pandas produces when an empty CSV field is read back) is treated
            as empty text instead of raising a ``TypeError`` in ``re.sub``.

    Returns:
        str: The remaining tokens joined by single spaces ('' for empty or
        non-string input).
    """
    # A resume with no extractable text round-trips through the CSV as NaN
    if not isinstance(text, str):
        return ''

    # Remove hyperlinks, tag-like spans, then punctuation / special characters
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    # \s already covers newlines, so they are preserved for the tokenizer
    text = re.sub(r'[^\w\s]', '', text)

    # Convert the text to lowercase
    text = text.lower()

    # Tokenize the text using nltk's word_tokenize
    words = word_tokenize(text)

    # Lemmatize each token to its base form for normalization
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Remove English stop words (checked after lemmatization)
    stop_words = set(stopwords.words('english'))
    filtered_words = ' '.join(word for word in lemmatized_words if word not in stop_words)

    return filtered_words

# Add a 'cleaned_resume' column holding the normalized form of each resume
data['cleaned_resume'] = data['resume_text'].map(clean_text)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Visualizing Named Entities in Text with spaCy

In [10]:
# Entity types to highlight, each mapped to its display colour
entity_colors = {'PERSON': 'orange', 'GPE': 'lightgreen', 'SKILL': 'lightblue'}
options = {'ents': list(entity_colors), 'colors': entity_colors}

# Run each cleaned resume through the pipeline and render its entities inline
for doc in map(nlp, data['cleaned_resume']):
    displacy.render(doc, style="ent", jupyter=True, options=options)
    print('\n\n')
















## Match Score

In [11]:
# Free-text description of the role that every resume is scored against
company_requirements = """Data Analyst with experience using Python for data cleaning, data analysis, exploratory data analysis (EDA).
                          We are also looking for someone with the ability to explain complex mathematical concepts to non-mathematicians."""

# Normalize the requirements with the same cleaning pipeline as the resumes
cleaned_company_requirements = clean_text(company_requirements)

# Fit TF-IDF on the resumes only, then project the requirements into that
# vocabulary (requirement terms that appear in no resume are ignored)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_resume'])
company_tfidf = tfidf_vectorizer.transform([cleaned_company_requirements])

# Cosine similarity between the company requirements and each resume
similarity_scores = cosine_similarity(company_tfidf, tfidf_matrix).flatten()

# Row positions of the resumes, ordered from most to least similar
sorted_indices = similarity_scores.argsort()[::-1]

# Display the top_n most similar resumes. Slicing caps the loop at the number
# of resumes available, and .iloc is used because argsort yields positions,
# not index labels.
top_n = 3
for index in sorted_indices[:top_n]:
    print(f"Resume ID: {data['ID'].iloc[index]}")
    print(f"Similarity Score: {similarity_scores[index]}")
    print()

Resume ID: 2
Similarity Score: 0.6011826053120937

Resume ID: 3
Similarity Score: 0.33784127053908547

Resume ID: 1
Similarity Score: 0.07674889561956494



## Skill Extractor Function

In [12]:
def calculate_similarity(resume_text, required_skills):
    """Score how well the skills found in a resume match the required skills.

    The resume is processed with the module-level spaCy pipeline ``nlp`` and
    every entity labelled ``SKILL`` is collected. Skills are compared as
    lowercase, de-duplicated sets, so mentioning one skill many times cannot
    inflate the score.

    Args:
        resume_text: The resume text to analyse.
        required_skills: Iterable of required skill names (case-insensitive).

    Returns:
        float: ``|found & required| / max(|required|, |found|)``, in the
        range 0.0 to 1.0. Returns 0.0 when there are neither required nor
        found skills, instead of dividing by zero.
    """
    # Process the resume text with the spaCy model
    doc = nlp(resume_text)

    # Distinct skills found in the resume
    found_skills = {ent.text.lower() for ent in doc.ents if ent.label_ == "SKILL"}

    # Distinct required skills, lowercased to match the extracted form
    wanted_skills = {skill.lower() for skill in required_skills}

    denominator = max(len(wanted_skills), len(found_skills))
    if denominator == 0:
        return 0.0

    return len(found_skills & wanted_skills) / denominator

In [13]:
# The required skills are the same for every resume, so define them once
required_skills = ["matplotlib", "python", "pandas", "seaborn", "sql", "mysql"]

# Walk the ID and cleaned-text columns together and print each skill score
for resume_id, resume_text in zip(data['ID'], data['cleaned_resume']):
    print(f"Resume ID: {resume_id}")
    similarity_score = calculate_similarity(resume_text, required_skills)
    print("Similarity Score:", similarity_score)

Resume ID: 1
Similarity Score: 0.16666666666666666
Resume ID: 2
Similarity Score: 1.0
Resume ID: 3
Similarity Score: 1.0
