In [1]:
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
df_resumes = pd.read_csv(r"D:\Internships\Pantech\projects\resume-screening\ResumeDataSet.csv")
df_jobs = pd.read_csv(r"D:\Internships\Pantech\projects\resume-screening\jd.csv")

In [4]:
df_resumes.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [5]:
df_jobs.head()

Unnamed: 0,title,desc,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70
0,web developer,a leading ecommerce agency is looking to hire ...,,,,,,,,,...,,,,,,,,,,
1,software engineer,"software engineer c, c++, java, uml, xamp, agi...",,,,,,,,,...,,,,,,,,,,
2,user experience developer,user experience developer our client requires ...,,,,,,,,,...,,,,,,,,,,
3,web developer,web developer our client is looking for experi...,,,,,,,,,...,,,,,,,,,,
4,c software developers,c developer belfast salary up to ****k pa our ...,,,,,,,,,...,,,,,,,,,,


In [6]:
df_jobs = df_jobs[["title", "desc"]]
df_jobs.head()

Unnamed: 0,title,desc
0,web developer,a leading ecommerce agency is looking to hire ...
1,software engineer,"software engineer c, c++, java, uml, xamp, agi..."
2,user experience developer,user experience developer our client requires ...
3,web developer,web developer our client is looking for experi...
4,c software developers,c developer belfast salary up to ****k pa our ...


In [7]:
print(df_resumes.isnull().sum())
print(df_jobs.isnull().sum())

Category    0
Resume      0
dtype: int64
title    0
desc     0
dtype: int64


In [8]:
resume_category_counts = df_resumes['Category'].value_counts().to_dict()
job_title_counts = df_jobs['title'].value_counts().to_dict()

print("Resume:", resume_category_counts)
print("Len:", len(resume_category_counts))

print("\nJob Titles:", job_title_counts)
print("Len", len(job_title_counts))

Resume: {'Java Developer': 84, 'Testing': 70, 'DevOps Engineer': 55, 'Python Developer': 48, 'Web Designing': 45, 'HR': 44, 'Hadoop': 42, 'Blockchain': 40, 'ETL Developer': 40, 'Operations Manager': 40, 'Data Science': 40, 'Sales': 40, 'Mechanical Engineer': 40, 'Arts': 36, 'Database': 33, 'Electrical Engineering': 30, 'Health and fitness': 30, 'PMO': 30, 'Business Analyst': 28, 'DotNet Developer': 28, 'Automation Testing': 26, 'Network Security Engineer': 25, 'SAP Developer': 24, 'Civil Engineer': 24, 'Advocate': 20}
Len: 25

Job Titles: {'web developer': 435, 'software engineer': 310, 'software developer': 263, 'embedded software engineer': 186, 'php web developer': 104, 'senior software developer': 70, 'c++ software engineer': 68, 'systems developer': 62, 'software tester': 56, 'application developer': 52, 'software development manager': 46, 'aspnet web developer': 36, 'graduate software developer': 34, 'software development engineer': 32, 'frontend web developer': 26, 'web applicat

In [9]:
def preprocess_text(text):
    text = str(text)  # Ensure input is a string
    text = text.lower().strip()  # Convert to lowercase and strip spaces
    text = re.sub(r'\r\n|\n|\r', ' ', text)  # Remove newlines (\r\n, \n, \r)
    text = re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', '', text)  # Remove dates (DD/MM/YYYY, MM-DD-YYYY)
    text = re.sub(r'\b\d{4}\b', '', text)  # Remove standalone years (e.g., 2023)
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters except spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [10]:
df_resumes['cleaned_resume'] = df_resumes['Resume'].apply(preprocess_text)
df_jobs['cleaned_description'] = df_jobs['desc'].apply(preprocess_text)

In [11]:
def clean_and_tokenize(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    doc = nlp(text)  # Process text with Spacy
    tokens = [token.lemma_ for token in doc if not token.is_stop]  # Lemmatize & remove stopwords
    return tokens 

In [42]:
SKILLS_DB = {
    # --- Programming Languages ---
    'python', 'java', 'c', 'c++', 'c#', 'javascript', 'typescript', 'ruby', 'php', 'swift', 'go', 'rust', 
    'kotlin', 'r', 'scala', 'perl', 'dart', 'haskell', 'lua', 'matlab', 'groovy', 'shell', 'bash', 
    'powershell', 'objective-c', 'f#', 'elixir', 'julia', 'vb.net', 'fortran', 'cobol',

    # --- Web Development ---
    'html', 'css', 'sass', 'less', 'javascript', 'typescript', 'jquery', 'bootstrap', 'tailwind', 'foundation', 'js',

    # --- Frontend Frameworks ---
    'react', 'angular', 'vue', 'svelte', 'next.js', 'nuxt.js', 'lit', 'solidjs', 'alpine.js', 'ember.js',

    # --- Backend Frameworks ---
    'node.js', 'express.js', 'django', 'flask', 'spring boot', 'asp.net', 'laravel', 'ruby on rails', 'fastapi',
    'phoenix', 'gin', 'fiber', 'nest.js', 'adonis.js', 'ktor',

    # --- Databases ---
    'mysql', 'postgresql', 'sqlite', 'mongodb', 'cassandra', 'couchdb', 'redis', 'firebase', 'oracle', 
    'sql server', 'db2', 'mariadb', 'cockroachdb', 'tidb', 'neo4j', 'arangodb', 'elasticsearch', 'sql',

    # --- Cloud & DevOps ---
    'aws', 'azure', 'gcp', 'heroku', 'digitalocean', 'ibm cloud', 'oracle cloud', 'terraform', 'ansible',
    'kubernetes', 'docker', 'jenkins', 'gitlab ci/cd', 'github actions', 'circleci', 'travis ci', 'argo cd',
    'helm', 'istio', 'nomad', 'vault', 'prometheus', 'grafana', 'splunk', 'datadog',

    # --- Version Control & CI/CD ---
    'git', 'github', 'gitlab', 'bitbucket', 'svn', 'mercurial', 'perforce',

    # --- Artificial Intelligence & Machine Learning ---
    'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'xgboost', 'lightgbm', 'catboost', 'huggingface', 'openai',
    'nltk', 'spacy', 'gensim', 'fastai', 'onnx', 'opencv', 'detectron2', 'mxnet', 'chainer', 'caffe', 'ai', 'ml', 'r',

    # --- Big Data & Analytics ---
    'hadoop', 'spark', 'hive', 'pig', 'presto', 'trino', 'kafka', 'flink', 'storm', 'hbase', 'cassandra', 
    'redshift', 'bigquery', 'snowflake', 'druid', 'clickhouse',

    # --- Business Intelligence & Visualization ---
    'tableau', 'power bi', 'looker', 'metabase', 'superset', 'google data studio', 'd3.js', 'plotly', 'ggplot2',
    'seaborn', 'matplotlib',

    # --- Cybersecurity & Networking ---
    'nmap', 'metasploit', 'burp suite', 'wireshark', 'snort', 'zeek', 'ossec', 'nessus', 'suricata', 'chkrootkit',
    'openvas', 'aircrack-ng', 'hashcat', 'john the ripper', 'sqlmap', 'autopsy', 'volatility', 'mimikatz',
    'splunk', 'elk stack', 'zeek', 'cyberark', 'crowdstrike', 'tenable', 'okta',

    # --- Blockchain & Web3 ---
    'ethereum', 'solidity', 'rust', 'bitcoin', 'polygon', 'hyperledger', 'ipfs', 'cosmos', 'substrate', 
    'binance smart chain', 'smart contracts', 'web3.js', 'ethers.js',

    # --- IoT & Embedded Systems ---
    'arduino', 'raspberry pi', 'esp8266', 'esp32', 'micropython', 'zephyr', 'mbed os', 'free rtos', 'tinyml',

    # --- Mobile Development ---
    'flutter', 'react native', 'swift', 'kotlin', 'xamarin', 'jetpack compose', 'ionic', 'cordova',

    # --- Game Development ---
    'unity', 'unreal engine', 'godot', 'cryengine', 'game maker studio', 'cocos2d', 'phaser.js',

    # --- Robotics & Automation ---
    'ros', 'gazebo', 'openai gym', 'pybullet', 'webots', 'autoware', 'robosuite',

    # --- Enterprise Tools & ERP ---
    'sap', 'salesforce', 'oracle ebs', 'workday', 'netsuite', 'zoho crm', 'microsoft dynamics',

    # --- APIs & Protocols ---
    'graphql', 'rest api', 'soap', 'grpc', 'json', 'xml', 'rpc', 'websockets',

    # --- Scripting & Automation ---
    'bash', 'powershell', 'ansible', 'puppet', 'chef', 'saltstack',

    # --- Miscellaneous & Emerging Technologies ---
    'quantum computing', 'qiskit', 'ibm q', 'google cirq', 'dna computing', 'edge ai', 'tinyml',

    # --- Office & Productivity Tools ---
    'excel', 'vba', 'google sheets', 'notion', 'trello', 'jira', 'confluence'
}


print("Number of Unique Titles in SKILLS_DB:", len(SKILLS_DB))

Number of Unique Titles in SKILLS_DB: 257


In [13]:
def extract_skills(text):
    tokens = clean_and_tokenize(text)  # Ensure resume text is tokenized
    extracted_skills = [word for word in tokens if word in SKILLS_DB]  # Match with SKILLS_DB
    return list(set(extracted_skills))  # Remove duplicates

df_resumes['extracted_skills'] = df_resumes['Resume'].apply(extract_skills)

df_resumes.head()

Unnamed: 0,Category,Resume,cleaned_resume,extracted_skills
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...,"[git, mysql, matplotlib, jquery, docker, table..."
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may to may be uitrgpv data s...,"[keras, ml, python]"
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control system...,"[flask, matlab, python, excel, github, java]"
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...,"[python, swift, tableau, c, r, sap]"
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...,"[java, python, c]"


In [14]:
print(df_resumes['extracted_skills'][0])

['git', 'mysql', 'matplotlib', 'jquery', 'docker', 'tableau', 'hbase', 'java', 'spacy', 'flask', 'html', 'javascript', 'python', 'angular', 'css', 'cassandra', 'bootstrap', 'elasticsearch', 'plotly', 'nltk', 'kafka']


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
def compute_resume_score(resume_text, job_desc_text):
    
    cleaned_resume = preprocess_text(resume_text)
    cleaned_job_desc = preprocess_text(job_desc_text)

    resume_skills = " ".join(extract_skills(cleaned_resume))  # Convert list to string
    job_skills = " ".join(extract_skills(cleaned_job_desc))

    if not resume_skills or not job_skills:
        return 0  # If no skills are extracted, return 0 score

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([resume_skills, job_skills])

    # Compute cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

    # Scale the score to 1-10
    score = round(similarity * 10, 1)  
    return max(1, min(10, score))

In [17]:
user_resume = input("Paste your resume: ")
user_job_desc = input("Paste the job description: ")

resume_score = compute_resume_score(user_resume, user_job_desc)
print(f"\n💡 Resume Score: {resume_score}/10")

Paste your resume:  python opencv pandas 
Paste the job description:  software developer efficient in python



💡 Resume Score: 5.8/10


In [18]:
user_resume = input("Paste your resume: ")
user_job_desc = input("Paste the job description: ")

resume_score = compute_resume_score(user_resume, user_job_desc)
print(f"\n💡 Resume Score: {resume_score}/10")

Paste your resume:  None
Paste the job description:  web developer



💡 Resume Score: 0/10


In [19]:
df_jobs['extracted_skills'] = df_jobs['cleaned_description'].apply(extract_skills)

In [20]:
df_jobs.head()

Unnamed: 0,title,desc,cleaned_description,extracted_skills
0,web developer,a leading ecommerce agency is looking to hire ...,a leading ecommerce agency is looking to hire ...,"[xml, html, javascript]"
1,software engineer,"software engineer c, c++, java, uml, xamp, agi...",software engineer c c java uml xamp agile defe...,"[java, html, php, c]"
2,user experience developer,user experience developer our client requires ...,user experience developer our client requires ...,"[css, html, javascript]"
3,web developer,web developer our client is looking for experi...,web developer our client is looking for experi...,"[css, html]"
4,c software developers,c developer belfast salary up to ****k pa our ...,c developer belfast salary up to k pa our clie...,[c]


In [21]:
print(df_jobs[df_jobs['extracted_skills'].apply(lambda x: len(x) == 0)])

                                                  title  \
8                                net software developer   
14                                    software engineer   
19                            systems analyst developer   
21    software consultant  financial markets softwar...   
25                         application server developer   
...                                                 ...   
5411                       software development manager   
5418         technical it trainer  software development   
5419                  trainee web application developer   
5423                     business application developer   
5426                             new business developer   

                                                   desc  \
8     .net developer in partnership with its client ...   
14    software engineer ktp associate university of ...   
19    i am currently looking for a systems analyst d...   
21    software consultant  financial markets trading...

In [22]:
print(df_resumes[df_resumes['extracted_skills'].apply(lambda x: len(x) == 0)])

    Category                                             Resume  \
40        HR  TECHNICAL SKILLS â¢ Typewriting â¢ TORA â¢ ...   
42        HR  Education Details \r\n BA   mumbai University\...   
43        HR  Education Details \r\nJune 2012 to May 2015 B....   
44        HR  Education Details \r\nJune 2012 to May 2015 B....   
45        HR  Education Details \r\n BBA   lovely profession...   
..       ...                                                ...   
945  Testing  PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...   
948  Testing  â¢ Good logical and analytical skills â¢ Pos...   
952  Testing  PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...   
955  Testing  â¢ Good logical and analytical skills â¢ Pos...   
959  Testing  PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...   

                                        cleaned_resume extracted_skills  
40   technical skills typewriting tora spsseducatio...               []  
42   education details ba mumbai university hr 

In [23]:
df_jobs["extracted_skills"] = df_jobs["extracted_skills"].apply(lambda x: ["None"] if not x else x)
df_resumes["extracted_skills"] = df_resumes["extracted_skills"].apply(lambda x: ["None"] if not x else x)

# PDF processing

In [28]:
import re
import spacy
import docx2txt
import PyPDF2

In [43]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

# Function to extract text from Word document
def extract_text_from_word(doc_path):
    return docx2txt.process(doc_path)

# Preprocessing function
def preprocess_text(text):
    text = text.lower().strip()  # Convert to lowercase and remove extra spaces
    text = re.sub(r'\r\n|\n|\r', ' ', text)  # Remove newlines
    text = re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', '', text)  # Remove dates
    text = re.sub(r'\b\d{4}\b', '', text)  # Remove standalone years
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    return text

# Function to clean and tokenize text
def clean_and_tokenize(text):
    doc = nlp(text)  
    tokens = [token.lemma_ for token in doc if not token.is_stop]  # Lemmatize & remove stopwords
    return tokens 

# Function to extract skills
def extract_skills(text):
    tokens = clean_and_tokenize(text)
    extracted_skills = [word for word in tokens if word in SKILLS_DB]

    # Additional regex-based matching
    for skill in SKILLS_DB:
        if re.search(rf'\b{skill}\b', text):  # Match whole words
            extracted_skills.append(skill)

    return list(set(extracted_skills))

# Function to process a document and extract skills
def process_document(file_path):
    if file_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith(".docx"):
        text = extract_text_from_word(file_path)
    else:
        raise ValueError("Unsupported file format! Please provide a PDF or DOCX file.")
    
    text = preprocess_text(text)
    skills = extract_skills(text)
    return skills

In [30]:
file_path = "D:/Prerana/Professional/progressResume.docx"
extracted_skills = process_document(file_path)
print("Extracted Skills:", extracted_skills)

Extracted Skills: ['mysql', 'flask', 'html', 'python', 'react', 'excel', 'mongodb', 'opencv', 'spark', 'github', 'c', 'r', 'css', 'java']


In [31]:
file_path = "D:/Prerana/Professional/Prerana-Resume-Recent.pdf"
extracted_skills = process_document(file_path)
print("Extracted Skills:", extracted_skills)

Extracted Skills: ['mysql', 'flask', 'html', 'python', 'opencv', 'react', 'mongodb', 'excel', 'github', 'spark', 'c', 'r', 'css', 'java']


In [32]:
file_path = "D:/Prerana/Professional/Prerana_Resume.pdf"
extracted_skills = process_document(file_path)
print("Extracted Skills:", extracted_skills)

Extracted Skills: ['css', 'flask', 'html', 'mysql', 'python', 'react', 'excel', 'mongodb', 'spark', 'foundation', 'c', 'r', 'jira', 'java']


In [44]:
from collections import defaultdict

CATEGORY_MAPPING = {
    "Software Engineering": {"python", "java", "c++", "c#", "go", "swift", "kotlin", "ruby", "scala"},
    "Web Development": {"html", "css", "javascript", "react", "angular", "vuejs", "typescript", "nodejs", "django", "flask"},
    "Data Science": {"pandas", "numpy", "matplotlib", "scikit-learn", "tensorflow", "pytorch", "seaborn", "statsmodels"},
    "DevOps & Cloud": {"docker", "kubernetes", "aws", "azure", "gcp", "terraform", "ansible", "jenkins"},
    "Cybersecurity": {"penetration testing", "ethical hacking", "burp suite", "wireshark", "nmap", "metasploit"},
    "Database Management": {"sql", "mysql", "postgresql", "mongodb", "oracle", "redis", "cassandra"},
    "Mobile Development": {"android", "ios", "swift", "kotlin", "flutter", "react native"},
    "Machine Learning & AI": {"tensorflow", "pytorch", "scikit-learn", "opencv", "huggingface", "bert", "deep learning"},
    "Business Intelligence": {"power bi", "tableau", "excel", "qlikview", "looker"}
}

def categorize_skills(extracted_skills):
    skill_categories = defaultdict(list)
    
    for skill in extracted_skills:
        for category, skills in CATEGORY_MAPPING.items():
            if skill in skills:
                skill_categories[category].append(skill)
    
    return dict(skill_categories)

def process_document(file_path):
    if file_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith(".docx"):
        text = extract_text_from_word(file_path)
    else:
        raise ValueError("Unsupported file format! Please provide a PDF or DOCX file.")
    
    text = preprocess_text(text)
    skills = extract_skills(text)
    categorized_skills = categorize_skills(skills)
    
    return categorized_skills

In [36]:
file_path = "D:/Prerana/Professional/Prerana-Resume-Recent.pdf"
categorized_skills = process_document(file_path)
print("Categorized Skills:", categorized_skills)

Categorized Skills: {'Database Management': ['mysql', 'mongodb'], 'Web Development': ['flask', 'html', 'react', 'css'], 'Software Engineering': ['python', 'java'], 'Machine Learning & AI': ['opencv'], 'Business Intelligence': ['excel']}


In [45]:
def compute_resume_score(resume_text, job_description):
    vectorizer = TfidfVectorizer()
    documents = [resume_text, job_description]
    
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        return 0  # Handle empty vocabulary error if input text has no meaningful words

    similarity_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    
    return round(similarity_score * 10, 2)  # Scale score to 1-10

In [46]:
def process_resume_and_job(resume_file, job_description):
    # Extract text from resume
    if resume_file.endswith(".pdf"):
        resume_text = extract_text_from_pdf(resume_file)
    elif resume_file.endswith(".docx"):
        resume_text = extract_text_from_word(resume_file)
    else:
        raise ValueError("Unsupported file format! Please provide a PDF or DOCX file.")

    # Preprocess text
    resume_text = preprocess_text(resume_text)
    job_description = preprocess_text(job_description)

    # Extract skills after preprocessing
    resume_skills = extract_skills(resume_text)
    job_skills = extract_skills(job_description)

    # Categorize skills
    categorized_resume_skills = categorize_skills(resume_skills)
    categorized_job_skills = categorize_skills(job_skills)

    # Compute similarity score
    resume_score = compute_resume_score(resume_text, job_description)

    # Return results
    return {
        "Resume Skills": resume_skills,
        "Categorized Resume Skills": categorized_resume_skills,
        "Job Skills": job_skills,
        "Categorized Job Skills": categorized_job_skills,
        "Resume Score": resume_score
    }

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

SKILL_EQUIVALENCE = {
    'mysql': 'sql',
    'js': 'javascript',
    'tf': 'tensorflow',
    'torch': 'pytorch',
    'c++': 'cpp',
    'c#': 'csharp'
}

In [51]:
def map_equivalent_skills(skill_list):
    return [SKILL_EQUIVALENCE.get(skill.lower(), skill.lower()) for skill in skill_list]

def calculate_resume_score(resume_skills, job_skills):
    
    resume_skills = list(set(resume_skills))
    job_skills = list(set(job_skills))
    
    # Compute Jaccard Similarity (Overlap Percentage)
    matched_skills = set(resume_skills) & set(job_skills)
    jaccard_similarity = len(matched_skills) / len(set(job_skills)) if job_skills else 0

    # Compute Cosine Similarity (TF-IDF)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([
        " ".join(resume_skills),
        " ".join(job_skills)
    ])
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

    # Final Score: Weighted Combination
    final_score = (5 * jaccard_similarity) + (5 * cosine_sim)

    return round(final_score, 2), matched_skills

In [52]:
resume_file = "D:/Prerana/Professional/Prerana-Resume-Recent.pdf"
job_description = """We are looking for a software engineer with experience in Python, Django, and AWS.
                     Knowledge of SQL and cloud infrastructure is a plus."""

resume_text = extract_text_from_pdf(resume_file)
cleaned_jd = preprocess_text(job_description)

# Extract Skills
resume_skills = extract_skills(resume_text)
job_skills = extract_skills(cleaned_jd)

# Categorize Skills
categorized_resume_skills = categorize_skills(resume_skills)
categorized_job_skills = categorize_skills(job_skills)

# Calculate Score
resume_score, matched_skills = calculate_resume_score(resume_skills, job_skills)


print(f"Extracted Resume Skills: {resume_skills}")
print(f"Categorized Resume Skills: {categorized_resume_skills}")
print(f"Extracted Job Skills: {job_skills}")
print(f"Categorized Job Skills: {categorized_job_skills}")
print(f"Matched Skills: {matched_skills}")
print(f"Final Resume Score: {resume_score} / 10")

Extracted Resume Skills: ['opencv', 'mongodb', 'github']
Categorized Resume Skills: {'Machine Learning & AI': ['opencv'], 'Database Management': ['mongodb']}
Extracted Job Skills: ['sql', 'aws', 'python', 'django']
Categorized Job Skills: {'Database Management': ['sql'], 'DevOps & Cloud': ['aws'], 'Software Engineering': ['python'], 'Web Development': ['django']}
Matched Skills: set()
Final Resume Score: 0.0 / 10


# PDF input and processing

In [60]:
import re
import PyPDF2
import docx2txt
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
nlp = spacy.load("en_core_web_sm")

In [57]:
SKILLS_DB = {
    # --- Programming Languages ---
    'python', 'java', 'c', 'c++', 'c#', 'javascript', 'typescript', 'ruby', 'php', 'swift', 'go', 'rust', 
    'kotlin', 'r', 'scala', 'perl', 'dart', 'haskell', 'lua', 'matlab', 'groovy', 'shell', 'bash', 
    'powershell', 'objective-c', 'f#', 'elixir', 'julia', 'vb.net', 'fortran', 'cobol',

    # --- Web Development ---
    'html', 'css', 'sass', 'less', 'javascript', 'typescript', 'jquery', 'bootstrap', 'tailwind', 'foundation', 'js',

    # --- Frontend Frameworks ---
    'react', 'angular', 'vue', 'svelte', 'next.js', 'nuxt.js', 'lit', 'solidjs', 'alpine.js', 'ember.js',

    # --- Backend Frameworks ---
    'node.js', 'express.js', 'django', 'flask', 'spring boot', 'asp.net', 'laravel', 'ruby on rails', 'fastapi',
    'phoenix', 'gin', 'fiber', 'nest.js', 'adonis.js', 'ktor',

    # --- Databases ---
    'mysql', 'postgresql', 'sqlite', 'mongodb', 'cassandra', 'couchdb', 'redis', 'firebase', 'oracle', 
    'sql server', 'db2', 'mariadb', 'cockroachdb', 'tidb', 'neo4j', 'arangodb', 'elasticsearch', 'sql',

    # --- Cloud & DevOps ---
    'aws', 'azure', 'gcp', 'heroku', 'digitalocean', 'ibm cloud', 'oracle cloud', 'terraform', 'ansible',
    'kubernetes', 'docker', 'jenkins', 'gitlab ci/cd', 'github actions', 'circleci', 'travis ci', 'argo cd',
    'helm', 'istio', 'nomad', 'vault', 'prometheus', 'grafana', 'splunk', 'datadog',

    # --- Version Control & CI/CD ---
    'git', 'github', 'gitlab', 'bitbucket', 'svn', 'mercurial', 'perforce',

    # --- Artificial Intelligence & Machine Learning ---
    'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'xgboost', 'lightgbm', 'catboost', 'huggingface', 'openai',
    'nltk', 'spacy', 'gensim', 'fastai', 'onnx', 'opencv', 'detectron2', 'mxnet', 'chainer', 'caffe', 'ai', 'ml', 'r',

    # --- Big Data & Analytics ---
    'hadoop', 'spark', 'hive', 'pig', 'presto', 'trino', 'kafka', 'flink', 'storm', 'hbase', 'cassandra', 
    'redshift', 'bigquery', 'snowflake', 'druid', 'clickhouse',

    # --- Business Intelligence & Visualization ---
    'tableau', 'power bi', 'looker', 'metabase', 'superset', 'google data studio', 'd3.js', 'plotly', 'ggplot2',
    'seaborn', 'matplotlib',

    # --- Cybersecurity & Networking ---
    'nmap', 'metasploit', 'burp suite', 'wireshark', 'snort', 'zeek', 'ossec', 'nessus', 'suricata', 'chkrootkit',
    'openvas', 'aircrack-ng', 'hashcat', 'john the ripper', 'sqlmap', 'autopsy', 'volatility', 'mimikatz',
    'splunk', 'elk stack', 'zeek', 'cyberark', 'crowdstrike', 'tenable', 'okta',

    # --- Blockchain & Web3 ---
    'ethereum', 'solidity', 'rust', 'bitcoin', 'polygon', 'hyperledger', 'ipfs', 'cosmos', 'substrate', 
    'binance smart chain', 'smart contracts', 'web3.js', 'ethers.js',

    # --- IoT & Embedded Systems ---
    'arduino', 'raspberry pi', 'esp8266', 'esp32', 'micropython', 'zephyr', 'mbed os', 'free rtos', 'tinyml',

    # --- Mobile Development ---
    'flutter', 'react native', 'swift', 'kotlin', 'xamarin', 'jetpack compose', 'ionic', 'cordova',

    # --- Game Development ---
    'unity', 'unreal engine', 'godot', 'cryengine', 'game maker studio', 'cocos2d', 'phaser.js',

    # --- Robotics & Automation ---
    'ros', 'gazebo', 'openai gym', 'pybullet', 'webots', 'autoware', 'robosuite',

    # --- Enterprise Tools & ERP ---
    'sap', 'salesforce', 'oracle ebs', 'workday', 'netsuite', 'zoho crm', 'microsoft dynamics',

    # --- APIs & Protocols ---
    'graphql', 'rest api', 'soap', 'grpc', 'json', 'xml', 'rpc', 'websockets',

    # --- Scripting & Automation ---
    'bash', 'powershell', 'ansible', 'puppet', 'chef', 'saltstack',

    # --- Miscellaneous & Emerging Technologies ---
    'quantum computing', 'qiskit', 'ibm q', 'google cirq', 'dna computing', 'edge ai', 'tinyml',

    # --- Office & Productivity Tools ---
    'excel', 'vba', 'google sheets', 'notion', 'trello', 'jira', 'confluence'
}


print("Number of Unique Titles in SKILLS_DB:", len(SKILLS_DB))

Number of Unique Titles in SKILLS_DB: 257


In [72]:
CATEGORY_MAP = {
    "Software Engineering": {"python", "java", "c++", "c#", "go", "swift", "kotlin", "ruby", "scala"},
    "Web Development": {"html", "css", "javascript", "react", "angular", "vuejs", "typescript", "nodejs", "django", "flask"},
    "Data Science": {"pandas", "numpy", "matplotlib", "scikit-learn", "tensorflow", "pytorch", "seaborn", "statsmodels"},
    "DevOps & Cloud": {"docker", "kubernetes", "aws", "azure", "gcp", "terraform", "ansible", "jenkins"},
    "Cybersecurity": {"penetration testing", "ethical hacking", "burp suite", "wireshark", "nmap", "metasploit", "hping"},
    "Database Management": {"sql", "mysql", "postgresql", "mongodb", "oracle", "redis", "cassandra"},
    "Mobile Development": {"android", "ios", "swift", "kotlin", "flutter", "react native"},
    "Machine Learning & AI": {"tensorflow", "pytorch", "scikit-learn", "opencv", "huggingface", "bert", "deep learning", "python", "r", "ml", "ai"},
    "Business Intelligence": {"power bi", "tableau", "excel", "qlikview", "looker"}
}

In [59]:
SKILL_EQUIVALENCE = {
    'mysql': 'sql',
    'js': 'javascript',
    'tf': 'tensorflow',
    'torch': 'pytorch',
    'c++': 'cpp',
    'c#': 'csharp'
}

In [61]:
def extract_text_from_pdf(pdf_path):
    """Extract text from PDF using PyPDF2."""
    text = ""
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = " ".join([page.extract_text() or "" for page in reader.pages])
    except Exception as e:
        print(f"Error reading PDF: {e}")
    
    return text if text.strip() else "None"

def extract_text_from_word(doc_path):
    return docx2txt.process(doc_path)

In [73]:
import os 

def extract_text_from_file(file_path):
    """Extract text from PDF or Word file."""
    file_extension = os.path.splitext(file_path)[1].lower()  # Get file extension

    try:
        if file_extension == ".pdf":
            with open(file_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                text = " ".join([page.extract_text() or "" for page in reader.pages])
        elif file_extension in [".docx", ".doc"]:  # Handle both DOCX and older DOC formats
            text = docx2txt.process(file_path)
        else:
            raise ValueError("Unsupported file format. Please provide a PDF or Word document.")
        
        return text.strip() if text.strip() else "None"  # Handle empty extraction
    
    except Exception as e:
        print(f"Error reading file: {e}")
        return "None"

In [62]:
def preprocess_text(text):
    
    text = text.lower().strip()
    text = re.sub(r'\r\n|\n|\r', ' ', text)  # Remove newlines
    text = re.sub(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', '', text)  # Remove dates
    text = re.sub(r'\b\d{4}\b', '', text)  # Remove standalone years
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text if text.strip() else "None"

In [69]:
def extract_skills(text):
    """Extract technical skills from text using NLP and a predefined skills list."""
    text = preprocess_text(text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    # Convert CATEGORY_MAP.values() (list of lists) into a single flat list
    all_skills = [skill for skills in CATEGORY_MAP.values() for skill in skills]

    # Extract skills present in the database or mapped equivalences
    extracted_skills = {word for word in tokens if word in SKILL_EQUIVALENCE or word in all_skills}

    # Normalize synonyms
    extracted_skills = list(set(SKILL_EQUIVALENCE.get(skill, skill) for skill in extracted_skills))

    return extracted_skills if extracted_skills else ["None"]  # Prevent empty skills

In [64]:
def categorize_skills(skill_list):
    """Map extracted skills to broader categories."""
    categorized_skills = {}
    for category, skills in CATEGORY_MAP.items():
        matched = [skill for skill in skill_list if skill in skills]
        if matched:
            categorized_skills[category] = matched
    return categorized_skills if categorized_skills else {"Uncategorized": ["None"]}


In [65]:
def calculate_resume_score(resume_skills, job_skills):
    """Compute similarity score using Jaccard Similarity and TF-IDF Cosine Similarity."""
    resume_skills = list(set(resume_skills))
    job_skills = list(set(job_skills))

    # **Edge Case**: If no job skills are extracted, return score 0
    if job_skills == ["None"]:
        return 0, set()

    # Compute Jaccard Similarity (Overlap Percentage)
    matched_skills = set(resume_skills) & set(job_skills)
    jaccard_similarity = len(matched_skills) / len(set(job_skills)) if job_skills else 0

    # Compute Cosine Similarity (TF-IDF)
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([
            " ".join(resume_skills),
            " ".join(job_skills)
        ])
        cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    except ValueError:
        cosine_sim = 0  # Handle empty vocabulary issue

    # Final Score: Weighted Combination
    final_score = (5 * jaccard_similarity) + (5 * cosine_sim)

    return round(final_score, 2), matched_skills


In [74]:
resume_file = "D:/Prerana/Professional/Prerana-Resume-Recent.pdf"
job_description = """We are looking for a software engineer with experience in Python, Django, and AWS.
                     Knowledge of SQL and cloud infrastructure is a plus."""

# Extract & Preprocess
resume_text = extract_text_from_file(resume_file)
cleaned_jd = preprocess_text(job_description)

# Extract Skills
resume_skills = extract_skills(resume_text)
job_skills = extract_skills(cleaned_jd)

# Categorize Skills
categorized_resume_skills = categorize_skills(resume_skills)
categorized_job_skills = categorize_skills(job_skills)

# Calculate Score
resume_score, matched_skills = calculate_resume_score(resume_skills, job_skills)

# Print Output
print(f"Extracted Resume Skills: {resume_skills}")
print(f"Categorized Resume Skills: {categorized_resume_skills}")
print(f"Extracted Job Skills: {job_skills}")
print(f"Categorized Job Skills: {categorized_job_skills}")
print(f"Matched Skills: {matched_skills}")
print(f"Final Resume Score: {resume_score} / 10")

Extracted Resume Skills: ['flask', 'html', 'javascript', 'python', 'opencv', 'react', 'mongodb', 'excel', 'sql', 'r', 'css', 'java']
Categorized Resume Skills: {'Software Engineering': ['python', 'java'], 'Web Development': ['flask', 'html', 'javascript', 'react', 'css'], 'Database Management': ['mongodb', 'sql'], 'Machine Learning & AI': ['python', 'opencv', 'r'], 'Business Intelligence': ['excel']}
Extracted Job Skills: ['sql', 'python', 'django']
Categorized Job Skills: {'Software Engineering': ['python'], 'Web Development': ['django'], 'Database Management': ['sql'], 'Machine Learning & AI': ['python']}
Matched Skills: {'sql', 'python'}
Final Resume Score: 4.46 / 10


In [75]:
resume_file = "D:/Prerana/Professional/Prerana-Resume-Recent.pdf"
job_description = """web developer with good knowledge in html css javascript flask python or even MERN stack inclusing react node express and mongodb"""

# Extract & Preprocess
resume_text = extract_text_from_file(resume_file)
cleaned_jd = preprocess_text(job_description)

# Extract Skills
resume_skills = extract_skills(resume_text)
job_skills = extract_skills(cleaned_jd)

# Categorize Skills
categorized_resume_skills = categorize_skills(resume_skills)
categorized_job_skills = categorize_skills(job_skills)

# Calculate Score
resume_score, matched_skills = calculate_resume_score(resume_skills, job_skills)

# Print Output
print(f"Extracted Resume Skills: {resume_skills}")
print(f"Categorized Resume Skills: {categorized_resume_skills}")
print(f"Extracted Job Skills: {job_skills}")
print(f"Categorized Job Skills: {categorized_job_skills}")
print(f"Matched Skills: {matched_skills}")
print(f"Final Resume Score: {resume_score} / 10")

Extracted Resume Skills: ['flask', 'html', 'javascript', 'python', 'opencv', 'react', 'mongodb', 'excel', 'sql', 'r', 'css', 'java']
Categorized Resume Skills: {'Software Engineering': ['python', 'java'], 'Web Development': ['flask', 'html', 'javascript', 'react', 'css'], 'Database Management': ['mongodb', 'sql'], 'Machine Learning & AI': ['python', 'opencv', 'r'], 'Business Intelligence': ['excel']}
Extracted Job Skills: ['flask', 'html', 'javascript', 'python', 'react', 'mongodb', 'css']
Categorized Job Skills: {'Software Engineering': ['python'], 'Web Development': ['flask', 'html', 'javascript', 'react', 'css'], 'Database Management': ['mongodb'], 'Machine Learning & AI': ['python']}
Matched Skills: {'flask', 'html', 'javascript', 'python', 'react', 'mongodb', 'css'}
Final Resume Score: 8.43 / 10
