In [None]:
!pip install nltk spacy scikit-learn sentence-transformers python-docx PyMuPDF --quiet

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

import spacy
spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<spacy.lang.en.English at 0x7e44662bb1d0>

In [None]:
# NLP & ML
import nltk
import spacy
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

# PDF & DOCX
import fitz  # PyMuPDF
from docx import Document

# Utility
import numpy as np
import pandas as pd
import re
import os

In [None]:
def extract_text_from_pdf(file_path):
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

In [None]:
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

In [None]:
def extract_resume_text(file_path):
    if file_path.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith('.docx'):
        return extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file type. Please use .pdf or .docx")

In [None]:
from google.colab import files
uploaded = files.upload()

Saving CV update_Test.pdf to CV update_Test.pdf


In [None]:
resume_text = extract_resume_text("CV update_Test.pdf")
print(resume_text)

Lydia Sharon James 
E-Mail : lydiasharon2907@gmail.com 
Contact : 8297995761 
AI and ML specialist with a Master's degree from IIIT Bangalore, proficient in developing innovative 
solutions and deriving actionable insights. Passionate about optimizing communication and solving challenges 
using AI and ML techniques. Skilled in data analysis, predictive modeling, and statistical methodologies, 
committed to continual advancement and exceeding performance benchmarks in the dynamic field of data 
science.
Experience 
Infosys Private Limited, Hyderabad 
Systems Engineer 
Dec 2021 – Present
●
Proficient in Dataiku and MicroStrategy, optimizing data workflows and enhancing data quality 
through advanced data modeling, warehousing, and building robust ETL pipelines.
●
Spearheaded data engineering and prompt engineering solutions as part of the "Ask Ralph" initiative 
for Ralph Lauren, streamlining reporting and providing actionable data insights.
●
Developed and managed scalable data pipeline

# **Data Cleaning**

In [None]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces, tabs, newlines
    text = text.strip()
    return text.lower()

In [None]:
cleaned_resume_text = clean_text(resume_text)
print(cleaned_resume_text[:500])



# **Extract Skills**

In [None]:
skill_keywords = [
    'python', 'java', 'javascript', 'typescript', 'kotlin', 'c++', 'c#', 'go', 'ruby', 'php',
    'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django', 'flask', 'spring boot',
    'git', 'github', 'bitbucket', 'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'ec2', 's3',
    'mysql', 'postgresql', 'sqlite', 'mongodb', 'redis', 'elasticsearch',
    'tensorflow', 'keras', 'pytorch', 'numpy', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn',
    'jira', 'confluence', 'agile', 'scrum', 'rest api', 'graphql', 'ci/cd', 'linux', 'bash', 'mlops'
]

In [None]:
def extract_skills(text):
    found_skills = []
    for skill in skill_keywords:
        if skill in text:
            found_skills.append(skill)
    return list(set(found_skills))

In [None]:
skills = extract_skills(cleaned_resume_text)
print("Skills:", skills)

Skills: ['python', 'go', 'scrum', 'agile', 'javascript', 'html', 'git', 'java', 'c++']


# **Extract Education Details**

In [None]:
import re

# Define a list of common education-related keywords
education_keywords = ['b.tech', 'bsc', 'msc', 'mca', 'mba', 'bachelor', 'degree', 'master', 'university', 'institute', 'college']

def extract_education(text):
    # Find all occurrences of education-related keywords and extract surrounding details
    education_info = []
    for keyword in education_keywords:
        if keyword in text:
            # Find text related to the keyword, and assume it's the education information
            matches = re.findall(r'([A-Za-z0-9\s,\.]+(?:' + re.escape(keyword) + r'[A-Za-z0-9\s,\.]*)+)', text)
            education_info.extend(matches)
    return education_info

In [None]:
education_details = extract_education(cleaned_resume_text)
print("Education:", education_details)

Education: [' worked as an outreach intern for the mission swavalamban project, focusing on market research, communications, and outreach. ', ' pg college, hyderabad bachelors of computer science 2021 90', 's degree from iiit bangalore, proficient in developing innovative solutions and deriving actionable insights. passionate about optimizing communication and solving challenges using ai and ml techniques. skilled in data analysis, predictive modeling, and statistical methodologies, committed to continual advancement and exceeding performance benchmarks in the dynamic field of data science. experience infosys private limited, hyderabad systems engineer dec 2021 ', ' types and history st. pious x degree ', ' st. pious x degree ', ' ai and ml specialist with a master', ' masters in artificial intelligence and machine learning 2024 90', ' automatic ticket classification indian institute of information technology, bangalore sept 2023 ', ' grouped complaints into categories for efficient ha

# **Extract Work Experience**

In [None]:
def extract_experience(text):
    # Define common work-related keywords
    experience_keywords = ['experience', 'worked as', 'role', 'intern', 'company', 'position', 'responsibilities']

    experience_info = []
    for keyword in experience_keywords:
        if keyword in text:
            # Find the surrounding text for job-related keywords
            matches = re.findall(r'([A-Za-z0-9\s,\.]+(?:' + re.escape(keyword) + r'[A-Za-z0-9\s,\.]*)+)', text)
            experience_info.extend(matches)
    return experience_info


In [None]:
experience_details = extract_experience(cleaned_resume_text)
print("Work Experience:", experience_details)

Work Experience: ['s degree from iiit bangalore, proficient in developing innovative solutions and deriving actionable insights. passionate about optimizing communication and solving challenges using ai and ml techniques. skilled in data analysis, predictive modeling, and statistical methodologies, committed to continual advancement and exceeding performance benchmarks in the dynamic field of data science. experience infosys private limited, hyderabad systems engineer dec 2021 ', ' worked as an outreach intern for the mission swavalamban project, focusing on market research, communications, and outreach. ', 'quality data for business analysts and data scientists. letsendorse, bengaluru intern oct 2020 ', ' worked as an outreach intern for the mission swavalamban project, focusing on market research, communications, and outreach. ', ' collaborated to assess feasibility and alignment with company goals. cryptography']


# **Extract Projects**

In [None]:
def extract_projects(text):
    # Define common project-related keywords
    project_keywords = ['project', 'academic project', 'personal project', 'worked on', 'built', 'developed']

    project_info = []
    for keyword in project_keywords:
        if keyword in text:
            # Find the surrounding text for project-related keywords
            matches = re.findall(r'([A-Za-z0-9\s,\.]+(?:' + re.escape(keyword) + r'[A-Za-z0-9\s,\.]*)+)', text)
            project_info.extend(matches)
    return project_info

In [None]:
project_details = extract_projects(cleaned_resume_text)
print("Projects:", project_details)

Projects: [' worked as an outreach intern for the mission swavalamban project, focusing on market research, communications, and outreach. ', ' exhibited independence and a willingness to learn while delivering on assigned tasks. 2 projects', ' developed a project to organize customer complaints based on products', ' developed and managed scalable data pipelines, utilizing sql, python, and spark to support automated reporting and ensure efficient data integration across large', ' developed a project to organize customer complaints based on products']


# **Match Resume with Job Description**

In [None]:
!pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained model from Sentence-Transformers
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get embeddings for text
def get_embeddings(text):
    return model.encode([text])[0]  # Get the embedding for the text

# Example resume and job description (replace these with your actual data)
resume_text = cleaned_resume_text
job_description_text = """
We are looking for a Backend Developer with strong experience in Python, Django, AWS, and MySQL.
You should have a background in building scalable applications, working with APIs, and deploying services on the cloud.
"""

# Get embeddings for both the resume and the job description
resume_embedding = get_embeddings(resume_text)
job_description_embedding = get_embeddings(job_description_text)

# Calculate cosine similarity between the resume and job description
similarity_score = cosine_similarity([resume_embedding], [job_description_embedding])[0][0]

# Print the similarity score
print(f"Cosine Similarity between resume and job description: {similarity_score:.2f}")


Cosine Similarity between resume and job description: 0.33


# **Model Evaluation**

***Create the Dataset***

In [None]:
import pandas as pd

# Sample dataset of resumes and job titles
data = {
    'Resume': [
        "Experienced Python developer with strong skills in web development using Django, Flask, and FastAPI. Hands-on experience with RESTful API design, Docker, PostgreSQL, and AWS services. Worked on several projects in machine learning using TensorFlow and Scikit-learn.",
        "Skilled Backend Developer specializing in Java, Spring Boot, and Hibernate. Experience with building scalable applications, using Kafka, RabbitMQ for messaging, and deploying applications on AWS. Familiar with SQL and NoSQL databases.",
        "Highly motivated front-end developer with expertise in JavaScript, React, HTML, CSS, and Vue.js. Experience in building responsive UIs and integrating RESTful APIs. Knowledge of state management using Redux and MobX.",
        "Data Scientist with a passion for machine learning and artificial intelligence. Experienced with Python libraries such as Pandas, NumPy, Scikit-learn, TensorFlow, and PyTorch. Worked on projects involving predictive modeling, data analysis, and recommendation systems.",
        "Full Stack Developer with experience in both front-end and back-end technologies. Proficient in JavaScript (React, Node.js), SQL databases, and RESTful APIs. Worked on large-scale web applications and microservices architecture.",
        "A skilled DevOps Engineer with hands-on experience in setting up CI/CD pipelines, Docker, Kubernetes, AWS, and Jenkins. Automated infrastructure and deployment processes, improving overall efficiency.",
        "Experienced Project Manager with knowledge in Agile and Scrum methodologies. Worked with cross-functional teams to deliver projects on time, managing the project lifecycle, including requirements gathering, planning, execution, and monitoring.",
        "Quality Assurance Engineer with expertise in manual and automated testing. Proficient in using Selenium, JUnit, and TestNG for test automation. Experience in working in Agile teams and ensuring quality standards.",
        "Cybersecurity Specialist with expertise in threat analysis, network security, and vulnerability management. Skilled in using tools like Wireshark, Nessus, and Metasploit to secure systems and perform penetration testing.",
        "Business Analyst with experience in requirement gathering, process modeling, and data analysis. Skilled in using tools like Microsoft Excel, Tableau, SQL, and Power BI to create insightful reports.",
        "Cloud Engineer with expertise in cloud platforms like AWS, Azure, and Google Cloud. Experience in building and managing cloud infrastructure, automation of cloud resources, and optimizing performance and cost.",
        "Mobile App Developer with proficiency in building Android and iOS applications. Strong in Java, Kotlin, Swift, and React Native. Experience in working with Firebase, SQLite, and implementing app functionality.",
        "Network Engineer with experience in designing, implementing, and managing networks. Expertise in routing protocols, firewall configurations, VPNs, and network monitoring tools like Wireshark and SolarWinds.",
        "Machine Learning Engineer with experience in developing, deploying, and maintaining machine learning models. Skilled in Python, TensorFlow, Keras, PyTorch, and NLP techniques for text analysis.",
        "UI/UX Designer with expertise in creating user-centered designs for mobile and web apps. Proficient in using design tools such as Figma, Sketch, Adobe XD, and conducting user research and usability testing.",
        "Salesforce Developer with strong knowledge of Apex, Visualforce, Lightning components, and Salesforce APIs. Experience in building and deploying custom Salesforce applications and integrations.",
        "Systems Analyst with experience in analyzing and designing software solutions. Proficient in working with business stakeholders, creating requirements documents, and translating them into technical specifications.",
        "Embedded Systems Engineer with experience in designing and developing embedded software for microcontrollers and real-time systems. Skilled in C, C++, and Python programming languages, and familiar with IoT development.",
        "Operations Manager with experience in overseeing business operations and ensuring efficient workflow. Expertise in process optimization, logistics, and managing cross-functional teams.",
        "Digital Marketing Specialist with expertise in SEO, SEM, content marketing, and social media strategies. Proficient in using tools like Google Analytics, SEMrush, and Hootsuite to enhance brand visibility.",
        "HR Manager with experience in recruitment, performance management, and employee relations. Skilled in using HR software like BambooHR, Workday, and in managing the full employee lifecycle."
    ],
    'Job_Role': [
        'Software Engineer',
        'Backend Developer',
        'Frontend Developer',
        'Data Scientist',
        'Full Stack Developer',
        'DevOps Engineer',
        'Project Manager',
        'QA Engineer',
        'Cybersecurity Specialist',
        'Business Analyst',
        'Cloud Engineer',
        'Mobile App Developer',
        'Network Engineer',
        'Machine Learning Engineer',
        'UI/UX Designer',
        'Salesforce Developer',
        'Systems Analyst',
        'Embedded Systems Engineer',
        'Operations Manager',
        'Digital Marketing Specialist',
        'HR Manager'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Show the dataset
print(df)


                                               Resume  \
0   Experienced Python developer with strong skill...   
1   Skilled Backend Developer specializing in Java...   
2   Highly motivated front-end developer with expe...   
3   Data Scientist with a passion for machine lear...   
4   Full Stack Developer with experience in both f...   
5   A skilled DevOps Engineer with hands-on experi...   
6   Experienced Project Manager with knowledge in ...   
7   Quality Assurance Engineer with expertise in m...   
8   Cybersecurity Specialist with expertise in thr...   
9   Business Analyst with experience in requiremen...   
10  Cloud Engineer with expertise in cloud platfor...   
11  Mobile App Developer with proficiency in build...   
12  Network Engineer with experience in designing,...   
13  Machine Learning Engineer with experience in d...   
14  UI/UX Designer with expertise in creating user...   
15  Salesforce Developer with strong knowledge of ...   
16  Systems Analyst with experi

***Get the Cleaned Resume***

In [None]:
import pandas as pd

# Load the CSV data into a DataFrame
df = pd.read_csv('UpdatedResumeDataSet.csv')

# Show the DataFrame
print(df)

         Category                                             Resume
0    Data Science  Skills * Programming Languages: Python (pandas...
1    Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2    Data Science  Areas of Interest Deep Learning, Control Syste...
3    Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4    Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...
..            ...                                                ...
957       Testing  Computer Skills: â¢ Proficient in MS office (...
958       Testing  â Willingness to accept the challenges. â ...
959       Testing  PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...
960       Testing  COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...
961       Testing  Skill Set OS Windows XP/7/8/8.1/10 Database MY...

[962 rows x 2 columns]


***Train a Classification Model***

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming you've already loaded the CSV data into the DataFrame 'df'

# Drop rows where 'Category' or 'Resume' is missing
df = df.dropna(subset=['Category', 'Resume'])

# Clean the Resume text (for simplicity, convert to lowercase)
df['Cleaned_Resume'] = df['Resume'].apply(lambda x: x.lower())

# Map job titles to numeric labels for classification (update as per your job roles)
job_title_map = {
    'Software Engineer': 0,
    'Backend Developer': 1,
    'Frontend Developer': 2,
    'Data Scientist': 3,
    'Full Stack Developer': 4,
    'DevOps Engineer': 5,
    'Project Manager': 6,
    'QA Engineer': 7,
    'Cybersecurity Specialist': 8,
    'Business Analyst': 9,
    'Cloud Engineer': 10,
    'Mobile App Developer': 11,
    'Network Engineer': 12,
    'Machine Learning Engineer': 13,
    'UI/UX Designer': 14,
    'Salesforce Developer': 15,
    'Systems Analyst': 16,
    'Embedded Systems Engineer': 17,
    'Operations Manager': 18,
    'Digital Marketing Specialist': 19,
    'HR Manager': 20
}

# Map job titles to numeric labels
df['Job_Label'] = df['Category'].map(job_title_map)

# Drop any rows where the mapping failed (i.e., job title not found in the map)
df = df.dropna(subset=['Job_Label'])

# Split the data into training and testing sets
X = df['Cleaned_Resume']
y = df['Job_Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         5.0       1.00      1.00      1.00        15
         9.0       1.00      1.00      1.00        10
        18.0       1.00      1.00      1.00        12

    accuracy                           1.00        37
   macro avg       1.00      1.00      1.00        37
weighted avg       1.00      1.00      1.00        37



In [None]:
print(df.columns)

Index(['Category', 'Resume', 'Cleaned_Resume', 'Job_Label'], dtype='object')


In [None]:
print(df.head())

             Category                                             Resume  \
403  Business Analyst  Education Details \r\n BE Computer Science Mum...   
404  Business Analyst  Technical Skills Application Servers: IIS 6.0,...   
405  Business Analyst  Key Skills - Requirement Gathering - Requireme...   
406  Business Analyst  IT Skills: Area Exposure Modeling Tool: Bizagi...   
407  Business Analyst  TECHNOLOGICAL SKILLS â¦ Knowledge of Computer...   

                                        Cleaned_Resume  Job_Label  
403  education details \r\n be computer science mum...        9.0  
404  technical skills application servers: iis 6.0,...        9.0  
405  key skills - requirement gathering - requireme...        9.0  
406  it skills: area exposure modeling tool: bizagi...        9.0  
407  technological skills â¦ knowledge of computer...        9.0  


In [None]:
# Check the shape and sparsity of the TF-IDF matrix
print("TF-IDF Train Matrix Shape:", X_train_tfidf.shape)
print("Non-zero elements in TF-IDF Train Matrix:", X_train_tfidf.nnz)

TF-IDF Train Matrix Shape: (86, 2693)
Non-zero elements in TF-IDF Train Matrix: 28986


In [None]:
# Train a Logistic Regression model
model = LogisticRegression(max_iter=500)
model.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         5.0       1.00      1.00      1.00        15
         9.0       1.00      1.00      1.00        10
        18.0       1.00      1.00      1.00        12

    accuracy                           1.00        37
   macro avg       1.00      1.00      1.00        37
weighted avg       1.00      1.00      1.00        37



# Extracting Skills, Education, Experience (NER & Keywords)

In [None]:
import spacy
import re

# Load the spaCy model once at the top (you only need to do this once)
nlp = spacy.load("en_core_web_sm")

# Let's assume you have a variable called 'resume_text' (your resume content)
doc = nlp(resume_text)

# Named Entities (Optional: see all entities)
for ent in doc.ents:
    print(ent.text, ent.label_)

# Extract skills (custom list)
skills_list = ['Python', 'Machine Learning', 'Data Analysis', 'Java', 'SQL', 'Deep Learning']
extracted_skills = [token.text for token in doc if token.text in skills_list]
print("Skills:", extracted_skills)

# Extract education (look for degrees)
education_keywords = ['B.Tech', 'Bachelor', 'Master', 'M.Tech', 'PhD', 'Degree']
extracted_education = [sent.text.strip() for sent in doc.sents if any(edu in sent.text for edu in education_keywords)]
print("Education:", extracted_education)

# Extract experience (simple pattern)
experience_pattern = re.compile(r'(\d+)\s+years? of experience')
experience = experience_pattern.findall(resume_text)
print("Experience (years):", experience)

lydia sharon james e-mail PERSON
8297995761 CARDINAL
2021 DATE
ralph PERSON
ralph lauren PERSON
bengaluru ORG
oct 2020 - nov 2020 DATE
2 CARDINAL
classification indian institute of information technology ORG
sept 2023 DATE
recognition indian institute of information technology ORG
june 2022 DATE
degree & pg college for ORG
june 2020 DATE
java PERSON
c++, html PERSON
english LANGUAGE
3 CARDINAL
sharon james PERSON
nigel casper james PERSON
birth - 29/07/2000 NORP
lydia sharon james PERSON
year DATE
indian institute of information technology ORG
2024 90% PERCENT
degree & pg college ORG
2021 DATE
90% PERCENT
iiyr NORP
2018 DATE
70% PERCENT
10th ORDINAL
2016 85% PERCENT
Skills: []
Education: []
Experience (years): []


# Ranking Multiple Job Matches

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Assume you already have resume_text and a list of job_descriptions
job_descriptions = [
    "We need a Python developer with experience in Machine Learning and Data Analysis.",
    "Looking for a Java backend engineer with SQL skills.",
    "Data Scientist with Python, Deep Learning, and NLP experience.",
    "Need a content writer",
    "Looking for an expeirenced grapihc designer",
    "Looking for a social media manager",
    "looking for a junior data analyst",
]

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the resume
resume_embedding = model.encode(resume_text)

# Encode all jobs
job_embeddings = model.encode(job_descriptions)

# Compute similarity
similarity_scores = cosine_similarity([resume_embedding], job_embeddings)[0]

# Rank them
ranked_jobs = sorted(zip(job_descriptions, similarity_scores), key=lambda x: x[1], reverse=True)

print("Top Job Matches:")
for idx, (job, score) in enumerate(ranked_jobs):
    print(f"{idx+1}. Score: {score:.4f} | Job: {job}")


Top Job Matches:
1. Score: 0.5172 | Job: looking for a junior data analyst
2. Score: 0.4794 | Job: We need a Python developer with experience in Machine Learning and Data Analysis.
3. Score: 0.4350 | Job: Data Scientist with Python, Deep Learning, and NLP experience.
4. Score: 0.4272 | Job: Looking for a Java backend engineer with SQL skills.
5. Score: 0.2937 | Job: Looking for an expeirenced grapihc designer
6. Score: 0.2246 | Job: Looking for a social media manager
7. Score: 0.2186 | Job: Need a content writer
