In [2]:
import numpy as np
import pandas as pd
import spacy
import pdfplumber 
import nltk
import sklearn
import llama_cpp

print("All libraries are working fine! ✅")
df = pd.read_csv("resume_dataset.csv")
df.head()
df2 = pd.read_csv("Resume.csv")
df2.head()
df2 = df2.drop(columns=["Resume_html"])
df2 = df2.rename(columns={"Resume_str":"Resume"})
df2.head()
final_data = pd.concat([df,df2],ignore_index=True)
final_data = final_data.reset_index(drop=True)
final_data.head()
final_data.info()


All libraries are working fine! ✅
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3703 entries, 0 to 3702
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        3703 non-null   int64 
 1   Category  3703 non-null   object
 2   Resume    3703 non-null   object
dtypes: int64(1), object(2)
memory usage: 86.9+ KB


In [3]:
final_data["Category"].unique()

array(['HR', 'Designing', 'Managment', 'Information Technology',
       'Education', 'Advocate', 'Business Development',
       'Health & Fitness', 'Agricultural', 'BPO', 'Sales', 'Consultant',
       'Digital Media', 'Automobile', 'Food & Beverages', 'Finance',
       'Apparel', 'Engineering', 'Accountant', 'Building & Construction',
       'Architects', 'Public Relations', 'Banking', 'Arts', 'Aviation',
       'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE', 'CHEF',
       'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT', 'CONSTRUCTION',
       'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'], dtype=object)

In [4]:
tech_categories = {"Information Technology", "INFORMATION-TECHNOLOGY", "Engineering", "ENGINEERING"}

# Filter final_data to keep only tech-related resumes
final_data = final_data[final_data["Category"].isin(tech_categories)].reset_index(drop=True)
final_data.head()

Unnamed: 0,ID,Category,Resume
0,167,Information Technology,"b'RESUME\nAJITHA SHENOY .K.B,\nPhD student (Co..."
1,168,Information Technology,b'Mason\t\r \xc2\xa0Silber\t\r \xc2\xa0\n6595\...
2,169,Information Technology,b'Pramod XXXX\nMobile: +91-99********\n\nE-mai...
3,170,Information Technology,"b""Harry M. Rohrer\n3748 Bee Street\nGrand Rapi..."
4,171,Information Technology,"b""Wilson Kunnan Jose\nSr. Consultant, QA\n\nSu..."


In [5]:
#checking missing and duplicated values
print(final_data.isnull().sum())
print(final_data.duplicated().sum())

ID          0
Category    0
Resume      0
dtype: int64
0


In [6]:
import re
def clean_data(data):
    if isinstance(data, bytes):  
        data = data.decode("utf-8", "ignore")  # Convert bytes to string
        
    if not isinstance(data, str):  
        data = str(data)
    
    data = data.replace("\n", " ").replace("\\n", " ")
    data = data.replace("b'", "").replace("b\"", "")
    data = data.lower().strip() #lowercase and stripping whitespaces
    data = re.sub(r'\s+', ' ', data) #extra spaces and newline
    data = re.sub(r'[^a-zA-Z0-9\s]', '', data) #special characters
    return data

final_data["cleaned_resume"] = final_data["Resume"].apply(clean_data)

final_data.head()
    

Unnamed: 0,ID,Category,Resume,cleaned_resume
0,167,Information Technology,"b'RESUME\nAJITHA SHENOY .K.B,\nPhD student (Co...",resume ajitha shenoy kb phd student computer s...
1,168,Information Technology,b'Mason\t\r \xc2\xa0Silber\t\r \xc2\xa0\n6595\...,masontr xc2xa0silbertr xc2xa0 6595tr xc2xa0ler...
2,169,Information Technology,b'Pramod XXXX\nMobile: +91-99********\n\nE-mai...,pramod xxxx mobile 9199 email pramodgmailcom c...
3,170,Information Technology,"b""Harry M. Rohrer\n3748 Bee Street\nGrand Rapi...",harry m rohrer 3748 bee street grand rapids mi...
4,171,Information Technology,"b""Wilson Kunnan Jose\nSr. Consultant, QA\n\nSu...",wilson kunnan jose sr consultant qa summary xe...


In [7]:
import spacy
nlp = spacy.load("en_core_web_sm") #en lang model
#tokenisation
final_data["tokenized_resume"] = final_data["cleaned_resume"].apply(lambda x: [token.text for token in nlp(x)])
print(final_data[["cleaned_resume", "tokenized_resume"]].head())

                                      cleaned_resume  \
0  resume ajitha shenoy kb phd student computer s...   
1  masontr xc2xa0silbertr xc2xa0 6595tr xc2xa0ler...   
2  pramod xxxx mobile 9199 email pramodgmailcom c...   
3  harry m rohrer 3748 bee street grand rapids mi...   
4  wilson kunnan jose sr consultant qa summary xe...   

                                    tokenized_resume  
0  [resume, ajitha, shenoy, kb, phd, student, com...  
1  [masontr, xc2xa0silbertr, xc2xa0, 6595tr, xc2x...  
2  [pramod, xxxx, mobile, 9199, email, pramodgmai...  
3  [harry, m, rohrer, 3748, bee, street, grand, r...  
4  [wilson, kunnan, jose, sr, consultant, qa, sum...  


In [8]:
#stopwords
import nltk 
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
def filter_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

final_data["filtered_tokens"] = final_data["tokenized_resume"].apply(filter_stopwords)
print(final_data[["tokenized_resume", "filtered_tokens"]].head())

                                    tokenized_resume  \
0  [resume, ajitha, shenoy, kb, phd, student, com...   
1  [masontr, xc2xa0silbertr, xc2xa0, 6595tr, xc2x...   
2  [pramod, xxxx, mobile, 9199, email, pramodgmai...   
3  [harry, m, rohrer, 3748, bee, street, grand, r...   
4  [wilson, kunnan, jose, sr, consultant, qa, sum...   

                                     filtered_tokens  
0  [resume, ajitha, shenoy, kb, phd, student, com...  
1  [masontr, xc2xa0silbertr, xc2xa0, 6595tr, xc2x...  
2  [pramod, xxxx, mobile, 9199, email, pramodgmai...  
3  [harry, rohrer, 3748, bee, street, grand, rapi...  
4  [wilson, kunnan, jose, sr, consultant, qa, sum...  


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aarushi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
#defining skill set
skill_set = {
    # Programming Languages
    "python", "java", "javascript", "c", "c++", "c#", "go", "ruby", "typescript", "php", "swift", "kotlin", "r", "perl",
    
    # Web Development
    "html", "css", "sass", "less", "react", "vue", "angular", "next.js", "nuxt.js", "svelte", "jquery",

    # Backend & APIs
    "node.js", "express", "django", "flask", "spring boot", "fastapi", "laravel", "ruby on rails",
    "rest api", "graphql", "grpc", "microservices", "soap",

    # Databases & Storage
    "mysql", "postgresql", "sqlite", "mongodb", "cassandra", "redis", "firebase", "dynamodb", "elasticsearch",
    "neo4j", "oracle", "sql server", "couchdb", "bigquery",

    # DevOps & Cloud
    "docker", "kubernetes", "aws", "azure", "gcp", "terraform", "ansible", "jenkins", "github actions",
    "kafka", "rabbitmq", "nginx", "apache", "cloud functions", "serverless", "ci/cd", "helm", "istio",

    # Data Science & ML
    "machine learning", "deep learning", "pandas", "numpy", "scikit-learn", "tensorflow", "pytorch",
    "opencv", "huggingface", "nltk", "spacy", "transformers", "matplotlib", "seaborn", "xgboost",

    # Mobile Development
    "android", "ios", "flutter", "react native", "swift", "kotlin", "cordova", "xamarin",

    # Cybersecurity
    "penetration testing", "burpsuite", "wireshark", "nmap", "metasploit", "osint", "siem",
    "owasp", "reverse engineering", "malware analysis",

    # Other Technologies & Tools
    "git", "github", "gitlab", "bitbucket", "jira", "confluence", "agile", "scrum", "trello", "kanban",
    "vim", "bash", "powershell", "zsh", "linux", "windows", "macos"
}
degree_set= {"bachelor", "master", "phd", "b.tech", "b.sc", "m.tech", "m.sc", "mba"}

In [10]:
def extract_skills(resume):
    matched_skills = [skill for skill in skill_set if skill in resume] 
    return list(set(matched_skills))  

def extract_experience(resume):
    match = re.search(r'(\d+)\s+years?', resume)  # Look for "X years"
    return int(match.group(1)) if match else 0 

def extract_education(resume):
    words = resume.split()
    found_degrees = [word for word in words if word.lower() in degree_set]
    return found_degrees if found_degrees else ["Not Mentioned"]


final_data["extracted_skills"] = final_data["cleaned_resume"].apply(extract_skills)
final_data["experience_years"] = final_data["cleaned_resume"].apply(extract_experience)
final_data["education"] = final_data["cleaned_resume"].apply(extract_education)


In [11]:
final_data.head()

Unnamed: 0,ID,Category,Resume,cleaned_resume,tokenized_resume,filtered_tokens,extracted_skills,experience_years,education
0,167,Information Technology,"b'RESUME\nAJITHA SHENOY .K.B,\nPhD student (Co...",resume ajitha shenoy kb phd student computer s...,"[resume, ajitha, shenoy, kb, phd, student, com...","[resume, ajitha, shenoy, kb, phd, student, com...","[java, go, linux, r, c, oracle, windows]",4,"[phd, phd]"
1,168,Information Technology,b'Mason\t\r \xc2\xa0Silber\t\r \xc2\xa0\n6595\...,masontr xc2xa0silbertr xc2xa0 6595tr xc2xa0ler...,"[masontr, xc2xa0silbertr, xc2xa0, 6595tr, xc2x...","[masontr, xc2xa0silbertr, xc2xa0, 6595tr, xc2x...","[java, go, python, django, r, ios, c]",0,[Not Mentioned]
2,169,Information Technology,b'Pramod XXXX\nMobile: +91-99********\n\nE-mai...,pramod xxxx mobile 9199 email pramodgmailcom c...,"[pramod, xxxx, mobile, 9199, email, pramodgmai...","[pramod, xxxx, mobile, 9199, email, pramodgmai...","[html, java, go, javascript, css, r, mysql, c,...",5,[Not Mentioned]
3,170,Information Technology,"b""Harry M. Rohrer\n3748 Bee Street\nGrand Rapi...",harry m rohrer 3748 bee street grand rapids mi...,"[harry, m, rohrer, 3748, bee, street, grand, r...","[harry, rohrer, 3748, bee, street, grand, rapi...","[html, php, jquery, css, r, mysql, c, agile]",0,[Not Mentioned]
4,171,Information Technology,"b""Wilson Kunnan Jose\nSr. Consultant, QA\n\nSu...",wilson kunnan jose sr consultant qa summary xe...,"[wilson, kunnan, jose, sr, consultant, qa, sum...","[wilson, kunnan, jose, sr, consultant, qa, sum...","[aws, apache, html, php, java, go, gcp, javasc...",13,[bachelor]


In [12]:
final_data.count()

ID                  463
Category            463
Resume              463
cleaned_resume      463
tokenized_resume    463
filtered_tokens     463
extracted_skills    463
experience_years    463
education           463
dtype: int64

In [14]:
final_data.reset_index(drop=True, inplace=True)
final_data["resume_id"] = final_data.index  

In [16]:
import re

indian_cities = ["Mumbai", "Delhi", "Bangalore", "Hyderabad", "Chennai", "Kolkata", "Pune", 
                 "Ahmedabad", "Jaipur", "Surat", "Lucknow", "Kanpur", "Nagpur", "Indore", 
                 "Bhopal", "Patna", "Ludhiana", "Agra", "Nashik", "Vadodara", "Meerut",
                 "Varanasi", "Raipur", "Ranchi", "Guwahati", "Chandigarh", "Coimbatore"]

countries = ["India", "USA", "United States", "UK", "United Kingdom", "Canada", "Australia",
             "Germany", "France", "Singapore", "UAE", "Dubai", "Netherlands", "Japan"]

def extract_location(text):
    if not isinstance(text, str) or text.lower() in ["not specified", "nan"]:
        return "Not Specified"
    
    for city in indian_cities:
        if re.search(rf'\b{city}\b', text, re.IGNORECASE):
            return city + ", India"

    for country in countries:
        if re.search(rf'\b{country}\b', text, re.IGNORECASE):
            return country

    return "Not Specified" 

final_data["location"] = final_data["cleaned_resume"].apply(extract_location)

In [17]:
final_data["location"].unique()

array(['Kanpur, India', 'Not Specified', 'Coimbatore, India',
       'United States', 'UK', 'Chennai, India', 'India',
       'Hyderabad, India', 'USA', 'Bangalore, India', 'Canada',
       'Singapore', 'France', 'Mumbai, India', 'Australia',
       'Kolkata, India', 'Delhi, India', 'Netherlands', 'Germany',
       'Japan', 'Pune, India'], dtype=object)

In [18]:
final_data.to_csv("resumes.csv", index=False)  