In [1]:
!pip install nltk spacy pdfplumber docx2txt
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 435.7 kB/s eta 0:00:30
     --------------------------------------- 0.1/12.8 MB 550.5 kB/s eta 0:00:24
     --------------------------------------- 0.1/12.8 MB 722.1 kB/s eta 0:00:18
      -------------------------------------- 0.2/12.8 MB 985.7 kB/s eta 0:00:13
      --------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.4/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.5/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.6/12.8 MB 1.4 MB/s eta 0:00:09
     - -------------------------------------

In [2]:
import pdfplumber
import docx2txt

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text.strip()

def extract_text_from_docx(docx_path):
    return docx2txt.process(docx_path)


In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

nltk.download("punkt")
nltk.download("stopwords")

def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)   # Remove digits
    text = text.lower()               # Convert to lowercase
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]  # Remove special characters
    tokens = [word for word in tokens if word not in stopwords.words("english")]
    return tokens


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_name(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text
    return None


In [5]:
def extract_contact_details(text):
    phone = re.findall(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text)
    email = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    return {"phone": phone[0] if phone else None, "email": email[0] if email else None}


In [6]:
skills_db = {"python", "java", "machine learning", "deep learning", "nlp", "data science",
             "sql", "tensorflow", "pytorch", "aws", "django", "flask", "hadoop"}

def extract_skills(text):
    tokens = preprocess_text(text)
    found_skills = set()
    for token in tokens:
        if token in skills_db:
            found_skills.add(token)
    return list(found_skills)

In [7]:
def extract_education(text):
    education_patterns = ["bachelor", "master", "phd", "b.sc", "m.sc", "b.tech", "m.tech", "mba"]
    sentences = sent_tokenize(text)
    education_info = [sentence for sentence in sentences if any(deg in sentence.lower() for deg in education_patterns)]
    return education_info


In [8]:
def extract_experience(text):
    exp_patterns = re.findall(r"(\d{4})\s*-\s*(\d{4}|present)", text.lower())
    experiences = []
    for start, end in exp_patterns:
        experiences.append(f"{start} - {end}")
    return experiences


In [9]:
def parse_resume(file_path, file_type="pdf"):
    if file_type == "pdf":
        text = extract_text_from_pdf(file_path)
    else:
        text = extract_text_from_docx(file_path)

    return {
        "name": extract_name(text),
        "contact": extract_contact_details(text),
        "skills": extract_skills(text),
        "education": extract_education(text),
        "experience": extract_experience(text),
    }


In [11]:
resume_data = parse_resume(r"Sample Resumes\1901841_RESUME.pdf", file_type="pdf")
print(resume_data)


{'name': 'ANUVA GOYAL\nst\nD.O.B.', 'contact': {'phone': '9520349542', 'email': 'anuvagoyal111@gmail.com'}, 'skills': ['nlp', 'tensorflow', 'sql', 'python', 'java'], 'education': ['EDUCATION\nQualification Institute CGPA Year of Completion\nB.Tech.', 'SKILLS\n• Experience with C, C++, Python, JAVA, HTML, CSS, JavaScript, Data Structures, SQL\n• Software’s: PyCharm, Jupyter Notebook, Google Colab, Code Blocks, MATLAB, Turbo C++, MS Office\n• Machine Learning Frameworks: Scikit-Learn, TensorFlow, OpenCV, NumPy, Pytesseract, Keras\nACHIEVEMENTS AND CERTIFICATIONS\n• Secured 3rd position in TECH-A-THON organized by The ECE Society, BIT Mesra, Ranchi (Oct 2021)\n• Participant of 30 Days of Google Cloud (Sep 2021 - Oct 2021)\n• Won 1st prize in the online competition Game of Brands organized by SGGSCC, University of Delhi (Mar 2021)\n• Secured 2nd rank in Street Play, an Intra Faculty Competition on theme “Women Empowerment” (Sep 2021)\nWORKSHOPS AND EVENTS\n• Finalist of Rise in Crisis, a c