In [2]:
import PyPDF2
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import docx2txt
from docx import Document

# Download necessary resources from NLTK.
# 'stopwords' backs stopwords.words('english'); 'punkt' is the sentence
# tokenizer model used by word_tokenize on older NLTK releases, while newer
# releases (NLTK 3.9 and later) load the 'punkt_tab' package instead, so both
# are fetched to keep word_tokenize working regardless of the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to extract text content from a PDF file
def get_pdf_content(path):
    """Return the whitespace-normalized text of every page in a PDF file.

    Args:
        path: File system path of the PDF to read.

    Returns:
        The text of all pages, in page order, with every run of whitespace
        (including non-breaking spaces and newlines) collapsed to a single
        space and no leading or trailing whitespace.
    """
    # The reader parses pages lazily, so all extraction has to happen while
    # the file is still open; the context manager guarantees it gets closed.
    with open(path, 'rb') as pdf_file:
        if hasattr(PyPDF2, "PdfReader"):
            # Current PyPDF2 API; the legacy names below raise in PyPDF2 3.x.
            reader = PyPDF2.PdfReader(pdf_file)
            page_texts = [page.extract_text() or "" for page in reader.pages]
        else:
            # Legacy API, kept for PyPDF2 releases that predate PdfReader.
            reader = PyPDF2.PdfFileReader(pdf_file)
            page_texts = [
                reader.getPage(i).extractText()
                for i in range(reader.getNumPages())
            ]

    # Clean and normalize the content
    content = "\n".join(page_texts)
    return " ".join(content.replace(u"\xa0", " ").strip().split())

# Function to extract text content from a DOCX file
def get_text_from_docx(filename):
    """Return the text that docx2txt extracts from the DOCX file *filename*."""
    extracted_text = docx2txt.process(filename)
    return extracted_text

# Function to process a resume and extract the parsed information
def process_resume(filename, output_path='readme.docx'):
    """Parse a PDF or DOCX resume, print the findings and save them to a DOCX.

    The resume text is reduced to ASCII, tokenized, and stripped of English
    stop words and punctuation tokens. A name (the first two remaining
    tokens), the first email address and the first mobile number are
    extracted, 10-word shingles are generated, and everything is printed.
    The name, email, mobile number and cleaned text are then written to a
    new Word document.

    Args:
        filename: Path of the resume; must end in ".pdf" or ".docx"
            (compared case-insensitively).
        output_path: Where the generated Word document is saved.

    Raises:
        SystemExit: With status 1 when the file extension is not supported.
    """
    lowered = filename.lower()
    if lowered.endswith(".pdf"):
        # Extract text content from the PDF file
        raw_text = get_pdf_content(filename)
    elif lowered.endswith(".docx"):
        # Extract text content from the DOCX file
        raw_text = get_text_from_docx(filename)
    else:
        print("File format is currently not supported")
        # Non-zero status: this is a failure, and SystemExit (unlike the
        # interactive-only exit() helper) is always available.
        raise SystemExit(1)

    # Drop every non-ASCII character so later steps only see plain ASCII.
    resume = raw_text.encode("ascii", "ignore").decode("ascii")

    print("Processing...\nPlease wait....")

    # Tokenize the resume into individual words
    tokens = word_tokenize(resume)

    # Define a set of stop words
    stop_words = set(stopwords.words('english'))

    # Filter out stop words and punctuation marks from the tokens
    filtered = [
        w for w in tokens
        if w.lower() not in stop_words and not _is_punctuation(w)
    ]
    print("Removing stop words...\nCleaning the resume...\nExtracting Text...")
    print(filtered)

    # Assume the first two tokens are the first and last name; slicing keeps
    # this safe when the resume yields fewer than two tokens.
    name = ' '.join(filtered[:2])
    print("Name:", name)

    # Extract the email address from the resume using regex
    email = _first_match(r'[\w\.-]+@[\w\.-]+', resume)
    print("Email:", email)

    # Extract the mobile number: an optional +91 country code (optionally
    # parenthesized and followed by a space or hyphen) and then 10 digits.
    mobile = _first_match(r'(?:\(?\+91\)?[\s-]?)?\d{10}', resume)
    print("Mobile:", mobile)

    # Join the filtered tokens back into a single string representing the parsed resume
    parsed_resume = ' '.join(filtered)
    print("Parsed Resume in plain Text:", parsed_resume)

    # Generate shingles (sequences of 10 consecutive words) from the filtered tokens
    shingles = list(nltk.ngrams(filtered, 10))
    print("Shingles for the resume:", shingles)

    # Create a new Word document
    document = Document()
    document.add_heading('Parsed Resume', 0)
    document.add_paragraph(f'Name: {name}')
    document.add_paragraph(f'Email: {email}')
    document.add_paragraph(f'Mobile: {mobile}')
    document.add_paragraph(f'Parsed Resume in plain Text:\n{parsed_resume}')
    document.save(output_path)

    print(f"\nParsed information saved to {output_path}")


def _first_match(pattern, text):
    """Return the first substring of *text* matching *pattern*, or "" if none."""
    found = re.search(pattern, text)
    return found.group(0) if found is not None else ""


def _is_punctuation(token):
    """Return True when *token* consists only of ASCII punctuation characters."""
    return all(ch in string.punctuation for ch in token)


# Prompt the user to enter the file name or path. Surrounding whitespace and
# quotes (e.g. from a pasted, quoted path) are stripped so that they do not
# defeat the file-extension check performed by process_resume.
filename = input("Enter file name / path: ").strip().strip('"\'')

# Process the resume and save the parsed information to the readme.docx file
process_resume(filename)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter file name / path: resume (1).pdf
Processing...
Please wait....
Removing stop words...
Cleaning the resume...
Extracting Text...
['Functional', 'Resume', 'Sample', 'John', 'W.', 'Smith', '2002', 'Front', 'Range', 'W', 'Fort', 'Collins', 'CO', '80525', 'jwsm', 'ith', 'colosta', 'te.edu', 'Career', 'Summary', 'Four', 'years', 'experience', 'early', 'childhood', 'developm', 'ent', 'di', 'verse', 'background', 'care', 'special', 'need', 'child', 'ren', 'adults', 'Adult', 'Care', 'Experience', 'Determ', 'ined', 'work', 'placem', 'e', 'nt', '150', 'special', 'needs', 'adult', 'clients', 'Maintained', 'client', 'databases', 'records', 'Coordinated', 'clien', 'conta', 'c', 'local', 'health', 'care', 'professionals', 'ont', 'hly', 'basis', 'Managed', '25', 'volunteer', 'workers', 'Childcare', 'Experien', 'ce', 'Coordinated', 'service', 'ass', 'gnm', 'e', 'nts', '20', 'part', '-tim', 'e', 'counselors', '100', 'client', 'fam', 'ilies', 'Oversaw', 'daily', 'activity', 'outing', 'planning', '1