In [8]:
pip install textract

Note: you may need to restart the kernel to use updated packages.Collecting textract
  Downloading textract-1.6.5-py3-none-any.whl.metadata (2.5 kB)
Collecting argcomplete~=1.10.0 (from textract)
  Downloading argcomplete-1.10.3-py2.py3-none-any.whl.metadata (16 kB)
Collecting beautifulsoup4~=4.8.0 (from textract)
  Downloading beautifulsoup4-4.8.2-py3-none-any.whl.metadata (4.1 kB)
Collecting docx2txt~=0.8 (from textract)
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting extract-msg<=0.29.* (from textract)
  Downloading extract_msg-0.28.7-py2.py3-none-any.whl.metadata (7.8 kB)
Collecting pdfminer.six==20191110 (from textract)
  Downloading pdfminer.six-20191110-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting python-pptx~=0.6.18 (from textract)
  Downloading python_pptx-0.6.23-py3-none-any.whl.metadata (18 kB)
Collecting six~=1.12.0 (from textract)
  Downloading six-1.12.0-py2.py

DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
py2neo 2021.2.4 requires six>=1.15.0, but you have six 1.12.0 which is incompatible.

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import re
import PyPDF2
import textract
import streamlit as st
import xlwt

# Patterns shared by both extractors so PDF and DOCX parsing cannot drift apart.
# The TLD class is [A-Za-z]; the earlier "[A-Z|a-z]" also accepted a literal "|".
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# A run of exactly ten digits delimited by word boundaries.
_CONTACT_PATTERN = re.compile(r'\b\d{10}\b')


def _find_email_and_contact(text):
    """Return ``(email, contact)`` for the first matches found in ``text``.

    Each element is an empty string when its pattern does not match.
    """
    email_match = _EMAIL_PATTERN.search(text)
    contact_match = _CONTACT_PATTERN.search(text)
    return (
        email_match.group(0) if email_match else '',
        contact_match.group(0) if contact_match else '',
    )


def extract_info_from_pdf(pdf_path):
    """Extract the text of a PDF plus the first e-mail and 10-digit number in it.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(email, contact, text)`` tuple. ``text`` is the concatenation of
        the text extracted from every page, in page order; ``email`` and
        ``contact`` are '' when nothing matching is found.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Guard against pages for which extract_text() yields nothing
        # (e.g. no text layer) so the concatenation never sees a non-string.
        text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)

    email, contact = _find_email_and_contact(text)
    return email, contact, text


def extract_info_from_doc(doc_path):
    """Extract the text of a document plus the first e-mail and 10-digit number.

    The bytes returned by ``textract.process`` are decoded as UTF-8.

    Args:
        doc_path: Path of the document to hand to ``textract``.

    Returns:
        A ``(email, contact, text)`` tuple; ``email`` and ``contact`` are ''
        when nothing matching is found.
    """
    text = textract.process(doc_path).decode('utf-8')
    email, contact = _find_email_and_contact(text)
    return email, contact, text

def extract_info_from_cv(cv_path):
    """Dispatch a CV file to the extractor that matches its extension.

    The extension is compared case-insensitively, so ``Resume.PDF`` and
    ``cv.DOCX`` are handled the same way as their lower-case spellings.

    Args:
        cv_path: Path of the CV file.

    Returns:
        The ``(email, contact, text)`` tuple produced by the PDF or DOCX
        extractor, or ``('', '', '')`` after printing a notice when the
        extension is neither ``.pdf`` nor ``.docx``.
    """
    _, ext = os.path.splitext(cv_path)
    normalized_ext = ext.lower()

    if normalized_ext == '.pdf':
        return extract_info_from_pdf(cv_path)
    if normalized_ext == '.docx':
        return extract_info_from_doc(cv_path)

    # Report the extension exactly as it appeared in the file name.
    print(f"Unsupported file format: {ext}")
    return '', '', ''

def save_to_excel(data, output_path):
    """Write extracted CV rows to an ``.xls`` workbook with one 'CV Data' sheet.

    Row 0 holds the headers ``Email``, ``Contact`` and ``Text``; each item of
    ``data`` fills one following row.

    Args:
        data: Iterable of ``(email, contact, text)`` triples.
        output_path: Destination path passed to ``Workbook.save``.

    Note:
        String values longer than 32767 characters are clipped to that length
        before being written, because the .xls writer rejects longer cell
        strings and a single long CV would otherwise abort the whole export.
    """
    max_cell_chars = 32767  # maximum characters a single Excel cell can hold

    def _fit_cell(value):
        # Only strings are clipped; other values are passed through untouched.
        if isinstance(value, str) and len(value) > max_cell_chars:
            return value[:max_cell_chars]
        return value

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('CV Data')

    # Header row
    for col, header in enumerate(('Email', 'Contact', 'Text')):
        sheet.write(0, col, header)

    # One row per CV, starting right below the headers
    for row, (email, contact, text) in enumerate(data, start=1):
        for col, value in enumerate((email, contact, text)):
            sheet.write(row, col, _fit_cell(value))

    workbook.save(output_path)

def main():
    """Render the Streamlit "CV Parser" page.

    Lets the user upload PDF/DOCX files, and when "Parse CVs" is pressed:
    copies each upload into ``./temp``, runs ``extract_info_from_cv`` on it,
    writes all results to ``cv_data.xls`` via ``save_to_excel`` and offers
    that workbook through a download button.

    Pressing the button with no uploads shows a warning instead of producing
    an empty workbook.
    """
    st.title("CV Parser")

    uploaded_files = st.file_uploader("Upload CVs", accept_multiple_files=True, type=['pdf', 'docx'])

    if st.button("Parse CVs"):
        if not uploaded_files:
            st.warning("Please upload at least one CV before parsing.")
            return

        # The scratch directory is not guaranteed to exist on a fresh checkout.
        temp_dir = "./temp"
        os.makedirs(temp_dir, exist_ok=True)

        data = []

        # Iterate over uploaded files
        for uploaded_file in uploaded_files:
            # The file name comes from the client: keep only its final path
            # component so it cannot escape the scratch directory.
            safe_name = os.path.basename(uploaded_file.name.replace('\\', '/'))
            cv_path = os.path.join(temp_dir, safe_name)
            with open(cv_path, 'wb') as f:
                f.write(uploaded_file.getvalue())

            email, contact, text = extract_info_from_cv(cv_path)
            data.append((email, contact, text))

        # Save extracted data to Excel
        output_path = 'cv_data.xls'
        save_to_excel(data, output_path)
        st.success("CVs parsed successfully. Download the Excel file below.")

        # Read through a context manager so the file handle is closed promptly.
        with open(output_path, 'rb') as workbook_file:
            workbook_bytes = workbook_file.read()
        st.download_button(label="Download CV Data", data=workbook_bytes, file_name="cv_data.xls", mime="application/vnd.ms-excel")

# Entry point: build the page only when this module is executed directly.
# NOTE(review): Streamlit apps are normally launched with `streamlit run <file>.py`;
# calling main() from a notebook kernel may not render the UI — confirm intended usage.
if __name__ == "__main__":
    main()

Unsupported file format: .docx
Unsupported file format: .docx
Skipped file heemSen.doc as it is not in DOCX or PDF format.
Skipped file ManrajMeena.doc as it is not in DOCX or PDF format.
Skipped file MINTUKMUAR.doc as it is not in DOCX or PDF format.
Skipped file NavinShakti.doc as it is not in DOCX or PDF format.
Skipped file RamanKumar.doc as it is not in DOCX or PDF format.
Skipped file RohitBhatt.doc as it is not in DOCX or PDF format.
Skipped file Satyadev.doc as it is not in DOCX or PDF format.
Skipped file VijayKumarS.doc as it is not in DOCX or PDF format.
