<a href="https://colab.research.google.com/github/soukainafarisse16/pdftoexcel/blob/main/PDFTOEXCEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# System packages: poppler-utils supplies the PDF rendering tools that
# pdf2image calls, and the Tesseract packages supply the OCR engine that
# pytesseract drives.
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
# Python libraries imported below, plus openpyxl (an Excel engine pandas can
# use when `to_excel` writes the .xlsx file).
!pip install pytesseract pdf2image pandas openpyxl

import os
import re
from pdf2image import convert_from_path
from pytesseract import image_to_string
import pandas as pd
from google.colab import files

# Ask for a PDF through Colab's upload widget; only the first uploaded
# file name is used.
uploaded = files.upload()
pdf_path = list(uploaded)[0]
print(f"Uploaded file: {pdf_path}")

# Turn every PDF page into an image that can be handed to the OCR engine.
pages = convert_from_path(pdf_path)
print(f"Number of pages converted: {len(pages)}")

# OCR the pages in order; each chunk is prefixed with a "--- Page N ---"
# header so the page boundaries stay visible in the combined text.
page_chunks = []
for page_number, page in enumerate(pages, start=1):
    page_text = image_to_string(page)
    page_chunks.append(f"--- Page {page_number} ---\n" + page_text + "\n")
    print(f"OCR completed for page {page_number}")
ocr_text = "".join(page_chunks)

print("\n===== OCR Extracted Text Preview (First 20000 characters) =====")
print(ocr_text[:20000])

def parse_candidates(ocr_text):
    """Extract candidate records from the OCR text of the search-result pages.

    A record is recognised when the text contains, in this order:

    * a line ``<Name> - <digits>°`` (capitalised words, then the connection
      degree),
    * a title line followed by a blank line,
    * a ``<location> - <industry>`` line followed by a blank line,
    * one more line, which is searched for the company name.

    The company is the text that follows the standalone word ``presso``,
    ``for`` or ``at`` on that last line, up to a four-digit year or the end
    of the line.

    Args:
        ocr_text: Text produced by the OCR step.

    Returns:
        A list of dicts with the keys ``name``, ``title``, ``location``,
        ``industry`` and ``company``. ``company`` is ``""`` when no keyword
        is found on the company line.
    """
    record_pattern = re.compile(
        r"(?P<name>[A-Z][a-z]+(?:\s[A-Z][a-z]+)*)\s-\s\d+°\n"
        r"(?P<title>.*?)\n\n"
        r"(?P<location>.*?)(?:\s-\s(?P<industry>.*?))\n\n"
        r"(?P<company_line>.*?)\n?\n"  # Capture the whole line containing company info
    )
    # The leading \b makes the keyword a whole word. Without it, the "at "
    # that ends words such as "Great" or "Format" was taken as the keyword
    # and everything after it was reported as the company.
    company_pattern = re.compile(r"\b(?:presso|for|at)\s(.*?)(?:\s\d{4}|$)")

    candidates = []
    for match in record_pattern.finditer(ocr_text):
        candidate = match.groupdict()
        # The raw line is only an intermediate value; it is not part of the
        # returned record.
        company_line = candidate.pop("company_line") or ""

        company_match = company_pattern.search(company_line)
        candidate["company"] = company_match.group(1).strip() if company_match else ""

        candidates.append(candidate)
    return candidates

# Parse the combined OCR text, then report how many records were found and
# show the first four as a quick sanity check.
parsed_data = parse_candidates(ocr_text)
print(f"Parsed {len(parsed_data)} candidates")
print(parsed_data[:4])

def save_to_excel(data, output_file):
    """Write the given records to an Excel workbook and report where it went.

    Args:
        data: Anything ``pd.DataFrame`` accepts; in this notebook, the list
            of candidate dicts.
        output_file: Path of the workbook to write. The index column is not
            written.
    """
    pd.DataFrame(data).to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

# Relative path, so the workbook is written to the current working directory.
output_file = "candidates(9).xlsx"
save_to_excel(parsed_data, output_file)

# Hand the generated workbook to Colab's download helper.
files.download(output_file)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 18 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 0s (373 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 124926 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.6_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.6) ...
Setting up poppler-utils (22.02.0-2ubuntu0.6) ...
Processing triggers for man-db (2.10.2-1) ...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
T

Saving general2-10.pdf to general2-10.pdf
Uploaded file: general2-10.pdf
Number of pages converted: 20
OCR completed for page 1
OCR completed for page 2
OCR completed for page 3
OCR completed for page 4
OCR completed for page 5
OCR completed for page 6
OCR completed for page 7
OCR completed for page 8
OCR completed for page 9
OCR completed for page 10
OCR completed for page 11
OCR completed for page 12
OCR completed for page 13
OCR completed for page 14
OCR completed for page 15
OCR completed for page 16
OCR completed for page 17
OCR completed for page 18
OCR completed for page 19
OCR completed for page 20

===== OCR Extracted Text Preview (First 20000 characters) =====
--- Page 1 ---
ff Recruiter Q @ Aa®e@ (& =
Spotlight m @ Q)
110K+ 30K+ 21K+
Disponibili a lavorare Talenti attivi Candidati riscoperti

() RISULTATI: 280K+ Mostra dettagli ricerca v

€

Esperienza

Istruzione
Interesse

Attivita

« 226-250 >

Massimiliano Granaroli - 2°
Senior Fleet & Asset Manager

Rome, Latium, Italy 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# System packages for the Streamlit app: poppler-utils for pdf2image's PDF
# rendering and Tesseract for pytesseract's OCR.
!apt-get install -y poppler-utils tesseract-ocr libtesseract-dev
# Python libraries used by app.py and by the tunnel cell (streamlit, pyngrok),
# plus openpyxl as an Excel engine for pandas.
!pip install streamlit pyngrok pytesseract pdf2image pandas openpyxl


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import re
from pdf2image import convert_from_bytes
from pytesseract import image_to_string
from io import BytesIO

# On Windows, point pytesseract at the default Tesseract install location.
import os
if os.name == 'nt':
    # Import the package itself here: the imports above only bring in
    # `image_to_string`, so the bare name `pytesseract` would otherwise be
    # undefined and this branch would raise NameError.
    import pytesseract
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Streamlit UI - Title
st.set_page_config(page_title="PDF to Excel Converter", page_icon="📄", layout="wide")
st.title("📄 AI-Powered PDF to Excel Extractor")

# Function to extract text from PDF
def extract_text_from_pdf(uploaded_file):
    """OCR every page of an uploaded PDF and return the combined text.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the PDF
            bytes (here, the Streamlit upload).

    Returns:
        The OCR text of all pages in order, each page followed by a newline,
        with every occurrence of the literal ``"Mostra tutto"`` removed
        (presumably a "show all" UI label picked up from the page; verify
        against the source PDFs).
    """
    page_images = convert_from_bytes(uploaded_file.read())
    # "--psm 6" selects Tesseract page-segmentation mode 6.
    page_texts = [
        image_to_string(page_image, config="--psm 6") + "\n"
        for page_image in page_images
    ]
    return "".join(page_texts).replace("Mostra tutto", "")

# Function to parse extracted text
def parse_candidates(ocr_text):
    """Extract candidate records from OCR text laid out one field per line.

    A record is four consecutive lines::

        <First> <Last>
        <title>
        <company>
        <location>[ - <industry>]

    Args:
        ocr_text: Text returned by the OCR step.

    Returns:
        A list of dicts with the keys ``name``, ``title``, ``company``,
        ``location`` and ``industry``. Values are stripped of surrounding
        whitespace; ``industry`` is ``None`` when the location line has no
        `` - <industry>`` part.
    """
    # Only spaces and tabs are allowed inside the name, location and
    # industry fields. The previous pattern used \s there, which also
    # matches newlines, so a single location/industry capture could run on
    # over the following lines and swallow the next candidates entirely.
    pattern = re.compile(
        r"(?P<name>[A-Z][a-zA-Z]+[ \t]+[A-Z][a-zA-Z]+)\n"
        r"(?P<title>.+?)\n"
        r"(?P<company>.+?)\n"
        r"(?P<location>[A-Za-zÀ-ÖØ-öø-ÿ \t]+)"
        r"(?:[ \t]*-[ \t]*(?P<industry>[A-Za-zÀ-ÖØ-öø-ÿ \t]+))?"
    )

    return [
        {
            field: value.strip() if value is not None else None
            for field, value in match.groupdict().items()
        }
        for match in pattern.finditer(ocr_text)
    ]

# Sidebar for file upload
st.sidebar.title("📂 Upload PDF File")
st.sidebar.write("Upload a **PDF file** and extract structured **candidate details**.")

uploaded_file = st.sidebar.file_uploader("Choose a PDF", type=["pdf"])

# Everything below runs only once a file has been chosen in the sidebar.
if uploaded_file:
    with st.spinner("⏳ Processing your file... Please wait."):
        extracted_text = extract_text_from_pdf(uploaded_file)
        parsed_data = parse_candidates(extracted_text)

    if parsed_data:
        df = pd.DataFrame(parsed_data)

        # Ensure all required columns exist
        required_columns = ["name", "title", "company", "location", "industry"]
        for col in required_columns:
            if col not in df:
                df[col] = "Not Available"

        # Show data preview
        st.success("✅ Extraction complete! Here's a preview of the data:")
        st.dataframe(df)

        # Save Excel file in memory.
        # openpyxl is the Excel engine the notebook's install cell provides
        # (xlsxwriter is not installed there). The `with` block saves and
        # closes the writer on exit, so no explicit close() is needed; an
        # extra close() inside the block would close the writer twice.
        output = BytesIO()
        with pd.ExcelWriter(output, engine="openpyxl") as writer:
            df.to_excel(writer, index=False)

        # Provide Download Button
        st.download_button(
            label="📥 Download Excel File",
            data=output.getvalue(),
            file_name="candidates.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
    else:
        st.error("⚠️ No candidates found. Try another file.")


Overwriting app.py


In [None]:
from pyngrok import ngrok
import subprocess

# Kill any existing ngrok processes
!pkill -9 ngrok

# Run Streamlit in the background
process = subprocess.Popen(["streamlit", "run", "app.py"])

# Check for existing tunnel (modification to prevent reaching the connection limit)
try:
    tunnels = ngrok.get_tunnels()
    if tunnels:
        url = tunnels[0].public_url  # Use existing tunnel if found
        print(f"🚀 Using existing ngrok tunnel: {url}")
    else:
        url = ngrok.connect(8501, "http") # If not, connect using the given port and protocol
        print(f"🚀 Open your Streamlit app here: {url}")
except Exception as e:
    print(f"Failed to connect to ngrok: {e}")
    # Consider printing the list of existing tunnels using ngrok.get_tunnels() to debug further



🚀 Open your Streamlit app here: NgrokTunnel: "https://6155-34-86-179-223.ngrok-free.app" -> "http://localhost:8501"


In [None]:
!pip install pyngrok
!ngrok authtoken 2sfnxLdzn0NroEeyzjYrmbUoZNJ_TFr69vSqRR13xL5mJEJm


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
