In [1]:
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF for PDF processing
import csv
import os
import fitz  # PyMuPDF

In [2]:
# ----------------- CONFIGURATION -----------------
# Page that is scraped for the stock-guide PDF link.
BASE_URL = "https://www.tunisievaleurs.com/"

# Working directories (relative to the current working directory).
PDF_DIR = "pdf"
TXT_DIR = "txt"
CSV_DIR = "csv"

# Downloaded PDF.
PDF_FILENAME = os.path.join(PDF_DIR, "Stock_Guide_Marche_Principal.pdf")

# Intermediate text files written and read by the pipeline steps.
TXT_FILE = os.path.join(TXT_DIR, "stockgfr.txt")
NAMES_FILE = os.path.join(TXT_DIR, "names.txt")
ALL_FILE = os.path.join(TXT_DIR, "all.txt")
SECTORS_FILE = os.path.join(TXT_DIR, "sectors.txt")

# Final CSV output.
CSV_FILE = os.path.join(CSV_DIR, "companies_sectors.csv")

# Ensure directories exist
for _directory in (PDF_DIR, TXT_DIR, CSV_DIR):
    os.makedirs(_directory, exist_ok=True)

In [12]:
# ----------------- STEP 1: DOWNLOAD PDF -----------------
def get_pdf_link(timeout=30):
    """Finds the PDF link for 'Stock Guide du Marché Principal'.

    Fetches BASE_URL, parses the HTML and returns the target of the first
    ``<a href=...>`` element whose text contains
    "Stock Guide du Marché Principal".

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        The absolute URL of the PDF as a string, or None when no matching
        link is present on the page.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from urllib.parse import urljoin

    response = requests.get(BASE_URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and reporting
    # a misleading "no link found".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a', href=True):
        if "Stock Guide du Marché Principal" in link.text:
            # urljoin leaves absolute URLs untouched and correctly resolves
            # relative, root-relative ("/x.pdf") and protocol-relative
            # ("//host/x.pdf") hrefs, which plain concatenation does not.
            return urljoin(BASE_URL, link['href'])
    return None

def download_pdf(pdf_url, timeout=60):
    """Downloads the PDF and saves it to PDF_FILENAME.

    Args:
        pdf_url: Absolute URL of the PDF to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            response body is then NOT written, so an HTML error page is never
            stored under the PDF file name.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    with open(PDF_FILENAME, "wb") as file:
        file.write(response.content)
    print(f"✅ PDF downloaded successfully: {PDF_FILENAME}")

In [4]:
# Rectangle (bounding box) in PDF points; a text block is kept when its
# top-left corner lies inside these bounds.
x_min = 36.86
x_max = 45.70
y_min = 100.78
y_max = 784.80

def extract_text_from_pdf():
    """Extracts text from a specific region in the PDF and saves it as a TXT file.

    Opens PDF_FILENAME, walks every page's text blocks and keeps the text of
    each block whose top-left corner (x0, y0) lies inside the rectangle
    [x_min, x_max] x [y_min, y_max]. Only the top-left corner is tested; the
    block may extend beyond the rectangle. The kept texts are stripped and
    written to TXT_FILE, one block per entry, joined by newlines.
    """
    extracted_text = []

    doc = fitz.open(PDF_FILENAME)
    try:
        for page in doc:
            # Each block unpacks as (x0, y0, x1, y1, text, ...); only the
            # top-left corner and the text are needed here.
            for x0, y0, _x1, _y1, text, *_ in page.get_text("blocks"):
                if (x_min <= x0 <= x_max) and (y_min <= y0 <= y_max):
                    extracted_text.append(text.strip())  # Strip to clean empty lines
    finally:
        # Always release the document handle, even if extraction fails.
        doc.close()

    # Save extracted text to file
    with open(TXT_FILE, "w", encoding="utf-8") as f:
        f.write("\n".join(extracted_text))

    print(f"✅ Text extracted and saved: {TXT_FILE}")

In [5]:
# ----------------- STEP 3: EXTRACT SECTOR NAMES -----------------
def extract_sectors(txt_file=None, names_file=None):
    """Extracts the lines that directly precede a numeric line and saves them.

    A line is kept when, after stripping, it is non-empty, contains at least
    one uppercase letter and no digit, and the line immediately after it
    contains at least one digit. The last line of the input can never be kept
    because it has no following line.

    NOTE(review): despite the function name, create_csv treats the lines that
    are NOT selected here as sector names, so the lines selected here are
    presumably company names -- confirm before renaming anything.

    Args:
        txt_file: Path of the text file to read. Defaults to TXT_FILE.
        names_file: Path of the file to write. Defaults to NAMES_FILE.
    """
    txt_file = TXT_FILE if txt_file is None else txt_file
    names_file = NAMES_FILE if names_file is None else names_file

    with open(txt_file, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    # Pair every line with its successor; zip stops before the last line.
    sectors = [
        current_line
        for current_line, next_line in zip(lines, lines[1:])
        if current_line
        and any(char.isupper() for char in current_line)
        and not any(char.isdigit() for char in current_line)
        and any(char.isdigit() for char in next_line)
    ]

    with open(names_file, "w", encoding="utf-8") as f:
        f.write("\n".join(sectors))

    print(f"✅ Sectors extracted and saved: {names_file}")

In [6]:
# ----------------- STEP 4: FILTER LINES WITH CAPITAL LETTERS -----------------
def filter_lines_with_capitals(txt_file=None, all_file=None):
    """Filters lines that contain at least one capital letter and saves them.

    Lines are copied unchanged (including their line endings) and in their
    original order; lines without any uppercase character are dropped.

    Args:
        txt_file: Path of the text file to read. Defaults to TXT_FILE.
        all_file: Path of the file to write. Defaults to ALL_FILE.
    """
    txt_file = TXT_FILE if txt_file is None else txt_file
    all_file = ALL_FILE if all_file is None else all_file

    with open(txt_file, "r", encoding="utf-8") as f:
        lines = [line for line in f if any(char.isupper() for char in line)]

    with open(all_file, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print(f"✅ Filtered text saved: {all_file}")

In [7]:
# ----------------- STEP 5: EXTRACT COMPANY NAMES -----------------
def extract_company_names(all_file=None, names_file=None, sectors_file=None):
    """Writes the lines of the "all" file that do not occur in the "names" file.

    Lines are compared after stripping surrounding whitespace. Each remaining
    line is written once, in order of first appearance, so the output is
    deterministic across runs (a plain set difference would give an arbitrary
    order).

    NOTE(review): despite the function name, the result goes to SECTORS_FILE
    and create_csv reads that file as the list of sector names -- the
    company/sector naming looks swapped; confirm.

    Args:
        all_file: Path of the file holding all candidate lines. Defaults to
            ALL_FILE.
        names_file: Path of the file whose lines are removed. Defaults to
            NAMES_FILE.
        sectors_file: Path of the file to write. Defaults to SECTORS_FILE.
    """
    all_file = ALL_FILE if all_file is None else all_file
    names_file = NAMES_FILE if names_file is None else names_file
    sectors_file = SECTORS_FILE if sectors_file is None else sectors_file

    with open(all_file, "r", encoding="utf-8") as f1, open(names_file, "r", encoding="utf-8") as f2:
        candidates = [line.strip() for line in f1]
        excluded = {line.strip() for line in f2}

    # dict.fromkeys de-duplicates while keeping first-appearance order.
    unique_lines = list(dict.fromkeys(
        line for line in candidates if line not in excluded
    ))

    with open(sectors_file, "w", encoding="utf-8") as f_out:
        f_out.write("\n".join(unique_lines))

    print(f"✅ Unique companies extracted and saved: {sectors_file}")

In [8]:
# ----------------- STEP 6: CREATE CSV FILE -----------------
def create_csv(sectors_file=None, all_file=None, csv_file=None):
    """Creates a CSV file mapping companies to sectors.

    The non-empty lines of the sectors file are the sector names. The "all"
    file is then read top to bottom: a line equal to a sector name becomes the
    current sector, and every other non-empty line is recorded as a company of
    the current sector. Lines that appear before the first sector line are
    skipped. If a company line occurs more than once, its last sector wins
    while its position in the output stays that of the first occurrence.

    The CSV has the header row ``VALEUR,SECTOR`` followed by one row per
    company.

    Args:
        sectors_file: Path of the file listing sector names. Defaults to
            SECTORS_FILE.
        all_file: Path of the file with sector and company lines in document
            order. Defaults to ALL_FILE.
        csv_file: Path of the CSV file to write. Defaults to CSV_FILE.
    """
    sectors_file = SECTORS_FILE if sectors_file is None else sectors_file
    all_file = ALL_FILE if all_file is None else all_file
    csv_file = CSV_FILE if csv_file is None else csv_file

    with open(sectors_file, "r", encoding="utf-8") as f:
        sector_names = {line.strip() for line in f if line.strip()}

    company_sector_mapping = {}
    current_sector = None

    with open(all_file, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            if line in sector_names:
                current_sector = line
            elif line and current_sector:
                company_sector_mapping[line] = current_sector

    # newline="" lets the csv module control line endings itself.
    with open(csv_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["VALEUR", "SECTOR"])
        writer.writerows(company_sector_mapping.items())

    print(f"✅ CSV file created: {csv_file}")

In [9]:
# ----------------- MAIN EXECUTION -----------------
def main():
    """Run the full pipeline: find and download the PDF, then build the text files and the CSV.

    Prints an error message and does nothing else when no PDF link is found.
    """
    pdf_url = get_pdf_link()
    if not pdf_url:
        print("❌ No PDF link found.")
        return

    download_pdf(pdf_url)

    # Each step consumes files written by earlier steps, so the order is fixed.
    for step in (
        extract_text_from_pdf,
        extract_sectors,
        filter_lines_with_capitals,
        extract_company_names,
        create_csv,
    ):
        step()

    print("🎉 Process completed successfully!")

In [13]:
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

✅ PDF downloaded successfully: pdf\Stock_Guide_Marche_Principal.pdf
✅ Text extracted and saved: txt\stockgfr.txt
✅ Sectors extracted and saved: txt\names.txt
✅ Filtered text saved: txt\all.txt
✅ Unique companies extracted and saved: txt\sectors.txt
✅ CSV file created: csv\companies_sectors.csv
🎉 Process completed successfully!
