In [1]:
import pdfplumber
import re
import pandas as pd
import os

In [2]:
# Path to the source PDF, relative to the notebook's working directory
# (the file is expected inside a "pdf" sub-folder).
pdf_path = "pdf/repartition_des_titres_2025.pdf"

# Extract the text of every page exactly once. extract_text() re-runs the page
# layout analysis on each call, so calling it both in the filter and for the
# value would double the work.
with pdfplumber.open(pdf_path) as pdf:
    page_texts = [page.extract_text() for page in pdf.pages]

# Drop pages that produced no text (None or empty string) and join the rest
# with newlines so page boundaries remain line boundaries.
raw_text = "\n".join(text for text in page_texts if text)

# Show the first 1000 characters as a quick sanity check of the extraction
print(raw_text[:1000])

Répartition des titres de capital de la Cote de la Bourse
en compartiments et en groupes de cotation pour l’année 2025
I- Conformément à la décision de la Bourse N° 1/6/2021 sur le compartimentage, telle que
modifiée par décision du Conseil de la Bourse du 02 mai 2023, les émetteurs sont affectés aux
compartiments du Marché Principal comme suit :
(i) Le compartiment A inclut les émetteurs dont la capitalisation boursière est
supérieure ou égale à 200 millions de dinars.
(ii) Le compartiment B inclut les émetteurs dont la capitalisation boursière est
inférieure à 200 millions de dinars.
(iii) Le compartiment S (Sous Surveillance) inclut les émetteurs qui :
a. ne respectent pas leurs obligations en matière d’information financière.
b. sont touchés par des événements susceptibles de perturber durablement
leur situation ou compromettre le bon fonctionnement du marché.
La capitalisation boursière utilisée pour l’affectation aux compartiments A et B est la moyenne
quotidienne des capitalisat

In [3]:
# Cut the extracted text at every compartment-listing heading.
# Element 0 holds whatever precedes the first heading; each following
# element starts right after one occurrence of the heading.
section_marker = "Liste des valeurs composant le compartiment"
compartiment_sections = raw_text.split(section_marker)

# Report how many pieces the split produced
section_count = len(compartiment_sections)
print(f"Number of sections found: {section_count}")

# Show the opening 500 characters of the first listing to check its layout
first_listing = compartiment_sections[1]
print(first_listing[:500])

Number of sections found: 4
 « A »
COMPARTIMENT « A »
N° CODE ISIN MNEMO VALEUR GROUPE DE COTATION
1 TN0001100254 SFBT SFBT 11 - Continu
2 TN0001400704 SPDIT SPDIT - SICAF 11 - Continu
3 TN0001600154 TJARI ATTIJARI BANK 11 - Continu
4 TN0001800457 BIAT BIAT 11 - Continu
5 TN0001900604 BH BH BANK 11 - Continu
6 TN0002200053 BT BT 11 - Continu
7 TN0002600955 STB STB 11 - Continu
8 TN0003100609 BNA BNA 11 - Continu
9 TN0003400058 AB AMEN BANK 11 - Continu
10 TN0003600350 ATB ATB 11 - Continu
11 TN0003900107 UIB UIB 11 - Conti


In [4]:
# Report, for every listing after the intro text, which compartment letter
# appears between « » in that section (if any).
for section_number, section_text in enumerate(compartiment_sections[1:], start=1):
    found = re.search(r"« (\w) »", section_text)
    status = f"Found Compartment - {found.group(1)}" if found else "No Compartment Found"
    print(f"Section {section_number}: {status}")

Section 1: Found Compartment - A
Section 2: Found Compartment - B
Section 3: Found Compartment - S


In [5]:
# Preview the beginning of one compartment section to inspect the raw line format.
# Index 0 of compartiment_sections is the intro text, so index 2 is the second listing.
section_index = 2
max_preview_lines = 25

lines = compartiment_sections[section_index].split("\n")
preview = lines[:max_preview_lines]

# Build the heading from the values actually used, so it cannot drift away
# from the index and slice above (it previously claimed "30 lines of section 1").
print(f"### First {len(preview)} lines of section {section_index} ###")
for i, line in enumerate(preview, start=1):
    print(f"{i}: {line}")

### First 30 lines of section 1 ###
1:  « B »
2: COMPARTIMENT « B »
3: N° CODE ISIN MNEMO VALEUR GROUPE DE COTATION
4: 1 TN0002100907 TLS TUNISIE LEASING F 11 - Continu
5: 2 TN0003200755 ICF ICF 11 - Continu
6: 3 TN0004700100 ATL ATL 11 - Continu
7: 4 TN0006530018 SOTET SOTETEL 11 - Continu
8: 5 TN0006580013 MGR SOTUMAG 11 - Continu
9: 6 TN0006590012 SIAME SIAME 11 - Continu
10: 7 TN0006610018 TJL ATTIJARI LEASING 11 - Continu
11: 8 TN0006660013 STPIL SOTRAPIL 11 - Continu
12: 9 TN0007380017 TRE TUNIS RE 11 - Continu
13: 10 TN0007510019 LNDOR LAND OR 11 - Continu
14: 11 TN0007540016 NBL NEW BODY LINE 11 - Continu
15: 12 TN0007570013 ECYCL EURO-CYCLES 11 - Continu
16: 13 TN0007620016 MPBS MPBS 11 - Continu
17: 14 TN0007630015 STPAP SOTIPAPIER 11 - Continu
18: 15 TN0007740012 SAM ATELIER MEUBLE INT 11 - Continu
19: 16 TNQPQXRODTH8 SMART SMART TUNISIE 11 - Continu
20: 17 TNNGTFLC2986 STA STA 11 - Continu
21: 18 TNDKJ8O68X14 AMV ASSU MAGHREBIA VIE 11 - Continu
22: 19 TN0001000108 MNP MONOP

In [6]:
# Parse every compartment section into rows of
# [N°, CODE ISIN, MNEMO, VALEUR, GROUPE DE COTATION], keyed by compartment letter.
preview_row_count = 20  # number of rows printed per compartment in the preview below


def _find_group_start(tokens):
    """Return the index in `tokens` where the quotation-group label begins.

    `tokens` are the whitespace-separated tokens that follow the MNEMO column.
    The label looks like "11 - Continu", so a purely numeric token immediately
    followed by a token starting with "-" is preferred: this keeps a standalone
    number inside the security name from being mistaken for the group code.
    If no such pair exists, fall back to the first purely numeric token.
    Returns None when no numeric token is present at all.
    """
    for k in range(len(tokens) - 1):
        if tokens[k].isdigit() and tokens[k + 1].startswith("-"):
            return k
    for k, token in enumerate(tokens):
        if token.isdigit():
            return k
    return None


tables_dict = {}
skipped_lines = []  # (compartment, line) pairs that looked like rows but could not be split

for i, section in enumerate(compartiment_sections[1:], start=1):  # index 0 is the intro text
    lines = section.split("\n")

    # The compartment letter is expected between « » on the section's first line
    match = re.search(r"« (\w) »", lines[0])
    compartment_type = match.group(1) if match else f"Unknown {i}"

    extracted_data = []
    for line in lines:
        parts = line.split()  # Split by whitespace

        # Only lines whose second token looks like a Tunisian ISIN are table rows;
        # headings, column titles and free text are ignored.
        if len(parts) < 5 or not re.match(r"TN[A-Z0-9]+", parts[1]):
            continue

        n, isin, mnemo = parts[:3]  # row number, ISIN, ticker
        tail = parts[3:]            # VALEUR tokens followed by the group label

        group_start = _find_group_start(tail)
        if group_start is None:
            # Keep track of the row instead of dropping it silently
            skipped_lines.append((compartment_type, line))
            continue

        valeur = " ".join(tail[:group_start])  # everything between MNEMO and the group code
        group = " ".join(tail[group_start:])   # group code and its label
        extracted_data.append([n, isin, mnemo, valeur, group])

    # Store in dictionary; merge rather than overwrite if a compartment key repeats
    if extracted_data:
        tables_dict.setdefault(compartment_type, []).extend(extracted_data)

# Surface rows that passed the ISIN check but had no recognizable group code
if skipped_lines:
    print(f"Skipped {len(skipped_lines)} row(s) without a quotation group:")
    for compartment, skipped in skipped_lines:
        print(f"  [{compartment}] {skipped}")

# Print preview
for key, rows in tables_dict.items():
    print(f"\n#### Compartiment {key} ####")
    for row in rows[:preview_row_count]:
        print(row)


#### Compartiment A ####
['1', 'TN0001100254', 'SFBT', 'SFBT', '11 - Continu']
['2', 'TN0001400704', 'SPDIT', 'SPDIT - SICAF', '11 - Continu']
['3', 'TN0001600154', 'TJARI', 'ATTIJARI BANK', '11 - Continu']
['4', 'TN0001800457', 'BIAT', 'BIAT', '11 - Continu']
['5', 'TN0001900604', 'BH', 'BH BANK', '11 - Continu']
['6', 'TN0002200053', 'BT', 'BT', '11 - Continu']
['7', 'TN0002600955', 'STB', 'STB', '11 - Continu']
['8', 'TN0003100609', 'BNA', 'BNA', '11 - Continu']
['9', 'TN0003400058', 'AB', 'AMEN BANK', '11 - Continu']
['10', 'TN0003600350', 'ATB', 'ATB', '11 - Continu']
['11', 'TN0003900107', 'UIB', 'UIB', '11 - Continu']
['12', 'TN0005700018', 'PGH', 'POULINA GP HOLDING', '11 - Continu']
['13', 'TN0006560015', 'SOTUV', 'SOTUVER', '11 - Continu']
['14', 'TN0007200017', 'WIFAK', 'WIFACK INT BANK', '11 - Continu']
['15', 'TN0007270010', 'TPR', 'TPR', '11 - Continu']
['16', 'TN0007300015', 'ARTES', 'ARTES', '11 - Continu']
['17', 'TN0007400013', 'CC', 'CARTHAGE CEMENT', '11 - Continu'

In [7]:
# Output folder for the per-compartment CSV files; created if it is missing
csv_directory = "csv"
os.makedirs(csv_directory, exist_ok=True)

# Column headers shared by every exported table
csv_columns = ["N°", "CODE ISIN", "MNEMO", "VALEUR", "GROUPE DE COTATION"]

# One DataFrame per compartment, kept in memory under the compartment key
dataframes = {}

for key in tables_dict:
    # Build the table for this compartment and remember it
    df = pd.DataFrame(tables_dict[key], columns=csv_columns)
    dataframes[key] = df

    # Write it to csv/repartition_<key>.csv without the index column
    csv_filename = os.path.join(csv_directory, f"repartition_{key}.csv")
    df.to_csv(csv_filename, encoding="utf-8", index=False)

print(f"✅ All CSV files have been saved in the '{csv_directory}' folder.")

✅ All CSV files have been saved in the 'csv' folder.
