In [1]:
import os
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

In [2]:
# Step 1: Fetch the index composition page.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# us from parsing an HTTP error page as if it were the real content.
initial_page_url = "https://www.bvmt.com.tn/fr/compositions/TN0009050287/details"
response = requests.get(initial_page_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Step 2: Take the first anchor whose href mentions ".pdf" (case-insensitive).
pdf_link_tag = next(
    (link for link in soup.find_all('a', href=True) if ".pdf" in link['href'].lower()),
    None,
)

# Step 3: Fail fast when nothing was found. Otherwise `pdf_url` would be left
# undefined (or stale from a previous run) and the download cell would break
# with a confusing error.
if pdf_link_tag is None:
    raise RuntimeError("❌ No PDF link found on the page.")

# Resolve the href against the URL the page was actually served from; this
# handles absolute, root-relative, page-relative and protocol-relative links.
pdf_url = urljoin(response.url, pdf_link_tag['href'].strip())

print(f"✅ Correct PDF Link: {pdf_url}")

✅ Correct PDF Link: https://www.bvmt.com.tn/sites/default/files/indices/sectoriels/composition_tunindex20.pdf


In [3]:
# Directory where the PDF will be saved (created on first run)
pdf_directory = "pdf"
os.makedirs(pdf_directory, exist_ok=True)

# Local path of the downloaded file
pdf_filename = os.path.join(pdf_directory, "composition_tunindex20.pdf")

# Download the PDF found by the previous cell.
# raise_for_status() prevents an HTTP error body from being written to disk
# under a .pdf name; the timeout avoids an indefinite hang.
response = requests.get(pdf_url, timeout=30)
response.raise_for_status()

# Write the raw bytes to the local "pdf" folder
with open(pdf_filename, "wb") as file:
    file.write(response.content)

print(f"✅ PDF downloaded successfully: {pdf_filename}")

✅ PDF downloaded successfully: pdf\composition_tunindex20.pdf


In [4]:
# Load the two allocation CSV files
df_repartition_A = pd.read_csv("csv/repartition_A.csv")
df_repartition_B = pd.read_csv("csv/repartition_B.csv")

# Stack both files and drop rows that appear in both
df_repartition = pd.concat([df_repartition_A, df_repartition_B]).drop_duplicates()

# Split the companies by quotation group; rows with a missing group match neither
quotation_group = df_repartition["GROUPE DE COTATION"]
is_continu = quotation_group.str.contains("Continu", na=False)
is_fixing = quotation_group.str.contains("Fixing", na=False)
df_continu = df_repartition[is_continu]
df_fixing = df_repartition[is_fixing]

# Output folder for the per-group CSV files (created if it does not exist)
csv_directory = "csv"
os.makedirs(csv_directory, exist_ok=True)

# Destination paths inside the "csv" folder
continu_csv_path = os.path.join(csv_directory, "continu.csv")
fixing_csv_path = os.path.join(csv_directory, "fixing.csv")

# Write each subset to its own file, without the index column
for group_frame, group_path in (
    (df_continu, continu_csv_path),
    (df_fixing, fixing_csv_path),
):
    group_frame.to_csv(group_path, index=False, encoding="utf-8")

print(f"✅ CSV files saved in the '{csv_directory}' folder.")

✅ CSV files saved in the 'csv' folder.


In [5]:
def extract_table_with_liquidity(pdf_path, csv_output_path):
    """Extract the tables of a PDF into one CSV and add a LIQUIDITY column.

    Every table on every page is flattened into a single list of rows. The
    first non-empty row is used as the header; header cells have line breaks
    collapsed and anything up to and including a '/' removed. Only rows whose
    'NUMBER OF SHARES' cell is a whole number (ignoring whitespace) are kept,
    which also discards header rows repeated on later pages and rows with an
    empty share count.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        csv_output_path: Path of the CSV file to write (UTF-8, no index).

    Returns:
        None. The result is written to ``csv_output_path``. If the PDF yields
        no table rows, a message is printed and no file is written.

    Raises:
        KeyError: If the cleaned header lacks 'NUMBER OF SHARES' or 'FREE FLOAT'.
        ValueError: If a kept row has a 'FREE FLOAT' value that is not a number.
    """
    # Flatten the rows of every table on every page into a single list.
    with pdfplumber.open(pdf_path) as pdf:
        all_data = [
            row
            for page in pdf.pages
            for table in page.extract_tables()
            for row in table
        ]

    # Drop rows in which every cell is empty.
    df = pd.DataFrame(all_data).dropna(how="all")
    if df.empty:
        print("No tables found in the PDF.")
        return

    # Promote the first row to the header, then clean the column names:
    # collapse line breaks, trim, and drop text before and including '/'
    # (multi-line headers).
    header = df.iloc[0]
    df = df.iloc[1:].reset_index(drop=True)
    df.columns = (
        pd.Index(header)
        .str.replace(r"[\n\r]+", " ", regex=True)
        .str.strip()
        .str.replace(r".*/\s*", "", regex=True)
    )

    # Debugging: print extracted columns
    print("Extracted Columns:", df.columns.tolist())

    missing = [name for name in ("NUMBER OF SHARES", "FREE FLOAT") if name not in df.columns]
    if missing:
        raise KeyError(
            f"Expected column(s) {missing} not found; extracted columns: {df.columns.tolist()}"
        )

    # Strip every kind of whitespace (the thousands separator may be a plain or
    # a non-breaking space) and keep only rows that are pure digits.
    # na=False treats empty cells as non-matching instead of producing a NaN
    # mask, which pandas refuses to index with.
    shares_text = df["NUMBER OF SHARES"].str.replace(r"\s+", "", regex=True)
    is_data_row = shares_text.str.fullmatch(r"\d+", na=False)

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    df = df.loc[is_data_row].copy()
    df["NUMBER OF SHARES"] = shares_text.loc[is_data_row].astype(float)

    # Percentages: drop '%' and whitespace, accept a decimal comma, scale to 0-1.
    free_float_text = (
        df["FREE FLOAT"]
        .str.replace(r"[\s%]+", "", regex=True)
        .str.replace(",", ".", regex=False)
    )
    df["FREE FLOAT"] = free_float_text.astype(float) / 100

    # Liquidity = number of shares weighted by the free-float ratio.
    df["LIQUIDITY"] = df["NUMBER OF SHARES"] * df["FREE FLOAT"]

    # Save to CSV
    df.to_csv(csv_output_path, index=False, encoding='utf-8')
    print(f"CSV file with Liquidity column saved: {csv_output_path}")

# **Example usage**
# Output folder for the generated CSV (created if it does not exist yet)
csv_directory = "csv"
os.makedirs(csv_directory, exist_ok=True)

# Input PDF (replace with your actual PDF file) and the CSV to produce from it
pdf_path = "pdf/composition_tunindex20.pdf"
csv_output_path = os.path.join(csv_directory, "composition_tunindex20.csv")

# Run the extraction; the function prints its own progress messages
extract_table_with_liquidity(pdf_path=pdf_path, csv_output_path=csv_output_path)

Extracted Columns: ['N°', 'ISIN CODE', 'MNEMO', 'COMPANY NAME', 'NUMBER OF SHARES', 'FREE FLOAT', 'CAPPING COEFFICIENT']
CSV file with Liquidity column saved: csv\composition_tunindex20.csv
