In [1]:
import os
import pandas as pd

In [2]:
import time
import re

# Directory that holds the downloaded index workbooks to be cleaned in place
folder_path = "E:/Codes/Python/indexfundproject/work/data"

# Case-insensitive sheet-name matchers: "kse", then any run of whitespace,
# hyphen, underscore, en dash or em dash, then the index size (100 or 30).
patterns_to_keep = [
    re.compile(r"kse[\s\-_–—]*" + index_size, re.IGNORECASE)
    for index_size in ("100", "30")
]


def read_excel_any_engine(file_path):
    """Open a workbook, trying each pandas Excel engine until one works.

    Args:
        file_path: Path of the workbook to open.

    Returns:
        A ``pd.ExcelFile`` opened with the first engine that succeeds. If every
        engine fails but ``pd.read_html`` can parse the file, a dict
        ``{"HTML": DataFrame}`` holding the first table found is returned
        instead.

    Raises:
        ValueError: If neither an Excel engine nor the HTML fallback can read
            the file. The message lists the failure reported by each attempt.
    """
    failures = []
    for engine in ("openpyxl", "xlrd", "odf"):
        try:
            return pd.ExcelFile(file_path, engine=engine)
        except Exception as exc:
            # Engines raise unrelated exception types (missing dependency,
            # bad zip, unsupported format), so catch broadly but keep the
            # reason for the final error message instead of discarding it.
            failures.append(f"{engine}: {exc}")

    # Last resort: some .xls files are actually HTML tables
    try:
        tables = pd.read_html(file_path)
    except Exception as exc:
        failures.append(f"html: {exc}")
    else:
        if tables:
            return {"HTML": tables[0]}
        failures.append("html: no tables found")

    raise ValueError(
        "Unable to open file with any supported engine ("
        + "; ".join(failures)
        + ")"
    )


def delete_original_if_different(original_path: str, cleaned_path: str, filename_for_log: str) -> None:
    """Delete the original file when it is not the cleaned output itself.

    Nothing happens when both paths are equal after ``os.path.normcase`` or
    when the original no longer exists. Deletion is retried on
    ``PermissionError`` in case another process still has the file open
    (e.g., Excel or a preview pane). The outcome is reported with ``print``;
    this function never raises.

    Args:
        original_path: Path of the source file to remove.
        cleaned_path: Path of the cleaned workbook that must be kept.
        filename_for_log: Name used in the printed messages.
    """
    max_attempts = 5
    try:
        if os.path.normcase(original_path) == os.path.normcase(cleaned_path):
            return
        if not os.path.exists(original_path):
            return

        for attempt in range(max_attempts):
            try:
                os.remove(original_path)
            except PermissionError:
                # Wait briefly for the handle to be released, but do not
                # sleep after the final attempt since no retry follows it.
                if attempt < max_attempts - 1:
                    time.sleep(1)
            except Exception as e:
                # Not a lock problem: report it once and stop, rather than
                # also claiming the file is "in use".
                print(f"⚠️ Unexpected error deleting original '{filename_for_log}': {e}")
                return
            else:
                print(f"🗑️ Deleted original: {filename_for_log}")
                return

        print(f"⚠️ Could not delete original (in use): {filename_for_log}")
    except Exception as e:
        print(f"⚠️ Unexpected error checking original '{filename_for_log}': {e}")


# Clean every workbook in ``folder_path``: keep only the sheets whose name
# matches ``patterns_to_keep``, save the result as "<name>.xlsx", and delete the
# source file when it is a different path (e.g. an original .xls).
for filename in os.listdir(folder_path):
    lowered_name = filename.lower()
    if not lowered_name.endswith((".xls", ".xlsx")):
        continue
    # Skip Excel's "~$" lock files and temp workbooks left behind by an
    # interrupted run; neither is real input data.
    if filename.startswith("~$") or lowered_name.endswith("_temp.xlsx"):
        continue

    file_path = os.path.join(folder_path, filename)
    file_root, _ = os.path.splitext(file_path)
    temp_file = file_root + "_temp.xlsx"
    cleaned_path = file_root + ".xlsx"
    temp_started = False  # True once this iteration begins writing temp_file

    try:
        # --- Step 1: Try to read the file using any available engine ---
        try:
            xls = read_excel_any_engine(file_path)
        except Exception as e:
            print(f"❌ Failed to read '{filename}': {e}")
            continue

        if isinstance(xls, dict):
            # HTML fallback: the helper already parsed the first table, so
            # there are no sheets to filter; just store it as a real workbook.
            temp_started = True
            with pd.ExcelWriter(temp_file, engine="openpyxl") as writer:
                xls["HTML"].to_excel(writer, index=False, sheet_name="Extracted")
            success_message = f"✅ Cleaned HTML-like Excel: {os.path.basename(cleaned_path)}"
        else:
            # The context manager closes the reader on every exit path. It
            # must be closed BEFORE os.replace below: for .xlsx inputs the
            # cleaned path is the source file itself, and an open handle
            # keeps it locked on Windows.
            with xls:
                # --- Step 2: Normalize sheet names and find matches ---
                # Keep the raw name for reading; the normalized name is only
                # used for matching and for naming the output sheet.
                matched_sheets = []
                for raw_name in xls.sheet_names:
                    clean_name = raw_name.strip().replace("\u00a0", " ")
                    if any(p.search(clean_name) for p in patterns_to_keep):
                        matched_sheets.append((raw_name, clean_name))

                if not matched_sheets:
                    print(f"⚠️ No matching sheets found in '{filename}'. Skipping.")
                    continue

                # --- Step 3: Write matched sheets to new cleaned file ---
                temp_started = True
                with pd.ExcelWriter(temp_file, engine="openpyxl") as writer:
                    for raw_name, clean_name in matched_sheets:
                        df = pd.read_excel(xls, sheet_name=raw_name)
                        # Excel limits sheet names to 31 characters
                        df.to_excel(writer, index=False, sheet_name=clean_name[:31])
            success_message = f"✅ Cleaned and overwritten: {os.path.basename(cleaned_path)}"

        # --- Step 4: Overwrite original safely (retry if locked) ---
        for _ in range(3):
            try:
                os.replace(temp_file, cleaned_path)
                break
            except PermissionError:
                print(f"⚠️ '{filename}' is locked, retrying...")
                time.sleep(1)
        else:
            print(f"❌ Could not overwrite '{filename}' after retries.")
            continue

        # --- Step 5: Delete the original file if different from cleaned (e.g., original .xls) ---
        delete_original_if_different(file_path, cleaned_path, filename)

        print(success_message)

    except Exception as e:
        print(f"❌ Error processing '{filename}': {e}")
    finally:
        # Remove a temp workbook that was written but never moved into place,
        # so it is not mistaken for input data later.
        if temp_started and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except OSError as e:
                print(f"⚠️ Could not remove temp file for '{filename}': {e}")

🗑️ Deleted original: 2020-01-01.xls
✅ Cleaned and overwritten: 2020-01-01.xlsx
🗑️ Deleted original: 2020-01-02.xls
✅ Cleaned and overwritten: 2020-01-02.xlsx
🗑️ Deleted original: 2020-01-03.xls
✅ Cleaned and overwritten: 2020-01-03.xlsx
🗑️ Deleted original: 2020-01-06.xls
✅ Cleaned and overwritten: 2020-01-06.xlsx
🗑️ Deleted original: 2020-01-07.xls
✅ Cleaned and overwritten: 2020-01-07.xlsx
🗑️ Deleted original: 2020-01-08.xls
✅ Cleaned and overwritten: 2020-01-08.xlsx
🗑️ Deleted original: 2020-01-09.xls
✅ Cleaned and overwritten: 2020-01-09.xlsx
🗑️ Deleted original: 2020-01-10.xls
✅ Cleaned and overwritten: 2020-01-10.xlsx
🗑️ Deleted original: 2020-01-13.xls
✅ Cleaned and overwritten: 2020-01-13.xlsx
🗑️ Deleted original: 2020-01-14.xls
✅ Cleaned and overwritten: 2020-01-14.xlsx
🗑️ Deleted original: 2020-01-15.xls
✅ Cleaned and overwritten: 2020-01-15.xlsx
🗑️ Deleted original: 2020-01-16.xls
✅ Cleaned and overwritten: 2020-01-16.xlsx
🗑️ Deleted original: 2020-01-17.xls
✅ Cleaned and ov