# Application to Extract Images from Excel

## 1. Import Libraries

In [3]:
import os
import pandas as pd
import numpy as np
from openpyxl_image_loader import SheetImageLoader
from openpyxl import load_workbook
from openpyxl.drawing.image import Image
from PIL import Image
import io
import traceback
import re
import time
import gc
import xlwings as xw
from concurrent.futures import ThreadPoolExecutor, as_completed

## 2. Application to Extract Images from Excel

### 2.1. Function Definitions

In [4]:
def unique_column_names(columns):
    """Return a copy of ``columns`` in which every name occurs only once.

    The first occurrence of a name is kept unchanged. Each later duplicate
    gets a numeric suffix (``name_1``, ``name_2``, ...). A suffix is skipped
    when the resulting name is already taken, so a generated name can never
    clash with a name that is present in the input itself
    (e.g. ``["a", "a", "a_1"]``).

    Args:
        columns: Iterable of hashable column labels.

    Returns:
        list: Labels in the original order, all distinct.
    """
    next_suffix = {}   # base name -> last numeric suffix handed out for it
    taken = set()      # every name already placed in the result
    new_columns = []
    for col in columns:
        candidate = col
        if candidate in taken:
            suffix = next_suffix.get(col, 0)
            # Keep counting until the suffixed name is free as well.
            while candidate in taken:
                suffix += 1
                candidate = f"{col}_{suffix}"
            next_suffix[col] = suffix
        taken.add(candidate)
        new_columns.append(candidate)
    return new_columns

def _safe_file_name(name):
    """Replace path separators, line breaks and other reserved characters with ``_``.

    Sheet names and merged header texts end up in the image file name; a line
    break or a slash inside a header cell would otherwise make the save fail.
    """
    return re.sub(r'[\\/:*?"<>|\r\n\t]+', "_", name)


def _fill_merged_cells(ws, merged_ranges):
    """Unmerge each range in ``merged_ranges`` and copy its top-left value into every cell of the range."""
    for merged_range in merged_ranges:
        ws.unmerge_cells(str(merged_range))
        top_left = ws.cell(merged_range.min_row, merged_range.min_col).value
        for row in range(merged_range.min_row, merged_range.max_row + 1):
            for col in range(merged_range.min_col, merged_range.max_col + 1):
                ws.cell(row, col, top_left)


def _locate_dokumentasi_columns(rows):
    """Find the 'DOKUMENTASI' columns of a sheet together with their worksheet column index.

    The header is read from the fixed 0-based rows 2 and 3 of ``rows``; the two
    cells of a column are merged into one name, made unique and upper-cased.
    Columns that are completely empty or whose row-2 text contains 'REKAP' are
    ignored, but the worksheet column index of the remaining columns is tracked
    through that filtering so it still points at the real Excel column.

    Args:
        rows: Sheet content as a list of row tuples (``list(ws.values)``).

    Returns:
        ``None`` when the sheet has no usable header (fewer than four rows, or
        no cell text containing "NO"); otherwise a list of
        ``(column_name, excel_column_index)`` pairs (1-based index), possibly
        empty.
    """
    df = pd.DataFrame(rows)

    # Rows 2 and 3 must exist, otherwise the positional header lookup below would raise
    # and abort the whole workbook instead of just skipping this sheet.
    if len(df) < 4:
        return None

    # Same probe as before: the sheet is skipped when no cell text contains "NO".
    # NOTE(review): the position of the matching row is not used; the header rows are fixed.
    has_no_marker = df.apply(lambda x: x.astype(str).str.contains("NO", case=False, na=False)).any(axis=1)
    if not has_no_marker.any():
        return None

    # 1-based worksheet column of every DataFrame column; filtered in step with the frame.
    excel_columns = list(range(1, df.shape[1] + 1))

    df.columns = df.iloc[2].astype(str).str.strip()

    # Equivalent to dropna(axis=1, how="all"), but with an explicit mask so the
    # worksheet column indices can be filtered the same way.
    non_empty = df.notna().any(axis=0).to_numpy()
    df = df.loc[:, non_empty]
    excel_columns = [col for col, keep in zip(excel_columns, non_empty) if keep]

    not_rekap = ~df.columns.str.contains("REKAP", case=False, na=False)
    df = df.loc[:, not_rekap]
    excel_columns = [col for col, keep in zip(excel_columns, not_rekap) if keep]

    merged_header = [a if a == b else f"{a} {b}" for a, b in zip(df.iloc[2], df.iloc[3])]
    names = pd.Index(unique_column_names(merged_header)).str.upper().str.strip()

    # Non-string headers come back from the .str accessor as NaN; they can never match.
    return [
        (name, excel_col)
        for name, excel_col in zip(names, excel_columns)
        if isinstance(name, str) and "DOKUMENTASI" in name
    ]


def extract_images_from_excel(file_path, output_folder):
    """Extract images from columns containing 'DOKUMENTASI' in all sheets and save them.

    For every sheet of the workbook the merged cells are flattened, the header
    is read, and each cell of a 'DOKUMENTASI' column from row 2 downwards is
    checked for an anchored image. Every image found is written to
    ``output_folder`` as a PNG named
    ``<file>_Sheet_<sheet>_Column_<column>_Row_<row>.png``, with characters that
    are not valid in a file name replaced by ``_``.

    Args:
        file_path: Path of the ``.xlsx``/``.xlsm`` workbook to read.
        output_folder: Existing folder that receives the PNG files.

    Returns:
        bool: ``True`` when the workbook was processed (individual images may
        still have failed and are reported on stdout), ``False`` when an
        exception stopped the processing of the workbook.
    """
    wb = None
    try:
        wb = load_workbook(file_path, data_only=True)

        file_name_clean = os.path.splitext(os.path.basename(file_path))[0]

        total_images = 0
        extracted_images = 0

        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]

            # NOTE(review): openpyxl_image_loader appears to keep `_images` as a
            # class-level dict shared by all loaders, so images of previously
            # loaded sheets/workbooks would still be reported here. Resetting it
            # is harmless if the installed version stores it per instance -
            # confirm against the installed release.
            SheetImageLoader._images = {}
            image_loader = SheetImageLoader(ws)

            merged_cells = list(ws.merged_cells)
            if merged_cells:
                print(f"📄 Handling {len(merged_cells)} merged cells in sheet '{sheet_name}'")
                _fill_merged_cells(ws, merged_cells)

            print(f"📄 Converting sheet '{sheet_name}' to DataFrame")
            dokumentasi_cols = _locate_dokumentasi_columns(list(ws.values))

            if dokumentasi_cols is None:
                print(f"⚠️ Could not identify header row in sheet '{sheet_name}', skipping...")
                continue

            if not dokumentasi_cols:
                print(f"ℹ️ No 'DOKUMENTASI' columns found in sheet '{sheet_name}'")
                continue

            print(f"🔍 Found {len(dokumentasi_cols)} 'DOKUMENTASI' columns in sheet '{sheet_name}'")

            # Only the DOKUMENTASI columns are probed, using their real worksheet column.
            cells_with_images = []
            for row_idx in range(2, ws.max_row + 1):
                for col_name, excel_col_idx in dokumentasi_cols:
                    cell_address = ws.cell(row=row_idx, column=excel_col_idx).coordinate
                    if image_loader.image_in(cell_address):
                        cells_with_images.append({
                            'cell_address': cell_address,
                            'col_name': col_name,
                            'row_idx': row_idx,
                        })

            sheet_images = len(cells_with_images)
            total_images += sheet_images

            if sheet_images == 0:
                print(f"ℹ️ No images found in 'DOKUMENTASI' columns in sheet '{sheet_name}'")
                continue

            print(f"🔍 Found {sheet_images} images in sheet '{sheet_name}'")
            sheet_extracted = 0

            for img_info in cells_with_images:
                # A single broken image must not stop the rest of the sheet.
                try:
                    img = image_loader.get(img_info['cell_address'])
                    img_name = _safe_file_name(
                        f"{file_name_clean}_Sheet_{sheet_name}_Column_{img_info['col_name']}_Row_{img_info['row_idx']}.png"
                    )
                    img_path = os.path.join(output_folder, img_name)
                    img.save(img_path, format="PNG")
                    sheet_extracted += 1
                    extracted_images += 1
                except Exception as e:
                    print(f"❌ Error saving image at {img_info['cell_address']} in file '{file_name_clean}': {str(e)}")

            print(f"📊 Sheet '{sheet_name}': Extracted {sheet_extracted}/{sheet_images} images")

        print(f"✅ Completed processing file: {file_name_clean}")
        print(f"📊 Total extracted {extracted_images}/{total_images} images")
        return True
    except Exception as e:
        print(f"❌ Error processing file '{file_path}': {str(e)}")
        return False
    finally:
        # Release the workbook on every exit path, including errors and interrupts.
        if wb is not None:
            wb.close()
        gc.collect()

def process_excel_folder(folder_path, export_folder):
    """Process all Excel files in a folder and extract images from them.

    Creates ``<export_folder>/Extract Images`` if needed, runs
    ``extract_images_from_excel`` on every ``.xlsx``/``.xlsm`` file found
    directly inside ``folder_path`` and prints a summary of successful and
    failed files.

    Args:
        folder_path: Folder that contains the Excel workbooks.
        export_folder: Folder under which the "Extract Images" output folder
            is created.

    Returns:
        None. Progress and the summary are printed to stdout.

    Raises:
        OSError: If ``folder_path`` cannot be listed or the output folder
            cannot be created.
    """
    output_folder = os.path.join(export_folder, "Extract Images")
    os.makedirs(output_folder, exist_ok=True)

    # - extension check is case-insensitive so "REPORT.XLSX" is picked up too
    # - names starting with "~$" are the lock files Excel leaves next to an open
    #   workbook; they are not readable workbooks and would only show up as failures
    # - sorted so the processing order does not depend on the file system
    excel_files = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith(('.xlsx', '.xlsm')) and not name.startswith('~$')
    )

    if not excel_files:
        print("⚠️ No Excel files found in the specified folder.")
        return

    print(f"🔍 Found {len(excel_files)} Excel files to process.")
    print(f"🗂️ Images will be saved to: {output_folder}")

    successful_files = 0
    failed_files = []

    for i, file_path in enumerate(excel_files, 1):
        file_name = os.path.basename(file_path)
        print(f"\n📊 Processing file {i}/{len(excel_files)}: {file_name}")
        if extract_images_from_excel(file_path, output_folder):
            successful_files += 1
        else:
            failed_files.append(file_name)

    print("\n" + "="*50)
    print("📈 PROCESSING SUMMARY")
    print("="*50)
    print(f"Total files: {len(excel_files)}")
    print(f"Successfully processed: {successful_files}")
    print(f"Failed to process: {len(failed_files)}")
    print(f"Images saved to: {output_folder}")

    if failed_files:
        print("\nFiles that could not be processed:")
        for file in failed_files:
            print(f"- {file}")

### 2.2. Run the Function

In [5]:
# Folder that holds the Excel workbooks to scan
excel_folder = r"C:\Users\kanzi\Documents\Part Time Job\Automation Codes\Excel Folder"
# Folder under which the exported results are written
export_folder = r"C:\Users\kanzi\Documents\Part Time Job\Automation Codes\check_photo"

# Start the extraction for every workbook in the folder
process_excel_folder(folder_path=excel_folder, export_folder=export_folder)

🔍 Found 32 Excel files to process.
🗂️ Images will be saved to: C:\Users\kanzi\Documents\Part Time Job\Automation Codes\check_photo\Extract Images

📊 Processing file 1/32: 01. Cileungsi - Cibeet.xlsx


  warn(msg)


📄 Handling 19 merged cells in sheet 'RAMBU'
📄 Converting sheet 'RAMBU' to DataFrame
🔍 Found 1 'DOKUMENTASI' columns in sheet 'RAMBU'
🔍 Found 172 images in sheet 'RAMBU'


KeyboardInterrupt: 