In [1]:
import os
import pandas as pd
import shutil

# Define file paths
source_dir = 'Images'  # Replace with your folder containing images
csv_file = 'Validation-XPO(in).csv'  # Replace with your CSV file path
output_dir = 'sorted_images'

# Read CSV file
csv_data = pd.read_csv(csv_file)

# Ensure the column with file names is named 'FileName'
# Replace 'FileName' with the actual column name in your CSV file if different
file_names = csv_data['FileName']

# Filter the file names by the given range (6102 to 7321)
filtered_file_names = file_names.iloc[6102:7322]

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Sort and copy files
for file_name in filtered_file_names:
    # Remove any unwanted extensions and ensure .tiff
    base_name = os.path.splitext(file_name)[0]  # Strip extension
    tiff_file_name = f"{base_name}.tiff"
    
    source_path = os.path.join(source_dir, tiff_file_name)
    if os.path.exists(source_path):
        shutil.copy(source_path, os.path.join(output_dir, tiff_file_name))
    else:
        print(f"File {tiff_file_name} not found in the source folder.")

print(f"Matching images have been extracted and saved to {output_dir}.")
# Count files in the directory
total_files = len([f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))])

print(f"Total items in the sorted_images folder: {total_files}")

Matching images have been extracted and saved to sorted_images.
Total items in the sorted_images folder: 1220


In [2]:
!pip install pytesseract


Defaulting to user installation because normal site-packages is not writeable




In [3]:


import os
import pytesseract
from PIL import Image
import re
import csv

# Define regex patterns for SRN Type and Value pairs
srn_patterns = [
    ("Cust PO", r"E[0-9A-Z]+"),  # Example: Cust PO and its value
    ("Probill", r"\d{9}"),      # Probill number pattern (9-digit number)
    ("Product Code", r"\d{8}"), # Product code pattern (8-digit number)
]

# Function to extract SRN Type and Value pairs from text
def extract_srn_data(text):
    srn_data = []
    for label, value_pattern in srn_patterns:
        matches = re.finditer(value_pattern, text)
        for match in matches:
            srn_data.append({"SRN Type": label, "SRN Value": match.group(0)})
    return srn_data

# Main function to process images in a folder and save to one CSV
def process_images_to_csv(folder_path, output_csv_path):
    # List to store all SRN data
    all_srn_data = []

    # Loop through all .tiff files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith(".tiff"):
            image_path = os.path.join(folder_path, file_name)

            # Extract text from the image using OCR
            try:
                text = pytesseract.image_to_string(Image.open(image_path))
                srn_data = extract_srn_data(text)

                # Append image name to each record
                for record in srn_data:
                    record["File Name"] = file_name

                all_srn_data.extend(srn_data)
                print(f"Processed {file_name}")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

    # Save all SRN data to a single CSV file
    with open(output_csv_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["File Name", "SRN Type", "SRN Value"])
        writer.writeheader()
        writer.writerows(all_srn_data)

if __name__ == "__main__":
    # Input folder containing .tiff images
    input_folder = "Images"

    # Output CSV file path
    output_csv = "output.csv"

    # Process the images and save to CSV
    process_images_to_csv(input_folder, output_csv)


Error processing 116827981.tiff: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 117112800.tiff: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 117136364.tiff: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 117239183.tiff: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 117549331.tiff: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 117708183.tiff: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 117708216.tiff: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 117716491.tiff: tesseract is not installed or it's not in your PATH. See README file for more information.
Error pr