In [None]:
"""
Camera trap working text extraction pipeline Shaotai Hu
Created on Sunday Jun 8 15:17:45 2025
@author: Shaotai Hu
""";

## Next Steps (also in readme on github) ##
Update the dataframe to the correct format (done):
- prefix each extracted name with the original image name: ogname_tree_temp...
- so the dataframe columns can be ogname, tree, temp, ...
- account for all cases of different image names and extraction name outputs
### - this way the CNN output (yes/no, number of infants) can be paired with the dataframe produced by the extraction pipeline
### - the CNN outputs can then be appended to the dataframe as extra columns (yes/no, #ofinfants) on the row of the corresponding image, matched by ogname, which we can call the image id.

In [None]:
import os
import pandas as pd
import re
from PIL import Image

In [None]:
# Disabled one-off utility, kept for reference: converts every image in a folder
# to the target format in place and deletes the original file when its name changes.
# The code is wrapped in a string literal so that running this cell does nothing;
# remove the surrounding triple quotes to enable it.
"""
def convert_images_in_place(input_dir, target_format='PNG'):
    for filename in os.listdir(input_dir):
        file_ext = os.path.splitext(filename)[1].lower()
        if file_ext in ('.jpg', '.jpeg', '.bmp', '.tiff', '.gif', '.png'):
            input_path = os.path.join(input_dir, filename)
            try:
                with Image.open(input_path) as img:
                    img = img.convert('RGB')
                    base_name = os.path.splitext(filename)[0]
                    new_filename = f"{base_name}.{target_format.lower()}"
                    new_path = os.path.join(input_dir, new_filename)
                    img.save(new_path, format=target_format.upper())
                    print(f"Converted: {filename} → {new_filename}")
                    if new_filename != filename:
                        os.remove(input_path)
                        print(f"Deleted original: {filename}")
            except Exception as e:
                print(f"Failed to convert {filename}: {e}")

input_dir = "raw"
convert_images_in_place(input_dir, target_format='PNG')
""";

In [None]:
def _remove_vertical_strips(cropped_img, strips, left):
    """Return `cropped_img` with the given column ranges cut out.

    Args:
        cropped_img: Image already cropped so that its x=0 corresponds to
            x=`left` in the full image.
        strips: Iterable of (start, end) x-ranges, in full-image coordinates,
            to remove.
        left: x offset of `cropped_img` inside the full image.

    Returns:
        `cropped_img` itself when no strip intersects it; otherwise a new RGB
        image made of the remaining columns pasted side by side.
    """
    cropped_width, cropped_height = cropped_img.size

    # Translate the strips into cropped-image coordinates and clip them to it.
    adjusted_strips = []
    for start, end in strips:
        adj_start = max(start - left, 0)
        adj_end = min(end - left, cropped_width)
        if adj_start < adj_end:
            adjusted_strips.append((adj_start, adj_end))
    if not adjusted_strips:
        return cropped_img

    adjusted_strips.sort()
    parts = []
    current_x = 0
    for start, end in adjusted_strips:
        if start > current_x:
            parts.append(cropped_img.crop((current_x, 0, start, cropped_height)))
        # max() keeps the cursor from moving backwards when strips overlap or
        # nest; otherwise columns that were already removed would be re-included.
        current_x = max(current_x, end)
    if current_x < cropped_width:
        parts.append(cropped_img.crop((current_x, 0, cropped_width, cropped_height)))

    new_width = sum(part.width for part in parts)
    new_img = Image.new('RGB', (new_width, cropped_height))
    x_offset = 0
    for part in parts:
        new_img.paste(part, (x_offset, 0))
        x_offset += part.width
    return new_img


def crop_custom_area(input_dir, output_dir):
    """Crop the bottom band of every image in `input_dir` and save it to `output_dir`.

    For each file with an image extension, the region from x = width // 6 to the
    right edge and covering the bottom height // 23 rows is kept (height // 15
    rows when the filename contains "####"). For the default layout, two
    vertical strips (35-44% and 52-80% of the full width) are then removed and
    the remaining columns are joined. The result is saved under the same
    filename in `output_dir`.

    Args:
        input_dir: Directory containing the source images.
        output_dir: Directory for the cropped images; created if missing.

    Failures on individual files are printed and the file is skipped, so one bad
    image does not stop the batch.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff')):
            continue
        input_path = os.path.join(input_dir, filename)
        try:
            with Image.open(input_path) as img:
                width, height = img.size
                left = width // 6
                if "####" in filename:
                    # frame removal for ####
                    top = height - height // 15
                    # vertical strip removal for #### (currently disabled)
                    strips = [
                        #(int(width * 52 / 100), int(width * 80 / 100)),
                        #(int(width * 35 / 100), int(width * 44 / 100))
                    ]
                else:
                    # default frame removal
                    top = height - height // 23
                    # default vertical strip removal
                    strips = [
                        (int(width * 52 / 100), int(width * 80 / 100)),
                        (int(width * 35 / 100), int(width * 44 / 100))
                    ]
                cropped_img = img.crop((left, top, width, height))
                new_img = _remove_vertical_strips(cropped_img, strips, left)
                output_path = os.path.join(output_dir, filename)
                new_img.save(output_path)
                print(f"Cropped and saved: {output_path}")
        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the next file.
            print(f"Failed to process {filename}: {e}")

# Stage 1: crop the bottom band of every image in "raw" and write it to "cut_ims".
input_dir, output_dir = "raw", "cut_ims"
crop_custom_area(input_dir=input_dir, output_dir=output_dir)

In [None]:
import pytesseract

def sanitize_filename(text, max_length=50):
    """Turn free-form text (e.g. OCR output) into a filename-safe token.

    Every character that is not a word character, whitespace or a hyphen is
    dropped, surrounding whitespace is trimmed, each remaining whitespace
    character becomes an underscore, and the result is truncated to
    `max_length` characters.

    Args:
        text: Raw text to sanitize.
        max_length: Maximum length of the returned string.

    Returns:
        The sanitized string, or "untitled" when nothing usable is left.
    """
    cleaned = re.sub(r'[^\w\s-]', '', text).strip()
    # Replace every whitespace character, not only ' ': multi-line text would
    # otherwise leave raw newlines or tabs inside the filename.
    cleaned = re.sub(r'\s', '_', cleaned)
    return cleaned[:max_length] or "untitled"

def extract_text_and_rename_images(input_dir, output_dir):
    """OCR every image in `input_dir` and save a copy named after the extracted text.

    Each image is passed to `pytesseract.image_to_string`; the text is run
    through `sanitize_filename` and the image is saved to `output_dir` as
    "<original filename>---<sanitized text>.png". If that name already exists,
    a numeric suffix (_1, _2, ...) is appended until the name is unused.

    Args:
        input_dir: Directory containing the images to read.
        output_dir: Directory for the renamed copies; created if missing.

    Failures on individual files are printed and the file is skipped.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            continue
        input_path = os.path.join(input_dir, filename)
        try:
            # The context manager releases the file handle even when OCR or
            # saving raises, matching how crop_custom_area opens images.
            with Image.open(input_path) as image:
                extracted_text = pytesseract.image_to_string(image)
                new_name = sanitize_filename(extracted_text)

                # NOTE(review): the purpose of these label substitutions is not
                # evident from this file; kept exactly as written.
                if '####1' in new_name:
                    new_name = new_name.replace('####1', '####2')

                if '#1' in new_name:
                    new_name = new_name.replace('#1', '#2')

                output_filename = f"{filename}---{new_name}.png"
                output_filename = output_filename.replace(' ', '_')
                count = 1
                # Never overwrite an existing output: add _1, _2, ... until free.
                while os.path.exists(os.path.join(output_dir, output_filename)):
                    output_filename = f"{filename}---{new_name}_{count}.png"
                    output_filename = output_filename.replace(' ', '_')
                    count += 1
                output_path = os.path.join(output_dir, output_filename)
                image.save(output_path)
                print(f"Saved: {output_path}")
        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the next file.
            print(f"Failed to process {filename}: {e}")

# Stage 2: OCR every cropped image in "cut_ims" and save the renamed copies to "output".
input_dir, output_dir = "cut_ims", "output"
extract_text_and_rename_images(input_dir=input_dir, output_dir=output_dir)

In [None]:
def parse_filename(filename):
    """Split an OCR-derived filename into (tree_camera, temperature, date, time).

    The extension is dropped and the stem is split on runs of underscores. Three
    layouts are tried in order:

    1. temp_tree/camera_day_month_year_time_pm: at least 7 tokens, the first
       being 1-2 digits followed by C or F. The date is tokens 2-4 joined with
       underscores; the time is tokens 5 and 6 concatenated.
    2. tree_camera_temp_date_time: at least 5 tokens, where the second-to-last
       is 6-8 digits and the last is 4-6 digits. Everything before the
       temperature is joined with underscores as the tree/camera label.
    3. tree/camera_temp_date_time: exactly 4 tokens, taken positionally.

    Returns:
        A 4-tuple of strings, or None when no layout matches.
    """
    stem, _extension = os.path.splitext(filename)
    tokens = re.split(r'_+', stem)
    token_count = len(tokens)

    # Layout 1: leading temperature token such as "25C" or "7F".
    if token_count >= 7 and re.match(r'^\d{1,2}[CF]$', tokens[0]):
        temp_token, label = tokens[0], tokens[1]
        date_text = '_'.join(tokens[2:5])
        time_text = f"{tokens[5]}{tokens[6]}"
        return (label, temp_token, date_text, time_text)

    # Layout 2: numeric date and time at the end, multi-token label in front.
    if token_count >= 5:
        *label_tokens, temp_token, date_text, time_text = tokens
        if re.match(r'^\d{6,8}$', date_text) and re.match(r'^\d{4,6}$', time_text):
            return ('_'.join(label_tokens), temp_token, date_text, time_text)

    # Layout 3: exactly four tokens, no validation of their content.
    if token_count == 4:
        label, temp_token, date_text, time_text = tokens
        return (label, temp_token, date_text, time_text)

    return None

def filenames_to_dataframe(input_dir):
    """Build a DataFrame from the "<image>---<extracted text>" filenames in `input_dir`.

    Each image filename is split at the first '---'. The left part is kept as
    the image identifier and the right part is parsed with `parse_filename`.
    Files without the separator, or whose right part matches no known layout,
    are reported with a printed message and skipped.

    Args:
        input_dir: Directory containing the renamed images.

    Returns:
        DataFrame with columns 'image', 'tree/camera', 'temperature', 'date'
        and 'time', one row per successfully parsed file, ordered by filename.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sort so the rows (and the
    # exported spreadsheet) come out in the same order on every run.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff')):
            continue
        image_part, separator, camera_part = filename.partition('---')
        if not separator:
            print(f"Skipping improperly formatted filename: {filename}")
            continue
        parsed = parse_filename(camera_part)
        if parsed:
            tree_camera, temperature, date, time = parsed
            data.append((image_part, tree_camera, temperature, date, time))
        else:
            print(f"Filename format unexpected in: {filename}")
    return pd.DataFrame(data, columns=['image', 'tree/camera', 'temperature', 'date', 'time'])

def save_dataframe_to_excel(df, output_dir, filename='extracted_camera_traps.xlsx'):
    """Write `df` to an Excel file inside `output_dir`, without the index column.

    Args:
        df: Object exposing `to_excel(path, index=...)`, e.g. a pandas DataFrame.
        output_dir: Destination directory; created when it does not exist.
        filename: Name of the workbook file inside `output_dir`.
    """
    directory_missing = not os.path.exists(output_dir)
    if directory_missing:
        os.makedirs(output_dir)

    destination = os.path.join(output_dir, filename)
    df.to_excel(destination, index=False)
    print(f"DataFrame saved to {destination}")

# Stage 3: parse the renamed files in "output" into a table, export it to "xlsx",
# and display the table as the cell result.
input_dir, output_dir = "output", "xlsx"
df = filenames_to_dataframe(input_dir)
save_dataframe_to_excel(df, output_dir)
df

In [None]:
# Alternative installation for a conda virtual environment (without Homebrew), e.g. on the lab computer:
# conda install -c conda-forge pytesseract tesseract