## Load the meta data into a dataframe

In [1]:
import os
import shutil
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import re
from exif import Imagels
from PIL import Image
from tqdm import tqdm
import subprocess
import json
# from datetime import datetime
from suntime import Sun
import datetime
import pytz  # This module provides timezone definitions


# Paths configuration

# PATH_ROOT = "/media/tom-ratsakatika/CRUCIAL 4TB/FCC Camera Trap Data"
PATH_ROOT = "/home/tom-ratsakatika/VSCode/camera-traps/data/"

raw_data_path = Path(PATH_ROOT + 'raw/')
processed_data_path = Path(PATH_ROOT + 'processed/')
error_log_path = Path(PATH_ROOT + 'error_log.txt')
labelled_data_path = Path(PATH_ROOT + 'labelled_data.xlsx')
meta_data_path = Path(PATH_ROOT + 'meta_data.xlsx')

In [2]:
df_labels = pd.read_excel(labelled_data_path)

# Ensure 'Date' is a datetime object
df_labels['Date'] = pd.to_datetime(df_labels['Date'], errors='coerce', format='%Y-%m-%d')

# Ensure 'Time' is converted to datetime
df_labels['Time'] = pd.to_datetime(df_labels['Time'], errors='coerce', format='%H:%M:%S').dt.time

# Create formatted date and time strings
df_labels['Formatted Date'] = df_labels['Date'].dt.strftime('%Y-%m-%d')
df_labels['Formatted Time'] = df_labels['Time'].astype(str).str[:5]  # Slicing to get HH:MM

# Concatenate formatted date and time with an underscore
df_labels['MatchID'] = df_labels['Formatted Date'] + '_' + df_labels['Formatted Time']

# Display the updated DataFrame
df_labels

  for idx, row in parser.parse():
  for idx, row in parser.parse():


Unnamed: 0,Sort,Session,Category,GMU,TrapSite,Longitude,Latitude,Altitude,StartDate,EndDate,...,Species,NoAnimals,Sex,Image quality,Problem,Sequence,Comments,Formatted Date,Formatted Time,MatchID
0,1,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Tamasului,25.172013,45.536521,1463,2018-02-15 00:00:00,2018-04-05 00:00:00,...,Bear,1,u,medium,partly,6,date unknown,2018-01-01,NaT,2018-01-01_NaT
1,2,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Tamasului,25.172013,45.536521,1463,2018-02-15 00:00:00,2018-04-05 00:00:00,...,European hare,1,u,bad,overexposed,2,date unknown,2018-01-01,NaT,2018-01-01_NaT
2,3,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Otetelea,25.184649,45.527437,1515,2018-02-15 00:00:00,2018-04-05 00:00:00,...,Fox,1,u,medium,overexposed,1,"pooing, date unknown",2018-01-01,NaT,2018-01-01_NaT
3,4,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Otetelea,25.184649,45.527437,1515,2018-02-15 00:00:00,2018-04-05 00:00:00,...,Fox,1,u,good,no problem,2,date unknown,2018-01-01,NaT,2018-01-01_NaT
4,5,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Otetelea,25.184649,45.527437,1515,2018-02-15 00:00:00,2018-04-05 00:00:00,...,Fox,1,u,bad,overexposed,1,date unknown,2018-01-01,NaT,2018-01-01_NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10532,18849,5,Systematic monitoring of feeding points,20StoenestiAG,25_ColtulSoimului,25.25,45.24,1112,2021-03-22 00:00:00,2021-05-04 00:00:00,...,Bear,1,u,medium,no problem,2,na,2021-04-26,07:20,2021-04-26_07:20
10533,18850,5,Systematic monitoring of feeding points,20StoenestiAG,25_ColtulSoimului,25.25,45.24,1112,2021-03-22 00:00:00,2021-05-04 00:00:00,...,Bear,1,u,medium,no problem,4,na,2021-04-27,18:10,2021-04-27_18:10
10534,18851,5,Systematic monitoring of feeding points,20StoenestiAG,25_ColtulSoimului,25.25,45.24,1112,2021-03-22 00:00:00,2021-05-04 00:00:00,...,Roe deer,1,m,medium,no problem,2,na,2021-04-29,10:43,2021-04-29_10:43
10535,18852,5,Systematic monitoring of feeding points,20StoenestiAG,25_ColtulSoimului,25.25,45.24,1112,2021-03-22 00:00:00,2021-05-04 00:00:00,...,Bear,1,u,medium,no problem,2,na,2021-04-30,16:34,2021-04-30_16:34


## Save all files with unique filenames in ../data/processed

In [4]:
# Constants
BRASOV_LAT, BRASOV_LON = 45.6580, 25.6012
SUN = Sun(BRASOV_LAT, BRASOV_LON)
ROMANIA_TIMEZONE = pytz.timezone('Europe/Bucharest')
unique_id = 0

# DataFrame setup
COLUMNS = ['File ID', 'Original Path', 'Original Filename', 'File Extension', 'Original Date', 'Original Time', 'Camera Brand', 'Color Space', 'Flash Used', 'Flash Info', 'Day/Night', 'Camera ID', 'Sunrise', 'Sunset']
df_meta = pd.DataFrame(columns=COLUMNS)

def extract_camera_id(user_comments):
    if user_comments is None:
        return None
    try:
        if isinstance(user_comments, bytes):
            decoded_comments = user_comments.decode('utf-8')
        else:
            decoded_comments = user_comments

        parts = decoded_comments.split(',')
        for part in parts:
            if part.startswith('ID='):
                camera_id = part.split('=')[1].strip()  # Remove whitespace
                # If camera ID exceeds expected length, truncate it
                return camera_id[:6] if len(camera_id) > 6 else camera_id
    except Exception as e:
        print(f"Failed to decode comments with error: {e}")
        return None

def safe_convert_datetime(exif_date_str):
    try:
        if exif_date_str:
            return pd.to_datetime(exif_date_str.replace(':', '-', 2))
    except Exception as e:
        print(f"Failed to convert date: {exif_date_str} with error: {e}")
    return None

def classify_day_night(photo_datetime):
    if photo_datetime:
        # First make it aware in the local timezone.
        local_photo_datetime = ROMANIA_TIMEZONE.localize(photo_datetime)
        
        # Now convert the photo datetime to UTC
        utc_photo_datetime = local_photo_datetime.astimezone(pytz.utc)

        # Get sunrise and sunset times in UTC
        utc_sunrise = SUN.get_sunrise_time(utc_photo_datetime)
        utc_sunset = SUN.get_sunset_time(utc_photo_datetime)

        # Convert sunrise and sunset from UTC to local Romanian time
        local_sunrise = utc_sunrise.astimezone(ROMANIA_TIMEZONE)
        local_sunset = utc_sunset.astimezone(ROMANIA_TIMEZONE)

        # Correct the sunset time if it's before the sunrise time
        if local_sunset < local_sunrise:
            local_sunset += datetime.timedelta(days=1)

        # Determine if it's day or night using the local times
        if not (local_sunrise <= local_photo_datetime <= local_sunset):
            return 'Night', local_sunrise, local_sunset
        else:
            return 'Day', local_sunrise, local_sunset
    return None

def process_image_file(file_path):
    try:
        with Image.open(file_path) as img:
            exif_data = img._getexif() or {}
            camera_brand = exif_data.get(271, None)
            created_on = exif_data.get(36867, None)
            user_comments = exif_data.get(37510, None)
            flash_details = exif_data.get(37385, None)

            original_datetime = safe_convert_datetime(created_on)
            day_night, sunrise, sunset  = classify_day_night(original_datetime)
            camera_id = extract_camera_id(user_comments)
            color_space = 'RGB' if exif_data.get(40961) == 1 else None
            flash_used = 'Y' if flash_details and flash_details & 0b0001 else 'N'
            flash_info = flash_details if flash_details is not None else "No Flash Info"

            return (camera_brand, original_datetime, color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset)
    except Exception as e:
        log_error(e)
        return (None, None, None, None, None, None, None, None, None)

def log_error(message):
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(error_log_path, 'a') as file:
        file.write(f"{current_time}: {message}\n")

def collect_files(path):
    return [(root, file) for root, _, files in os.walk(path) for file in files]

# Collecting files and processing
all_files = collect_files(raw_data_path)

for root, file in tqdm(all_files, desc="Indexing files"):

    # Skip files from __MACOSX directory or files starting with '.'
    if '__MACOSX' in root or file.startswith('.'):
        continue

    unique_id += 1
    file_path = Path(root) / file
    extension = file.split('.')[-1]
    original_filename = file.rsplit('.', 1)[0]

    if extension.lower() in ['jpg', 'jpeg', 'png']:
        camera_brand, original_datetime, color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset = process_image_file(file_path)
        original_date = original_datetime.date() if original_datetime else None
        original_time = original_datetime.strftime('%H:%M') if original_datetime else None

    # Video processing code goes here

    df_meta.loc[unique_id] = [unique_id, root, original_filename, extension, original_date, original_time, camera_brand, color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset]

df_meta['MatchID'] = df_meta['Original Date'].astype(str) + '_' + df_meta['Original Time'].astype(str)
df_meta.reset_index(drop=True, inplace=True)

  df_meta.loc[unique_id] = [unique_id, root, original_filename, extension, original_date, original_time, camera_brand, color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset]
  df_meta.loc[unique_id] = [unique_id, root, original_filename, extension, original_date, original_time, camera_brand, color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset]
  df_meta.loc[unique_id] = [unique_id, root, original_filename, extension, original_date, original_time, camera_brand, color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset]
  df_meta.loc[unique_id] = [unique_id, root, original_filename, extension, original_date, original_time, camera_brand, color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset]
  df_meta.loc[unique_id] = [unique_id, root, original_filename, extension, original_date, original_time, camera_brand, color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset]
  df_meta.loc[unique_id] 

In [5]:
# Save the DataFrame to a CSV file
df_meta.to_csv(PATH_ROOT + 'meta_data.csv', index=False)
print("Files copied and indexed successfully.")

df_meta

Files copied and indexed successfully.


Unnamed: 0,File ID,Original Path,Original Filename,File Extension,Original Date,Original Time,Camera Brand,Color Space,Flash Used,Flash Info,Day/Night,Camera ID,Sunrise,Sunset,MatchID
0,1,/home/tom-ratsakatika/VSCode/camera-traps/data...,IMG-20181119-WA0001- Stramtu,jpg,,,,,,,,,NaT,NaT,None_None
1,2,/home/tom-ratsakatika/VSCode/camera-traps/data...,I__00007,JPG,2019-02-08,21:23,CUDDEBACK,RGB,Y,1,Night,CC0165,2019-02-08 07:31:12+02:00,2019-02-08 17:33:36+02:00,2019-02-08_21:23
2,3,/home/tom-ratsakatika/VSCode/camera-traps/data...,I__00023,JPG,2019-01-04,22:26,CUDDEBACK,RGB,Y,1,Night,CC0165,2019-01-04 07:58:48+02:00,2019-01-04 16:46:48+02:00,2019-01-04_22:26
3,4,/home/tom-ratsakatika/VSCode/camera-traps/data...,I__00050,JPG,2019-03-31,03:22,CUDDEBACK,RGB,Y,1,Night,CC0165,2019-03-31 07:01:12+03:00,2019-03-31 19:43:48+03:00,2019-03-31_03:22
4,5,/home/tom-ratsakatika/VSCode/camera-traps/data...,I__00052,JPG,2019-04-01,21:00,CUDDEBACK,RGB,Y,1,Night,CC0165,2019-04-01 06:58:48+03:00,2019-04-01 19:45:00+03:00,2019-04-01_21:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50516,50517,/home/tom-ratsakatika/VSCode/camera-traps/data...,I__00048,JPG,2021-03-27,07:55,CUDDEBACK,RGB,N,0,Day,CC016C,2021-03-27 06:08:24+02:00,2021-03-27 18:38:24+02:00,2021-03-27_07:55
50517,50518,/home/tom-ratsakatika/VSCode/camera-traps/data...,I__00054,JPG,2021-03-28,01:36,CUDDEBACK,RGB,Y,1,Night,CC016C,2021-03-27 06:08:24+02:00,2021-03-27 18:38:24+02:00,2021-03-28_01:36
50518,50519,/home/tom-ratsakatika/VSCode/camera-traps/data...,I_00010a,JPG,2021-03-31,14:22,CUDDEBACK,RGB,N,0,Day,CC014H,2021-03-31 07:01:12+03:00,2021-03-31 19:43:48+03:00,2021-03-31_14:22
50519,50520,/home/tom-ratsakatika/VSCode/camera-traps/data...,I_00010b,JPG,2021-03-31,14:22,CUDDEBACK,RGB,N,0,Day,CC014H,2021-03-31 07:01:12+03:00,2021-03-31 19:43:48+03:00,2021-03-31_14:22


## Merge processed file names and metadata tables

In [6]:
# Perform an inner join based on the newly created datetime columns
#df_merged = pd.merge(df_meta, df_labels, left_on=['MatchID','Original Filename'], right_on=['MatchID','RawName'], how='inner')
df_merged = pd.merge(df_labels, df_meta, left_on=['MatchID'], right_on=['MatchID'], how='inner')

# Print the head of the resulting DataFrame to verify
df_merged

# Save the DataFrame to a CSV file
df_merged.to_csv('../data/structured_data.csv', index=False)

In [None]:
# Perform an inner join based on the newly created datetime columns
#df_merged_on_labels = pd.merge(df_meta, df_labels, left_on=['MatchID','Original Filename'], right_on=['MatchID','RawName'], how='inner')
df_merged_on_labels = pd.merge(df_labels, df_meta, left_on=['MatchID'], right_on=['MatchID'], how='left')

# Print the head of the resulting DataFrame to verify
df_merged_on_labels

# Save the DataFrame to a CSV file
df_merged_on_labels.to_csv('../data/merged_on_labels.csv', index=False)

## Code to copy images only from SSD

In [None]:
import os
import shutil
from tqdm import tqdm

def copy_images(src_dir, dest_dir, extensions=('.jpg', '.jpeg', '.png', '.gif', '.bmp')):
    """
    Copy image files from src_dir to dest_dir while maintaining the directory structure.
    Only files with specified extensions are copied.
    Includes a progress bar for better user feedback.
    """
    # Prepare a list of files to copy
    files_to_copy = []
    for root, dirs, files in os.walk(src_dir):
        for file in files:
            if file.lower().endswith(extensions):
                files_to_copy.append(os.path.join(root, file))

    # Ensure destination directory exists
    os.makedirs(dest_dir, exist_ok=True)

    # Copy files with a progress bar
    for file_path in tqdm(files_to_copy, desc="Copying images", unit="files"):
        relative_path = os.path.relpath(os.path.dirname(file_path), src_dir)
        dest_path = os.path.join(dest_dir, relative_path, os.path.basename(file_path))
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        shutil.copy2(file_path, dest_path)

# Define source and destination paths
        
src_directory = "/media/tom-ratsakatika/CRUCIAL 4TB/FCC Camera Trap Data/raw/CameraTrapsBackup_S5_20220304/Season5 - animals only"
dest_directory = "/home/tom-ratsakatika/VSCode/camera-traps/data/raw/Season5 - animals only"

# Copy images from source to destination
copy_images(src_directory, dest_directory)


## Get metadata from photo/video

In [None]:
from PIL import Image

def print_image_metadata(file_path):
    with Image.open(file_path) as img:
        exif_data = img._getexif()
        if exif_data:
            for tag, value in exif_data.items():
                print(f"{tag}: {Image.ExifTags.TAGS.get(tag, tag)} - {value}")

image_file_path = PATH_ROOT + 'raw/Season5 - animals only/21IzvoareleDamboviteiAG/Paraul_Manastirii/20220316_20220319/100EK113/03160203.JPG'
print_image_metadata(image_file_path)


In [None]:
import subprocess

def print_video_metadata(file_path):
    cmd = ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', '-show_streams', str(file_path)]
    result = subprocess.run(cmd, capture_output=True, text=True)
    print(result.stdout)

video_file_path = 'path_to_your_video_file.mp4'
print_video_metadata(video_file_path)


## Next steps

- Maybe ignore duplicates - images captured at the same time generally the same animal
- look to merge back into original labelled data excel sheet

In [None]:
def extract_camera_id(user_comments):
    # Assuming user comments are a byte string that needs decoding
    try:
        decoded_comments = user_comments.decode('utf-8')
        parts = decoded_comments.split(',')
        for part in parts:
            if part.startswith('ID='):
                return part.split('=')[1]
    except:
        return None
    
def safe_convert_datetime(exif_date_str):
    try:
        # Correct the format from 'YYYY:MM:DD HH:MM:SS' to 'YYYY-MM-DD HH:MM:SS'
        if exif_date_str:
            formatted_date_str = exif_date_str.replace(':', '-', 2)
            return pd.to_datetime(formatted_date_str, errors='coerce')
    except Exception as e:
        print(f"Failed to convert date: {exif_date_str} with error: {e}")
    return None

# Location coordinates for Brașov, Romania
brasov_lat, brasov_lon = 45.6580, 25.6012
sun = Sun(brasov_lat, brasov_lon)

def is_night(photo_datetime, sunrise, sunset):
    return not (sunrise <= photo_datetime <= sunset)



processed_data_path.mkdir(parents=True, exist_ok=True)

# DataFrame to hold the file information
columns = ['File ID', 'Original Path', 'Original Filename', 'File extension', 'Original Date', 'Original Time', 'Camera Brand',  'color_space', 'flash_used', 'flash_info', 'day_night', 'camera_id', 'sunrise', 'sunset']
df_meta = pd.DataFrame(columns=columns)

# Collect all files to process for progress tracking
all_files = []
for root, dirs, files in os.walk(raw_data_path):
    for file in files:
        all_files.append((root, file))

# Processing files with progress bar
unique_id = 0  # Unique reference number for each file
for root, file in tqdm(all_files, desc="Indexing files"):

    camera_brand, original_date, original_time, color_space, flash_used, \
    flash_info, day_night, camera_id = None, None, None, None, None, None, None, None

    unique_id += 1
    file_path = Path(root) / file
    original_path = Path(root)
    extension = file.split('.')[-1]
    original_filename = file.rsplit('.', 1)[0]  # filename without the extension
    
    # Determine processing based on file type
    if extension.lower() == 'jpg' or extension.lower() == 'jpeg' or extension.lower() == 'png':
        # Process image file
        try:
            with Image.open(file_path) as img:
                exif_data = img._getexif()
                
                if exif_data is not None:
                    camera_brand = exif_data.get(271, None)  # 'Make' tag using decimal notation
                    created_on = exif_data.get(36867, None)  # 'DateTimeOriginal' tag using decimal notation

                    if created_on:
                        original_datetime = safe_convert_datetime(created_on)
                        original_date = original_datetime.date()
                        original_time = original_datetime.strftime('%H:%M')

                    if camera_brand:
                        camera_brand = str(camera_brand).title()

                    # Color space extraction
                    color_space = 'RGB' if exif_data.get(40961) == 1 else exif_data.get(40961) # 'Unknown'

                    # Flash usage and detailed info
                    flash_details = exif_data.get(37385)
                    if flash_details is not None:
                        flash_used = 'Y' if flash_details & 0b0001 else 'N'
                        flash_info = flash_details
                    else:
                        flash_used = 'N'
                        flash_info = "No Flash Info"

                    # Day/Night classification based on DateTimeOriginal
                        
                    # Extract date and time the photo was originally taken
                    created_on_formatted = safe_convert_datetime(created_on)  # Safely convert the datetime

                    if created_on_formatted:
                        hour = created_on_formatted.hour
                        day_night = 'Night' if 19 <= hour or hour < 7 else 'Day'

                    if original_datetime:
                        # Calculate sunrise and sunset for the date of the photo
                        sunrise = sun.get_sunrise_time(original_datetime).replace(tzinfo=None)
                        sunset = sun.get_sunset_time(original_datetime).replace(tzinfo=None)

                        # Determine if it's night or day
                        day_night = 'Night' if is_night(original_datetime, sunrise, sunset) else 'Day'

                    # Camera ID from user comments
                    user_comments = exif_data.get(37510)
                    if user_comments:
                        camera_id = extract_camera_id(user_comments)
        
        except Exception as e:
            import datetime
            current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            error_message = f"{current_time}: {e}\n"
            with open(error_log_path, 'a') as file:
                file.write(error_message)

    elif extension.lower() == 'mp4' or extension.lower() == 'mov' or extension.lower() == 'm4v':
        # Process video file
        try:
            # Using ffprobe to extract creation time from video files
            cmd = ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
                   '-show_entries', 'stream_tags=creation_time', '-of', 'default=noprint_wrappers=1:nokey=1', str(file_path)]
            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            creation_time = result.stdout.strip()
            if creation_time:
                creation_time = pd.to_datetime(creation_time).strftime('%Y-%m-%d %H:%M')
                original_date, original_time = creation_time.split(' ')
        
        except Exception as e:
            import datetime
            current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            error_message = f"{current_time}: {e}\n"
            with open(error_log_path, 'a') as file:
                file.write(error_message)

    # Prepare row for DataFrame

    row = [
        unique_id, original_path, original_filename, extension, original_date, original_time, camera_brand,
        color_space, flash_used, flash_info, day_night, camera_id, sunrise, sunset
    ]

    df_meta.loc[unique_id] = row

df_meta['MatchID'] = df_meta['Original Date'].astype(str) + '_' + df_meta['Original Time'].astype(str)
df_meta.reset_index(drop=True, inplace=True)
