## Load the meta data into a dataframe

In [2]:
import os
import shutil
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import re
from exif import Image
from PIL import Image
from tqdm import tqdm
import subprocess
import json

# Paths configuration
PATH_SSD = "/media/tom-ratsakatika/CRUCIAL 4TB/"
raw_data_path = Path(PATH_SSD + 'camera_trap_raw/data')
processed_data_path = Path(PATH_SSD + 'datasets')
error_log_path = PATH_SSD + 'camera_trap_raw/error_log.txt'


In [2]:
df_labels = pd.read_excel(PATH_SSD + 'camera_trap_raw/labelled_data.xlsx')

# Ensure 'Date' is a datetime object
df_labels['Date'] = pd.to_datetime(df_labels['Date'], errors='coerce', format='%Y-%m-%d')

# Ensure 'Time' is converted to datetime; this may require correct formatting if actual data has errors like above
df_labels['Time'] = pd.to_datetime(df_labels['Time'], errors='coerce', format='%H:%M:%S').dt.time

# Create formatted date and time strings
df_labels['Formatted Date'] = df_labels['Date'].dt.strftime('%Y-%m-%d')
df_labels['Formatted Time'] = df_labels['Time'].astype(str).str[:5]  # Slicing to get HH:MM

# Concatenate formatted date and time with an underscore
df_labels['MatchID'] = df_labels['Formatted Date'] + '_' + df_labels['Formatted Time']

# Display the updated DataFrame
df_labels

  for idx, row in parser.parse():
  for idx, row in parser.parse():


Unnamed: 0,Sort,Session,Category,GMU,TrapSite,Longitude,Latitude,Altitude,StartDate,EndDate,...,Species,NoAnimals,Sex,Image quality,Problem,Sequence,Comments,Formatted Date,Formatted Time,MatchID
0,1,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Tamasului,25.172013,45.536521,1463,2018-02-15 00:00:00,2018-04-05 00:00:00,...,Bear,1,u,medium,partly,6,date unknown,2018-01-01,NaT,2018-01-01_NaT
1,2,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Tamasului,25.172013,45.536521,1463,2018-02-15 00:00:00,2018-04-05 00:00:00,...,European hare,1,u,bad,overexposed,2,date unknown,2018-01-01,NaT,2018-01-01_NaT
2,3,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Otetelea,25.184649,45.527437,1515,2018-02-15 00:00:00,2018-04-05 00:00:00,...,Fox,1,u,medium,overexposed,1,pooing; date unknown,2018-01-01,NaT,2018-01-01_NaT
3,4,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Otetelea,25.184649,45.527437,1515,2018-02-15 00:00:00,2018-04-05 00:00:00,...,Fox,1,u,good,no problem,2,date unknown,2018-01-01,NaT,2018-01-01_NaT
4,5,1,Systematic monitoring of lynx,29PiatraCraiuluiBV,Otetelea,25.184649,45.527437,1515,2018-02-15 00:00:00,2018-04-05 00:00:00,...,Fox,1,u,bad,overexposed,1,date unknown,2018-01-01,NaT,2018-01-01_NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19022,19023,5,Systematic monitoring of feeding points,31BarsaBV,Valea Parului,25.22696,45.584392,880,2021-05-24 00:00:00,2021-06-15 00:00:00,...,Squirrel,1,u,medium,no problem,1,na,2021-06-05,06:06,2021-06-05_06:06
19023,19024,5,Systematic monitoring of feeding points,31BarsaBV,Valea Parului,25.22696,45.584392,880,2021-05-24 00:00:00,2021-06-15 00:00:00,...,Fox,1,u,medium,no problem,1,na,2021-06-07,23:20,2021-06-07_23:20
19024,19025,5,Systematic monitoring of feeding points,31BarsaBV,Valea Parului,25.22696,45.584392,880,2021-05-24 00:00:00,2021-06-15 00:00:00,...,Roe deer,1,m,medium,no problem,1,na,2021-06-08,09:03,2021-06-08_09:03
19025,19026,5,Systematic monitoring of feeding points,31BarsaBV,Valea Parului,25.22696,45.584392,880,2021-06-16 00:00:00,2021-06-16 00:00:00,...,Fox,1,u,medium,no problem,2,na,2021-06-16,07:15,2021-06-16_07:15


## Save all files with unique filenames in ../data/processed

In [8]:
processed_data_path.mkdir(parents=True, exist_ok=True)

# DataFrame to hold the file information
columns = ['File ID', 'Original Path', 'Original Filename', 'File extension', 'Original Date', 'Original Time', 'Camera Brand', 'Flash']
df_meta = pd.DataFrame(columns=columns)

# Collect all files to process for progress tracking
all_files = []
for root, dirs, files in os.walk(raw_data_path):
    for file in files:
        all_files.append((root, file))

# Processing files with progress bar
unique_id = 0  # Unique reference number for each file
for root, file in tqdm(all_files, desc="Copying files"):
    unique_id += 1
    file_path = Path(root) / file
    original_path = Path(root)
    extension = file.split('.')[-1]
    original_filename = file.rsplit('.', 1)[0]  # filename without the extension
    new_filename = f"{unique_id}.{extension}"
    shutil.copy(file_path, processed_data_path / new_filename)
    
    camera_brand, original_date, original_time = None, None, None  # Initialize to None
    # Determine processing based on file type
    if extension.lower() == 'jpg' or extension.lower() == 'jpeg' or extension.lower() == 'png':
        # Process image file
        try:
            with Image.open(file_path) as img:
                exif_data = img._getexif()
                
                if exif_data is not None:
                    camera_brand = exif_data.get(0x010F, None)  # 'Make' tag
                    created_on = exif_data.get(0x9003, None)  # 'DateTimeOriginal' tag

                    if created_on:
                        created_on = created_on.replace(':', '-', 2)  # Format date correctly
                        original_datetime = pd.to_datetime(created_on)
                        original_date = original_datetime.date()
                        original_time = original_datetime.strftime('%H:%M')

                    if camera_brand:
                        camera_brand = str(camera_brand).title()
        
        except Exception as e:
            with open(error_log_path, 'a') as file:
                file.write(f"Error processing image {file_path}: {e}\n")

#    elif extension.lower() == 'mp4' or extension.lower() == 'mov' or extension.lower() == 'm4v':
#        # Process video file
#        try:
#            # Using ffprobe to extract creation time from video files
#            cmd = ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
#                   '-show_entries', 'stream_tags=creation_time', '-of', 'default=noprint_wrappers=1:nokey=1', str(file_path)]
#            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
#            creation_time = result.stdout.strip()
#            if creation_time:
#                creation_time = pd.to_datetime(creation_time).strftime('%Y-%m-%d %H:%M')
#                original_date, original_time = creation_time.split(' ')
#        
#        except Exception as e:
#            with open(error_log_path, 'a') as file:
#                file.write(f"Error processing video {file_path}: {e}\n")

    # Prepare row for DataFrame
    row = [unique_id, original_path, original_filename, extension, original_date, original_time, camera_brand]
    df_meta.loc[unique_id] = row

df_meta['MatchID'] = df_meta['Original Date'].astype(str) + '_' + df_meta['Original Time'].astype(str)
df_meta.reset_index(drop=True, inplace=True)

# Save the DataFrame to a CSV file
df_meta.to_csv(PATH_SSD + 'camera_trap_raw/meta_data.csv', index=False)
print("Files copied and indexed successfully.")

df_meta


Copying files: 100%|██████████| 3983/3983 [01:07<00:00, 59.41it/s] 

Files copied and indexed successfully.





Unnamed: 0,File ID,Original Filename,File extension,Camera Brand,Original Date,Original Time,MatchID
0,1,am pastrat doar folderele cu poze,txt,,,,None_None
1,2,I_00021a,JPG,Cuddeback,2023-10-07,14:16,2023-10-07_14:16
2,3,I_00021b,JPG,Cuddeback,2023-10-07,14:16,2023-10-07_14:16
3,4,I_00045a,JPG,Cuddeback,2020-02-14,14:15,2020-02-14_14:15
4,5,I_00045b,JPG,Cuddeback,2020-02-14,14:15,2020-02-14_14:15
...,...,...,...,...,...,...,...
3978,3979,L__00001,JPG,Cuddeback,2023-10-26,11:34,2023-10-26_11:34
3979,3980,L__00001,JPG,Cuddeback,2023-10-27,11:34,2023-10-27_11:34
3980,3981,L__00001,JPG,Cuddeback,2023-11-11,10:34,2023-11-11_10:34
3981,3982,I__00014,JPG,Cuddeback,2023-11-07,19:57,2023-11-07_19:57


## Merge processed file names and metadata tables

In [9]:
# Perform an inner join based on the newly created datetime columns
#df_merged = pd.merge(df_meta, df_labels, left_on=['MatchID','Original Filename'], right_on=['MatchID','RawName'], how='inner')
df_merged = pd.merge(df_meta, df_labels, left_on=['MatchID'], right_on=['MatchID'], how='inner')

# Print the head of the resulting DataFrame to verify
df_merged

# Save the DataFrame to a CSV file
df_merged.to_csv('../data/structured_data.csv', index=False)

In [10]:
# Define the base paths
source_base_path = '../data/processed'
destination_base_path = '../data/processed'  # Adjust this path as necessary

# Make sure the destination base path exists
os.makedirs(destination_base_path, exist_ok=True)

# Loop through the filtered DataFrame
for index, row in df_merged.iterrows():
    file_name = f"{row['File ID']}.{row['File extension'].strip()}"
    species_folder = os.path.join(destination_base_path, row['Species'])

    # Create the species directory if it doesn't already exist
    os.makedirs(species_folder, exist_ok=True)

    # Construct full source and destination paths
    source_path = os.path.join(source_base_path, file_name)
    destination_path = os.path.join(species_folder, file_name)

    # Copy the file from source to destination
    if os.path.exists(source_path):  # Check if the file exists to avoid errors
        shutil.copy2(source_path, destination_path)
    else:
        print(f"File not found: {source_path}")



## Next steps

- Maybe ignore duplicates - images captured at the same time generally the same animal
- look to merge back into original labelled data excel sheet