In [10]:
import os
import sys

import numpy as np
import pandas as pd
from pathlib import Path


In [2]:

label_root = "../../Labelling/Lemurs/labelling_app_indID/raw_labels/"

files_in_label_root = os.listdir(label_root)

group = "R1"

#filename = next((f for f in files_in_label_root if exp in f), None)
# Initialize an empty DataFrame with the specified columns
columns = ['trackNumber', 'trackId', 'xCoord', 'yCoord', 'width', 'height', 'confidenceTrack', 'species', 'nameOrder', 'confidenceId', 'experiment']
combined_df = pd.DataFrame(columns=columns)

files_starting_with_A = [f for f in files_in_label_root if f.startswith(group[0])]

for i, filename in enumerate(files_starting_with_A):

    file_path = f"{label_root}/{filename}"

    with open(file_path, "r") as file:
        file_content = file.readlines()

    metadata = [line.strip("#").strip() for line in file_content if line.startswith("#")]
    metadata_dict = dict(item.split(": ") for item in metadata)

    username = metadata_dict["username"]
    editDate = metadata_dict["editDate"]
    orderedNames = metadata_dict["orderedNames"].split(", ")
    if i > 0:
        if orderedNames != previous_orderedNames:
            print(f"Warning: orderedNames in {filename} do not match the previous file.")
    previous_orderedNames = orderedNames
    dataColumns = metadata_dict["dataColumns"].split(", ")

    df_id = pd.read_csv(file_path, skiprows=len(metadata), names=dataColumns).sort_values(["species", "trackId", "trackNumber"])

    filtered_df_id = df_id[df_id['species'] == 0]

    filtered_df_id['experiment'] = '_'.join(filename.split('_', 3)[:3])

    # Append the filtered DataFrame to the combined DataFrame
    combined_df = pd.concat([combined_df, filtered_df_id], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_id['experiment'] = '_'.join(filename.split('_', 3)[:3])


In [3]:
# Get the indices for 'Uns' and 'Unsure' from the orderedNames list
uns_index = orderedNames.index('Uns')
unsure_index = orderedNames.index('Unsure')

# Merge the nameOrder numbers that belong to Uns and Unsure
combined_df['nameOrder'] = combined_df['nameOrder'].replace({uns_index: unsure_index})



In [4]:
name_order_counts = combined_df['nameOrder'].value_counts()
name_order_counts.index = name_order_counts.index.map(lambda x: f"{orderedNames[x]} ({x})")
print(name_order_counts)
print(orderedNames)

Unsure (7)    217096
Ata (3)       140153
Zemlya (4)    114344
Gerald (6)     98539
Yaya (1)       57064
Novaya (2)     43452
Tiwi (0)       12243
Croker (5)      6361
Name: nameOrder, dtype: int64
['Tiwi', 'Yaya', 'Novaya', 'Ata', 'Zemlya', 'Croker', 'Gerald', 'Unsure', 'Uns']


In [5]:
import re

# Get nameOrders with counts under 1000
under_1000_nameOrders = name_order_counts[name_order_counts < 1000].index
under_1000_nameOrders = under_1000_nameOrders.map(lambda x: int(re.search(r'\((\d+)\)', x).group(1)))
print(under_1000_nameOrders)


Index([], dtype='object')


In [6]:

# Filter out the rows where nameOrder corresponds to 'Unsure'
filtered_combined_df = combined_df[~combined_df['nameOrder'].isin(under_1000_nameOrders)]

# Sample 1000 examples from each nameOrder
sampled_df = filtered_combined_df.groupby('nameOrder').apply(lambda x: x.sample(n=1000, replace=True)).reset_index(drop=True)



# Get all rows for nameOrders with counts under 1000
under_1000_df = combined_df[combined_df['nameOrder'].isin(under_1000_nameOrders)]

# Combine with the sampled DataFrame
sampled_df = pd.concat([sampled_df, under_1000_df], ignore_index=True)

# Reorder sampled_df by the columns experiment and trackNumber
sampled_df = sampled_df.sort_values(by=['experiment', 'trackNumber']).reset_index(drop=True)

name_order_counts = sampled_df['nameOrder'].value_counts()
name_order_counts.index = name_order_counts.index.map(lambda x: f"{orderedNames[x]} ({x})")
print(name_order_counts)

Zemlya (4)    1000
Gerald (6)    1000
Yaya (1)      1000
Unsure (7)    1000
Ata (3)       1000
Croker (5)    1000
Novaya (2)    1000
Tiwi (0)      1000
Name: nameOrder, dtype: int64


In [7]:
sampled_df.head(10)

Unnamed: 0,trackNumber,trackId,xCoord,yCoord,width,height,confidenceTrack,species,nameOrder,confidenceId,experiment
0,3461,4,234.671392,279.814275,521.384865,275.683442,0.76426,0,4,1,R_e1_c1
1,3572,4,29.211853,310.175364,333.671319,296.926842,0.986672,0,4,1,R_e1_c1
2,3577,4,28.992914,311.588947,334.202115,295.85492,0.968546,0,4,1,R_e1_c1
3,3590,4,28.105295,303.405775,356.959731,307.016857,0.988491,0,4,1,R_e1_c1
4,3735,4,84.034239,322.985789,298.197,240.702589,0.974689,0,4,1,R_e1_c1
5,4095,4,81.969072,299.009562,237.222348,256.944192,0.9739,0,4,1,R_e1_c1
6,4279,4,91.521404,309.591483,232.612228,239.774047,0.968654,0,4,1,R_e1_c1
7,4427,4,71.871896,320.208169,243.597747,233.300055,0.970734,0,4,1,R_e1_c1
8,4436,4,74.53862,316.353269,244.838446,239.166019,0.97743,0,4,1,R_e1_c1
9,4566,4,133.064824,299.92661,192.722297,245.847301,0.95455,0,4,1,R_e1_c1


In [8]:
import cv2

path_to_videos = f"/usr/users/vogg/sfb1528s3/B06/2023april-july/NewBoxesClosed/Converted/{group}/"

group_folder = os.path.join(label_root, "..", group)
os.makedirs(group_folder, exist_ok=True)

images_folder = os.path.join(group_folder, "images")
labels_with_ids_folder = os.path.join(group_folder, "labels_with_ids")

os.makedirs(images_folder, exist_ok=True)
os.makedirs(labels_with_ids_folder, exist_ok=True)

for i in range(len(sampled_df)):
    if i % 1000 == 0:
        print(f"Processing row {i}")
    row = sampled_df.iloc[i]

    # Normalize coordinates and dimensions
    xCoord_norm = row['xCoord'] / 1920
    yCoord_norm = row['yCoord'] / 1080
    width_norm = row['width'] / 1920
    height_norm = row['height'] / 1080

    filename = f"{row['experiment']}_{row['trackNumber']}.txt"
    file_path = os.path.join(labels_with_ids_folder, filename)

    with open(file_path, 'a') as f:
        f.write(f"0 0 {xCoord_norm} {yCoord_norm} {width_norm} {height_norm} {row['nameOrder']}\n")

    if i == 0 or row['experiment'] != sampled_df.iloc[i-1]['experiment']:
        if i > 0:
            if cap.isOpened():
                cap.release()
        video_path = os.path.join(path_to_videos, f"{row['experiment']}.mp4")
        cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
    else:
        frame_number = int(row['trackNumber'])
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = cap.read()
        if ret:
            image_filename = f"{row['experiment']}_{row['trackNumber']}.png"
            image_path = os.path.join(images_folder, image_filename)
            cv2.imwrite(image_path, frame)
            #print(f"Frame {frame_number} saved as {image_path}")
        else:
            print(f"Error: Could not read frame {frame_number}")


Processing row 0
Processing row 1000
Processing row 2000
Processing row 3000
Processing row 4000
Processing row 5000
Processing row 6000
Processing row 7000


### Make .train and .val files

In [23]:
from collections import Counter

data_root = Path("/usr/users/vogg/Labelling/Lemurs/labelling_app_indID/R1/")
label_list = [item for item in os.listdir(data_root / "labels_with_ids") if not item.startswith(".")]
print(len(label_list))
print(label_list[:5])

# Group by the first 7 letters of each filename
grouped_filenames = [filename[:7] for filename in label_list]
grouped_counts = Counter(grouped_filenames)

print(grouped_counts)

7809
['R_e2_c3_14383.txt', 'R_e6_c3_9118.txt', 'R_e4_c3_20939.txt', 'R_e2_c1_13709.txt', 'R_e1_c2_17443.txt']
Counter({'R_e1_c2': 612, 'R_e6_c3': 585, 'R_e1_c1': 542, 'R_e1_c3': 534, 'R_e1_c4': 498, 'R_e6_c2': 433, 'R_e2_c4': 431, 'R_e2_c3': 400, 'R_e3_c2': 388, 'R_e3_c3': 385, 'R_e2_c2': 363, 'R_e3_c4': 362, 'R_e2_c1': 350, 'R_e6_c1': 335, 'R_e3_c1': 306, 'R_e6_c4': 224, 'R_e5_c1': 207, 'R_e5_c2': 201, 'R_e4_c4': 168, 'R_e5_c3': 117, 'R_e4_c1': 109, 'R_e4_c3': 101, 'R_e4_c2': 85, 'R_e5_c4': 73})


In [25]:
#/usr/users/vogg/monkey-tracking-in-the-wild/src/data/

# Alpha: val (A_e2_c4, A_e5_c4, A_e3_c3, A_e6_c2)
# B: val (B_e6_c2, B_e3_c1, B_e3_c4, B_e4_c1)
# J: val (J_e1_c2, J_e3_c2, J_e4_c4, J_e5_c4)
# R1: val (R_e1_c3, R_e2_c3, R_e6_c4, R_e5_c2)

for img_id in label_list:
    if not img_id.startswith(".") and not any(img_id.startswith(prefix) for prefix in ["R_e1_c3", "R_e2_c3", "R_e6_c4", "R_e5_c2"]):
        label_fpath = data_root / "lemur_ids_R1.train"

        label_str = data_root.name + "/images/" + img_id.replace("txt", "png") + "\n"

        with open(label_fpath, 'a') as f:
            f.write(label_str)

In [85]:
import random

def sample_with_min_distance(numbers, sample_size, min_distance):
    selected = []
    remaining = set(numbers)
    
    while len(selected) < sample_size and remaining:
        # Randomly choose from the remaining numbers
        candidate = random.choice(list(remaining))
        selected.append(candidate)
        
        # Remove numbers within the restricted distance of the chosen number
        remaining = {x for x in remaining if abs(x - candidate) >= min_distance}
    
    if len(selected) < sample_size:
        raise ValueError("Not enough numbers to satisfy the distance constraint.")
    
    return selected

# Example usage
numbers = list(range(1000))
sample_size = 158
min_distance = 5

result = sample_with_min_distance(numbers, sample_size, min_distance)
print(result)


ValueError: Not enough numbers to satisfy the distance constraint.