In [1]:
import pandas as pd
import os
from datetime import timedelta

In [2]:
# Output/input folder for the Snapshot USA files, relative to the working directory.
# Built with os.path.join so the separator is correct on every OS; a literal
# backslash would create a single folder named "data\ssusa" on POSIX systems.
save_dir = os.path.join('data', 'ssusa')
os.makedirs(save_dir, exist_ok=True)  # no-op when the folder already exists

In [3]:
# Directory that holds the input CSV (same folder the notebook writes to)
data_dir = save_dir

# Full path of the sequences table; os.path.join picks the right separator
# for the current OS instead of hard-coding a Windows backslash.
sequences_path = os.path.join(data_dir, 'ssusa_finalsequences.csv')

In [4]:
# Load the sequences table.
# - Start_Time / End_Time are parsed as datetimes.
# - Year and Group_Size use the nullable Int64 dtype; Sequence_ID stays as
#   object so its values are not coerced to numbers.
# - "NA" and a single space are treated as missing, in addition to the
#   pandas default NA markers.
sequences_df = pd.read_csv(
    sequences_path,
    parse_dates=["Start_Time", "End_Time"],
    dtype={"Year": "Int64", "Sequence_ID": "object", "Group_Size": "Int64"},
    na_values=["NA", " "],
    keep_default_na=True,
)
print(f"Number of records in sequences_df: {sequences_df.shape[0]}")

Number of records in sequences_df: 987979


In [5]:
# Keep mammal records only (exact, case-sensitive match on 'Mammalia').
# .copy() makes the result an independent frame, so the column assignments
# performed later do not operate on a slice of the loaded table
# (which is what triggers pandas' SettingWithCopyWarning).
sequences_df = sequences_df.loc[sequences_df['Class'] == 'Mammalia'].copy()

In [6]:
# Text columns whose casing is normalised below.
# NOTE: despite the variable name, the values are converted to lower case
# (str.lower), not Proper/Title Case.
prop_case_cols = [
    'Class', 'Order', 'Family', 'Genus', 'Species', 'Common_Name', 'Age', 'Sex'
]

# Turn blank / whitespace-only strings in Age, Sex and Group_Size into missing values
blanks_removed = sequences_df[['Age', 'Sex', 'Group_Size']].replace(r'^\s*$', pd.NA, regex=True)

# Group_Size: missing or non-numeric values become 0, result is a plain integer column
group_size = (
    pd.to_numeric(blanks_removed['Group_Size'].fillna(0), errors='coerce')
    .fillna(0)
    .astype(int)
)

# Build a new frame instead of assigning columns onto a filtered slice.
# Age and Sex: missing values are labelled 'Unknown' (lower-cased in the next step).
sequences_df = sequences_df.assign(
    Age=blanks_removed['Age'].fillna('Unknown'),
    Sex=blanks_removed['Sex'].fillna('Unknown'),
    Group_Size=group_size,
)

# Lower-case every selected text column
sequences_df = sequences_df.assign(
    **{name: sequences_df[name].str.lower() for name in prop_case_cols}
)

In [7]:
def find_close_duplicates(group, minutes=5):
    """Return the index labels of rows that have a same-name neighbour close in time.

    A row is flagged when at least one *other* row of ``group`` has the same
    ``Common_Name`` and a ``Start_Time`` or an ``End_Time`` that lies within
    ``minutes`` (bounds inclusive) of this row's ``Start_Time``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to compare against each other. Must contain the columns
        ``Start_Time``, ``End_Time`` and ``Common_Name``.
    minutes : int or float, default 5
        Half-width of the time window around each row's ``Start_Time``.

    Returns
    -------
    list
        Labels taken from ``group.index`` for every flagged row, in row order.

    Notes
    -----
    The window is anchored on the row's own ``Start_Time`` only; the row's
    ``End_Time`` is never used as an anchor. The relation is therefore not
    symmetric: row A can be flagged because of row B without B being flagged.
    NOTE(review): confirm this is the intended definition of "close".
    """
    # Fewer than two rows means there is no other row to be close to.
    if len(group) < 2:
        return []

    interval = timedelta(minutes=minutes)

    # Loop-invariant column lookups, done once instead of on every iteration.
    starts = group['Start_Time']
    ends = group['End_Time']
    names = group['Common_Name']
    labels = group.index

    flagged = []
    # Iterating the columns directly avoids building a full row object
    # (group.iloc[i]) twice per iteration.
    for pos, (start, common_name) in enumerate(zip(starts, names)):
        lower, upper = start - interval, start + interval
        # Series.between is inclusive on both ends, i.e. (>= lower) & (<= upper).
        near = (
            (starts.between(lower, upper) | ends.between(lower, upper))
            & (names == common_name)  # only compare rows with the same Common_Name
        )
        near.iloc[pos] = False  # a row must not match itself
        if near.any():
            flagged.append(labels[pos])
    return flagged

In [8]:
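# NOTE: superseded draft, kept commented out for reference only. It ignores Common_Name and
# hard-codes a 5-minute window; the live version is find_close_duplicates(group, minutes=...) defined above.
# from datetime import timedelta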

# # Sort by Deployment_ID and Start_Time
# sequences_df_sorted = sequences_df.sort_values(['Deployment_ID', 'Start_Time'])

# # For each row, check if there is another row within 5 minutes in the same Deployment_ID
# def find_close_duplicates(group):
#     idx = []
#     for i in range(len(group)):
#         start = group.iloc[i]['Start_Time']
#         # Check if any other row's Start_Time or End_Time is within 5 minutes
#         mask = (
#             ((group['Start_Time'] >= start - timedelta(minutes=5)) & (group['Start_Time'] <= start + timedelta(minutes=5))) |
#             ((group['End_Time'] >= start - timedelta(minutes=5)) & (group['End_Time'] <= start + timedelta(minutes=5)))
#         )
#         # Exclude self
#         mask.iloc[i] = False
#         if mask.any():
#             idx.append(group.index[i])
#     return idx

# close_dup_idx = []
# for _, group in sequences_df_sorted.groupby('Deployment_ID'):
#     close_dup_idx.extend(find_close_duplicates(group))

# close_duplicates = sequences_df.loc[close_dup_idx]
# print(f"Number of 5 mins duplicate = {len(close_duplicates)}")

In [9]:
# Window sizes to evaluate, in minutes
intervals = [1, 5, 10, 15, 30, 60]

# Results keyed by window size: flagged index labels and the matching rows
all_close_dup_idx = {}
all_close_duplicates = {}

# Order rows by deployment and start time before splitting them per deployment
sequences_df_sorted = sequences_df.sort_values(['Deployment_ID', 'Start_Time'])

for mins in intervals:
    # Collect the flagged labels of every deployment for this window size
    close_dup_idx = [
        label
        for _, group in sequences_df_sorted.groupby('Deployment_ID')
        for label in find_close_duplicates(group, minutes=mins)
    ]
    flagged_rows = sequences_df.loc[close_dup_idx]

    all_close_dup_idx[mins] = close_dup_idx
    all_close_duplicates[mins] = flagged_rows

    print(f"Interval: {mins} min, Number of duplicates: {len(close_dup_idx)}")
    display(flagged_rows.head())

Interval: 1 min, Number of duplicates: 43707


Unnamed: 0,Year,Project,Camera_Trap_Array,Deployment_ID,Sequence_ID,Start_Time,End_Time,Class,Order,Family,Genus,Species,Common_Name,Age,Sex,Group_Size
607528,2022,Snapshot USA 2022,Afognak,AK_Forest_Afognak_22_C22-16,2978537,2022-09-10 02:12:12,2022-09-10 02:12:38,mammalia,lagomorpha,leporidae,lepus,americanus,snowshoe hare,unknown,unknown,1
571610,2022,Snapshot USA 2022,Afognak,AK_Forest_Afognak_22_C22-18 08/31/2022,2985790,2022-09-03 19:45:55,2022-09-03 19:45:57,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
569901,2022,Snapshot USA 2022,Afognak,AK_Forest_Afognak_22_C22-57 09/01/2022,2988518,2022-09-12 07:19:50,2022-09-12 07:20:35,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
3,2019,Snapshot USA 2019,Crupi,AK_Forest_Chilkat_Preserve_1,d58722s4,2019-08-31 20:58:00,2019-08-31 20:58:00,mammalia,carnivora,ursidae,ursus,arctos,brown bear,adult,female,1
4,2019,Snapshot USA 2019,Crupi,AK_Forest_Chilkat_Preserve_1,d58722s4,2019-08-31 20:58:00,2019-08-31 20:58:00,mammalia,carnivora,ursidae,ursus,arctos,brown bear,juvenile,unknown,2


Interval: 5 min, Number of duplicates: 241400


Unnamed: 0,Year,Project,Camera_Trap_Array,Deployment_ID,Sequence_ID,Start_Time,End_Time,Class,Order,Family,Genus,Species,Common_Name,Age,Sex,Group_Size
977986,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477867,2023-10-15 15:53:51,2023-10-15 15:53:51,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,2
977987,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477868,2023-10-15 15:57:15,2023-10-15 15:57:15,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,3
978000,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477881,2023-10-13 19:00:36,2023-10-13 19:00:40,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1
978006,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477882,2023-10-13 19:05:18,2023-10-13 19:05:28,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
978005,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477887,2023-10-19 20:32:13,2023-10-19 20:32:53,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,male,1


Interval: 10 min, Number of duplicates: 312867


Unnamed: 0,Year,Project,Camera_Trap_Array,Deployment_ID,Sequence_ID,Start_Time,End_Time,Class,Order,Family,Genus,Species,Common_Name,Age,Sex,Group_Size
977986,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477867,2023-10-15 15:53:51,2023-10-15 15:53:51,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,2
977987,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477868,2023-10-15 15:57:15,2023-10-15 15:57:15,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,3
978000,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477881,2023-10-13 19:00:36,2023-10-13 19:00:40,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1
978006,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477882,2023-10-13 19:05:18,2023-10-13 19:05:28,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
978005,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477887,2023-10-19 20:32:13,2023-10-19 20:32:53,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,male,1


Interval: 15 min, Number of duplicates: 353389


Unnamed: 0,Year,Project,Camera_Trap_Array,Deployment_ID,Sequence_ID,Start_Time,End_Time,Class,Order,Family,Genus,Species,Common_Name,Age,Sex,Group_Size
977986,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477867,2023-10-15 15:53:51,2023-10-15 15:53:51,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,2
977987,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477868,2023-10-15 15:57:15,2023-10-15 15:57:15,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,3
978000,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477881,2023-10-13 19:00:36,2023-10-13 19:00:40,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1
978006,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477882,2023-10-13 19:05:18,2023-10-13 19:05:28,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
978001,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477883,2023-10-13 19:17:59,2023-10-13 19:18:12,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1


Interval: 30 min, Number of duplicates: 423405


Unnamed: 0,Year,Project,Camera_Trap_Array,Deployment_ID,Sequence_ID,Start_Time,End_Time,Class,Order,Family,Genus,Species,Common_Name,Age,Sex,Group_Size
977986,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477867,2023-10-15 15:53:51,2023-10-15 15:53:51,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,2
977987,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477868,2023-10-15 15:57:15,2023-10-15 15:57:15,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,3
978000,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477881,2023-10-13 19:00:36,2023-10-13 19:00:40,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1
978006,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477882,2023-10-13 19:05:18,2023-10-13 19:05:28,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
978001,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477883,2023-10-13 19:17:59,2023-10-13 19:18:12,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1


Interval: 60 min, Number of duplicates: 494008


Unnamed: 0,Year,Project,Camera_Trap_Array,Deployment_ID,Sequence_ID,Start_Time,End_Time,Class,Order,Family,Genus,Species,Common_Name,Age,Sex,Group_Size
977986,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477867,2023-10-15 15:53:51,2023-10-15 15:53:51,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,2
977987,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477868,2023-10-15 15:57:15,2023-10-15 15:57:15,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,3
977994,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477879,2023-10-12 14:29:58,2023-10-12 14:29:58,mammalia,carnivora,canidae,canis,latrans,coyote,unknown,unknown,1
977995,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477880,2023-10-12 15:23:05,2023-10-12 15:23:06,mammalia,carnivora,canidae,canis,latrans,coyote,unknown,unknown,1
978000,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477881,2023-10-13 19:00:36,2023-10-13 19:00:40,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1


In [10]:
# Write one CSV per window size into data_dir (the DataFrame index is not written)
for mins in all_close_duplicates:
    df = all_close_duplicates[mins]
    out_path = os.path.join(data_dir, f"close_duplicates_{mins}min.csv")
    df.to_csv(out_path, index=False)
    print(f"Saved {len(df)} records to {out_path}")

Saved 43707 records to data\ssusa\close_duplicates_1min.csv
Saved 241400 records to data\ssusa\close_duplicates_5min.csv
Saved 312867 records to data\ssusa\close_duplicates_10min.csv
Saved 353389 records to data\ssusa\close_duplicates_15min.csv
Saved 423405 records to data\ssusa\close_duplicates_30min.csv
Saved 494008 records to data\ssusa\close_duplicates_60min.csv


In [12]:
# Preview the first 50 rows flagged with the 30-minute window
all_close_duplicates[30].iloc[:50]

Unnamed: 0,Year,Project,Camera_Trap_Array,Deployment_ID,Sequence_ID,Start_Time,End_Time,Class,Order,Family,Genus,Species,Common_Name,Age,Sex,Group_Size
977986,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477867,2023-10-15 15:53:51,2023-10-15 15:53:51,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,2
977987,2023,Snapshot USA 2023,CoteauRanch,7CoteauRanch 09/30/2023,8477868,2023-10-15 15:57:15,2023-10-15 15:57:15,mammalia,cetartiodactyla,bovidae,bos,taurus,domestic cattle,unknown,unknown,3
978000,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477881,2023-10-13 19:00:36,2023-10-13 19:00:40,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1
978006,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477882,2023-10-13 19:05:18,2023-10-13 19:05:28,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
978001,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477883,2023-10-13 19:17:59,2023-10-13 19:18:12,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,female,1
978012,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477896,2023-10-13 19:35:24,2023-10-13 19:35:24,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
978005,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477887,2023-10-19 20:32:13,2023-10-19 20:32:53,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,male,1
978008,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477888,2023-10-19 20:35:38,2023-10-19 20:35:40,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
978014,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477892,2023-11-02 06:22:04,2023-11-02 06:23:28,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,3
978010,2023,Snapshot USA 2023,CoteauRanch,8CoteauRanch 09/30/2023,8477894,2023-11-02 06:33:27,2023-11-02 06:33:41,mammalia,cetartiodactyla,cervidae,odocoileus,hemionus,mule deer,unknown,unknown,1
