In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
import ipywidgets as widgets
from IPython.display import display, clear_output
from ipywidgets import Dropdown, interactive, widgets, Output
from scipy.interpolate import make_interp_spline

In [95]:
# Column names assigned to the CSV, which is read without a header row.
headers = [
    'frame', 'classid', 'id',
    'x1', 'y1', 'width', 'height',
    'a', 'b', 'c', 'd',
]

# Path of the annotation CSV to analyse.
csv_file_path = 'D:/MELA/ANNOTATION ANALYSIS/Edited_Edited_Edited_20230309_SM_Lek1_P2D4_DJI_0771.CSV'

# header=None treats the first line as data; names= supplies the column labels.
df_in = pd.read_csv(csv_file_path, names=headers, header=None)

# Print the (truncated) text representation of the loaded table.
print(df_in)

        frame  classid  id    x1    y1  width  height  a  b  c  d
0        1200        0   0   371   580     51      52 -1 -1 -1 -1
1        1200        0  30  2846  1763     53      48 -1 -1 -1 -1
2        1200        1   2  2690  1725     63      36 -1 -1 -1 -1
3        1200        1   3  3417  1968     50      51 -1 -1 -1 -1
4        1200        1   4  3571  2023     39      51 -1 -1 -1 -1
...       ...      ...  ..   ...   ...    ...     ... .. .. .. ..
233194   5127        0  28  5116  2194     40      34 -1 -1 -1 -1
233195   5127        0  53  2812   244     27      29 -1 -1 -1 -1
233196   5127        0   9  4353   689     37      40 -1 -1 -1 -1
233197   5127        0  30  2895  1679     31      22 -1 -1 -1 -1
233198   5127        1  29  3182   929     35      24 -1 -1 -1 -1

[233199 rows x 11 columns]


In [96]:
# Bounding-box area: element-wise product of the width and height columns.
df_in['area'] = df_in['width'].mul(df_in['height'])

In [97]:
# Keep the first row of every distinct (classid, id) pair, then narrow the
# result to just those two columns.
unique_individuals = df_in.drop_duplicates(subset=['classid', 'id'])[['classid', 'id']]

# Row counts: one row per bounding box, one row per distinct pair.
total_boxes = df_in.shape[0]
total_individuals = unique_individuals.shape[0]

# Report both totals.
print("Total Number of Unique Individuals (Male and Female):", total_individuals)
print("Total Number of bounding boxes:", total_boxes)

Total Number of Unique Individuals (Male and Female): 87
Total Number of bounding boxes: 233199


In [98]:
#Finding classid error

# Add (or reset) the 'classid_error' flag column with 0 in every row.
df_in = df_in.assign(classid_error=0)

# Row-level rule that derives the 'classid_error' flag from 'classid'.
def update_classid_error(row):
    """Return the 'classid_error' flag for a single row.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['classid']`` and ``row['classid_error']``.

    Returns
    -------
    1 when ``row['classid']`` equals -1; otherwise the row's existing
    ``'classid_error'`` value, unchanged.
    """
    return 1 if row['classid'] == -1 else row['classid_error']

# Set 'classid_error' to 1 wherever 'classid' equals -1; every other row keeps
# the value it already holds. A boolean mask gives the same result as calling
# update_classid_error on each row, without a Python-level call per row.
df_in.loc[df_in['classid'] == -1, 'classid_error'] = 1

# Frame-level flag, initialised to 0 for every row.
df_in['classid_error_frame'] = 0

# Frame numbers that contain at least one row with classid_error == 1.
frames_with_classid_error = df_in.loc[df_in['classid_error'] == 1, 'frame'].unique()

# Mark every row belonging to one of those frames.
df_in.loc[df_in['frame'].isin(frames_with_classid_error), 'classid_error_frame'] = 1

# Print frames with classid errors
print("Frames with classid errors:", frames_with_classid_error)



Frames with classid errors: []


In [39]:
#Cleaning classid error
# Closeness limits used when matching a flagged detection against the previous
# frame: 'threshold' bounds |x1 difference| and |y1 difference|, and
# 'area_threshold' bounds |area difference|.
threshold = 20  # Adjust this value as needed
area_threshold = 500

# Blank the 'classid' and 'id' entries of every flagged row in the frames with
# classid errors. The guard leaves the columns completely untouched when
# nothing is flagged.
rows_to_clear = df_in['frame'].isin(frames_with_classid_error) & (df_in['classid_error'] == 1)
if rows_to_clear.any():
    df_in.loc[rows_to_clear, ['classid', 'id']] = None

# Refill the blanked entries from the previous frame.
for frame in frames_with_classid_error:
    # Flagged rows of the current frame.
    error_frame_rows = df_in[(df_in['frame'] == frame) & (df_in['classid_error'] == 1)]

    # Only previous-frame rows that still carry both labels may donate them;
    # otherwise a blank row could be copied onto another blank row.
    previous_frame = df_in[df_in['frame'] == frame - 1].dropna(subset=['classid', 'id'])
    if previous_frame.empty:
        continue

    for index, row in error_frame_rows.iterrows():
        # Absolute differences to every candidate in the previous frame.
        dx = (previous_frame['x1'] - row['x1']).abs()
        dy = (previous_frame['y1'] - row['y1']).abs()
        area_gap = (previous_frame['area'] - row['area']).abs()

        is_close = (dx < threshold) & (dy < threshold) & (area_gap < area_threshold)
        if not is_close.any():
            # No candidate within the limits: leave the entries blank.
            continue

        # When several candidates qualify, take the one with the smallest
        # |dx| + |dy| instead of whichever happens to be listed last.
        best_match = previous_frame.loc[(dx + dy)[is_close].idxmin()]
        df_in.at[index, 'classid'] = best_match['classid']
        df_in.at[index, 'id'] = best_match['id']


In [99]:
#Finding duplicate frames

# Within each (frame, classid) group, mark with 1 every row whose 'id' occurs
# more than once in that group (keep=False flags all occurrences).
df_in['duplicates'] = df_in.groupby(['frame', 'classid'])['id'].transform(lambda x: x.duplicated(keep=False).astype(int))

# Find unique frames with duplicates
frames_with_duplicates = df_in.loc[df_in['duplicates'] == 1, 'frame'].unique()

# Flag every row that belongs to one of those frames. isin performs the
# membership test in one vectorised pass (the same approach used for
# 'classid_error_frame') instead of scanning the array once per row.
df_in['duplicate_frame'] = df_in['frame'].isin(frames_with_duplicates).astype('int64')

# Print frames with duplicates
print("Frames with Duplicates:", frames_with_duplicates)

# Display the updated DataFrame
#print(df_in)

Frames with Duplicates: []


In [41]:
#remove duplicates based on the area difference

# Step 1: frames that contain at least one row flagged as a duplicate.
frames_with_duplicates = df_in.loc[df_in['duplicates'] == 1, 'frame'].unique()

# Step 2: the frames to process. Kept under its original name; it holds the
# same frame numbers as frames_with_duplicates.
unique_frames = list(frames_with_duplicates)

# Step 3: for every duplicated (classid, id) pair in a frame keep exactly one
# row and drop the others.
for frame in unique_frames:
    previous_rows = df_in[df_in['frame'] == frame - 1]
    flagged_rows = df_in[(df_in['frame'] == frame) & (df_in['duplicates'] == 1)]

    for (classid, individual_id), duplicate_rows in flagged_rows.groupby(['classid', 'id']):
        if len(duplicate_rows) < 2:
            continue

        # Reference entry in the previous frame. Both 'classid' and 'id' must
        # match, because an individual is identified by the pair.
        reference = previous_rows[
            (previous_rows['classid'] == classid) & (previous_rows['id'] == individual_id)
        ]

        if reference.empty:
            # Nothing to compare against: keep the largest box so that the
            # individual is not removed from the frame altogether.
            keep_index = duplicate_rows['area'].idxmax()
        else:
            # Keep the row whose area is closest to the previous frame's
            # area (the first such row on ties).
            reference_area = reference['area'].iloc[0]
            keep_index = (duplicate_rows['area'] - reference_area).abs().idxmin()

        # Drop every duplicate except the one selected above.
        df_in.drop(duplicate_rows.index.drop(keep_index), inplace=True)





In [101]:
# Order rows by frame number. mergesort is a stable sort, so rows that share a
# frame keep their current relative order.
df_in = df_in.sort_values(by='frame', kind='mergesort')

# File name of the input CSV without its directory and without its final
# extension; dots elsewhere in the name are preserved.
file_name_without_extension = os.path.splitext(os.path.basename(csv_file_path))[0]

In [102]:
# SAVING EDITED FILES AS CSV
# Helper columns created during the analysis that are removed before saving.
columns_to_drop = ['duplicates', 'duplicate_frame', 'classid_error', 'classid_error_frame','area']


# Drop the specified columns. errors='ignore' skips any that are not present,
# so running this cell again (or without one of the earlier steps) does not
# raise a KeyError.
df_in = df_in.drop(columns=columns_to_drop, errors='ignore')


# Save DataFrame to CSV without headers
df_in.to_csv(f'Edited_{file_name_without_extension}.csv', index=False, header=False)
