In [7]:
import os

import numpy as np
import pandas as pd

# Path (relative to the notebook) of the raw coordinates CSV to analyse.
filepath = '../coords/old10s-60f.csv'

# Read the whole table into memory; the following cells read and extend `df`.
df = pd.read_csv(filepath_or_buffer=filepath)

# Display the column labels as the cell output.
df.columns

Index(['frame_index', 'ball', 'gk0', 'gk1', 'id_1', 'id_2', 'id_3', 'id_4',
       'id_5', 'id_6', 'id_7', 'id_8', 'id_9', 'id_10', 'id_11', 'id_12',
       'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_22',
       'id_23', 'id_24', 'id_28', 'id_29', 'id_30', 'id_34', 'id_36', 'id_39',
       'id_43', 'id_44', 'id_53', 'id_61', 'id_82', 'id_83', 'id_99', 'id_105',
       'id_117', 'id_137', 'id_146', 'id_147', 'id_149', 'id_151', 'id_157',
       'id_162', 'id_163', 'id_164', 'id_165', 'id_168', 'id_169', 'id_170',
       'id_171', 'id_173', 'id_174', 'id_182', 'id_183', 'id_185', 'id_186',
       'id_188', 'id_193', 'id_196', 'id_199', 'id_200', 'id_201', 'id_204',
       'id_205'],
      dtype='object')

Create good sequences of frames

In [9]:
# --- Parameters ---------------------------------------------------------------
# A frame is "good" when it has strictly MORE than this many detections.
DETECTION_THRESHOLD = 18
# Helper column holding the per-frame detection count (also written to each subset).
COUNT_COL = 'detection_count'
# Folder that receives one CSV per run of consecutive good frames.
SUBSET_DIR = 'subsets'

# Detection columns: everything after the frame index (assumed to be the first
# column), minus the helper count column. Without that exclusion, re-running
# this cell would count the already-present COUNT_COL as one extra detection in
# every row and silently loosen the threshold.
data_cols = df.columns[1:].drop(COUNT_COL, errors='ignore')

# Count non-empty (non-NaN) cells per row
df[COUNT_COL] = df[data_cols].notnull().sum(axis=1)

# Boolean flag per row: True where the frame has enough detections
good_mask = df[COUNT_COL] > DETECTION_THRESHOLD

# Find all consecutive sequences of good frames (no min length).
# Padding with False on both sides turns every run into exactly one rising and
# one falling edge, so the edge positions alternate start, end, start, end, ...
# Each (start, end) pair is a half-open positional range usable with iloc.
padded = np.concatenate(([False], good_mask.to_numpy(dtype=bool), [False]))
edges = np.flatnonzero(padded[1:] != padded[:-1]).tolist()
sequences = list(zip(edges[::2], edges[1::2]))

# Save each sequence as a CSV (subset1.csv, subset2.csv, ... in row order).
# Files left over from an earlier run that produced more sequences are not removed.
os.makedirs(SUBSET_DIR, exist_ok=True)
for i, (start, end) in enumerate(sequences):
    subset = df.iloc[start:end]
    subset.to_csv(f'{SUBSET_DIR}/subset{i+1}.csv', index=False)

Work on best subsets

In [11]:
# File to clean. It is read, cleaned, and then overwritten in place below.
subset = 'subsets/subset1.csv'

df = pd.read_csv(subset)

# Drop columns that are empty (all-NaN) within this subset
df = df.dropna(axis=1, how='all')

# Drop the helper column added when the subsets were created.
# errors='ignore' keeps this cell re-runnable: the file is overwritten below
# without 'detection_count', so a second run would otherwise raise KeyError.
df = df.drop(columns=['detection_count'], errors='ignore')

# Save the cleaned subset over the original file
df.to_csv(subset, index=False)

# Display the remaining column labels as the cell output
df.columns

Index(['frame_index', 'ball', 'gk0', 'gk1', 'id_1', 'id_2', 'id_3', 'id_4',
       'id_5', 'id_6', 'id_7', 'id_8', 'id_9', 'id_10', 'id_11', 'id_12',
       'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_22',
       'id_23', 'id_24', 'id_28', 'id_29', 'id_30', 'id_34', 'id_36', 'id_39',
       'id_43', 'id_44', 'id_53'],
      dtype='object')

Manually detect outliers and transition of ids per frame

In [None]:
# Subset 2 - 20 frames: manual identity clean-up of `df`.
# NOTE(review): the cell that loads `df` reads 'subsets/subset1.csv', while this
# label and the output file name refer to subset 2 -- confirm the right file is loaded.
# Shorthand used below: "lt fN" presumably means the track id is lost at frame N,
# and "-> idM" that the same player continues under idM -- TODO confirm.
# The `df.loc[k:, ...]` slices select by row label k (start inclusive), not by the
# `frame_index` column; this equals the frame number only if the two coincide -- verify.
# ids: 1,3,4,5,9,10,12,13,14,15,16,19 are fine
# id6 lt f16 - no change
# id22 lt f17 - no change

# id24 is the referee -> delete column
df = df.drop(columns=['id_24'])
# id23 is the goalkeeper -> drop the existing gk0 column, then rename id_23 to 'gk0'
# (the drop must come first so the rename does not create a duplicate 'gk0' label)
df = df.drop(columns=['gk0'])
df = df.rename(columns={'id_23': 'gk0'})
# id11 is the referee -> delete column
df = df.drop(columns=['id_11'])

# id2 lt f14 -> id43
# From row 14 onward id_2 takes id_43's values; the donor column is then removed.
df.loc[14:, 'id_2'] = df.loc[14:, 'id_43']
df = df.drop(columns=['id_43'])

# id7 lt f3-6 -> id29
# NOTE(review): the copy starts at row 7 (one past the noted range), whereas the
# id17/id18 fixes below start at the last number of their noted range -- confirm.
df.loc[7:, 'id_7'] = df.loc[7:, 'id_29']
df = df.drop(columns=['id_29'])

# id8 lt f9 -> id34
df.loc[9:, 'id_8'] = df.loc[9:, 'id_34']
df = df.drop(columns=['id_34'])

# id17 lt f5-6 -> id30 lt f8-12 -> id 39
# Two-step hand-over: rows 6+ come from id_30, then rows 12+ are overwritten from id_39.
df.loc[6:, 'id_17'] = df.loc[6:, 'id_30']
df.loc[12:, 'id_17'] = df.loc[12:, 'id_39']
df = df.drop(columns=['id_30', 'id_39'])

# id18 1stframe lt f1 -> id18 lt f9 -> id36 lt f12-15 -> id44
# Two-step hand-over: rows 9+ come from id_36, then rows 15+ are overwritten from id_44.
df.loc[9:, 'id_18'] = df.loc[9:, 'id_36']
df.loc[15:, 'id_18'] = df.loc[15:, 'id_44']
df = df.drop(columns=['id_36', 'id_44'])

# id18 2ndframe lt f2-5 -> id28
# Moves id_18's value into id_28 on rows where frame_index == 2, then blanks id_18 there.
# NOTE(review): this selects by the `frame_index` value (unlike the row-label slices
# above) and only touches frame 2 although the note mentions f2-5 -- confirm intent.
df.loc[df['frame_index'] == 2, 'id_28'] = df.loc[df['frame_index'] == 2, 'id_18']
df.loc[df['frame_index'] == 2, 'id_18'] = np.nan


In [None]:
# Write the manually corrected table to disk; index=False leaves the row index
# out of the file.
cleaned_path = 'subsets/subset2_cleaned.csv'
df.to_csv(cleaned_path, index=False)