In [3]:
import numpy as np
import pandas as pd

# Load the raw training metadata table.
train = pd.read_csv('../data/raw/train.csv')

# Inspect the header exactly as parsed: printing the names as a Python list
# makes any stray whitespace around them visible.
print('Columns in train.csv:', train.columns.tolist(), sep='\n')

# Overall size, followed by a rich preview of the first rows
# (the bare last expression is what Jupyter renders).
print('\nTrain shape:', train.shape)
print('\nFirst few rows:')
train.head()

Columns in train.csv:
['eeg_id    ', ' eeg_sub_id', ' eeg_label_offset_seconds', ' spectrogram_id', ' spectrogram_sub_id', ' spectrogram_label_offset_seconds', ' label_id  ', ' patient_id', ' expert_consensus', ' seizure_vote', ' lpd_vote', ' gpd_vote', ' lrda_vote', ' grda_vote', ' other_vote']

Train shape: (106800, 15)

First few rows:


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [5]:
# Normalise the header: the raw CSV pads column names with spaces, which would
# break the name-based lookups below. Stripping is idempotent, so re-running
# this cell is safe.
train.columns = train.columns.str.strip()

print('Cleaned columns:')
print(train.columns.tolist())

# Keep one row per (eeg_id, vote pattern). An eeg_id can still occur several
# times afterwards whenever its rows carry different vote combinations.
df_uniq = train.drop_duplicates(
    subset=['eeg_id', 'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote'])

print('\nOriginal train shape:', train.shape)
print('After dropping duplicates:', df_uniq.shape)
print(f'Removed {train.shape[0] - df_uniq.shape[0]} duplicate rows')

# Persist the deduplicated table.
# NOTE(review): the DataFrame index (row labels carried over from `train`) is
# written as an unnamed first column; pass index=False if no consumer of this
# file relies on it.
df_uniq.to_csv('../data/raw/train_unique.csv')

# Preview goes last: Jupyter only renders a bare expression when it is the
# final statement of the cell, so placing it before to_csv() showed nothing.
df_uniq.head(5)

Cleaned columns:
['eeg_id', 'eeg_sub_id', 'eeg_label_offset_seconds', 'spectrogram_id', 'spectrogram_sub_id', 'spectrogram_label_offset_seconds', 'label_id', 'patient_id', 'expert_consensus', 'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

Original train shape: (106800, 15)
After dropping duplicates: (20183, 15)
Removed 86617 duplicate rows


In [7]:
# Goal: understand how eeg_id and spectrogram_id relate to each other in the
# training table.
print("=== Investigating EEG vs Spectrogram duplicates ===\n")

# Forward direction: number of distinct spectrogram_ids seen under each eeg_id
grouped = train['spectrogram_id'].groupby(train['eeg_id']).nunique()
print(f"EEG IDs with multiple spectrograms: {(grouped > 1).sum()}")
print(f"Total unique EEG IDs: {grouped.shape[0]}")

# Reverse direction: number of distinct eeg_ids seen under each spectrogram_id
grouped_spec = train['eeg_id'].groupby(train['spectrogram_id']).nunique()
print(f"\nSpectrogram IDs with multiple EEGs: {(grouped_spec > 1).sum()}")
print(f"Total unique Spectrogram IDs: {grouped_spec.shape[0]}")

# Distinct (eeg_id, spectrogram_id) combinations present in the table
print("\n=== Checking (eeg_id, spectrogram_id) pairing ===")
unique_pairs = train.loc[:, ['eeg_id', 'spectrogram_id']].drop_duplicates()
print(f"Unique (eeg_id, spectrogram_id) pairs: {len(unique_pairs)}")
print(f"Unique eeg_ids: {train['eeg_id'].nunique()}")
print(f"Unique spectrogram_ids: {train['spectrogram_id'].nunique()}")

# The mapping is 1:1 only when the pair count equals the distinct count of
# both id columns; any mismatch means one side is shared.
if (len(unique_pairs) != train['eeg_id'].nunique()
        or train['eeg_id'].nunique() != train['spectrogram_id'].nunique()):
    print("\n⚠ EEG and Spectrogram IDs are NOT 1:1 mapped!")
else:
    print("\n✓ EEG and Spectrogram IDs are 1:1 mapped (always paired together)")

=== Investigating EEG vs Spectrogram duplicates ===

EEG IDs with multiple spectrograms: 0
Total unique EEG IDs: 17089

Spectrogram IDs with multiple EEGs: 2380
Total unique Spectrogram IDs: 11138

=== Checking (eeg_id, spectrogram_id) pairing ===
Unique (eeg_id, spectrogram_id) pairs: 17089
Unique eeg_ids: 17089
Unique spectrogram_ids: 11138

⚠ EEG and Spectrogram IDs are NOT 1:1 mapped!


In [8]:
# Follow-up on the mapping check: confirm from the data that every EEG has a
# single spectrogram while a spectrogram may be shared by several EEGs, and
# derive the deduplication conclusion from that result instead of assuming it.

print("=== Analysis of the mapping ===\n")

# Every spectrogram_id listed per eeg_id (repeats included); the number of
# distinct values per EEG is what matters for the conclusion below.
eeg_spec_map = train.groupby('eeg_id')['spectrogram_id'].apply(list)
max_specs_per_eeg = eeg_spec_map.apply(lambda x: len(set(x))).max()
print(f"Max spectrograms per EEG: {max_specs_per_eeg}")

# Distinct eeg_ids per spectrogram_id; keep only spectrograms with more than one
spec_eeg_map = train.groupby('spectrogram_id')['eeg_id'].apply(lambda x: list(set(x)))
specs_with_multiple_eegs = spec_eeg_map[spec_eeg_map.apply(len) > 1]
print(f"Spectrograms shared by multiple EEGs: {len(specs_with_multiple_eegs)}")

# Show an example
if len(specs_with_multiple_eegs) > 0:
    example_spec = specs_with_multiple_eegs.index[0]
    example_eegs = specs_with_multiple_eegs.iloc[0]
    print(f"\nExample: Spectrogram {example_spec} is paired with {len(example_eegs)} different EEGs:")
    # Show at most 5 ids and only add the ellipsis when the list was truncated
    preview_limit = 5
    ellipsis = "..." if len(example_eegs) > preview_limit else ""
    print(f"  EEG IDs: {example_eegs[:preview_limit]}{ellipsis}")

print("\n=== So what does this mean for deduplication? ===")
# The conclusion only holds when no EEG maps to more than one spectrogram, so
# it is gated on the value computed above rather than printed unconditionally.
if max_specs_per_eeg == 1:
    print("Since each EEG has exactly ONE spectrogram paired with it,")
    print("deduplicating by eeg_id automatically handles the spectrogram!")
    print("\n✓ Your current approach (dedup by eeg_id + votes) is CORRECT")
else:
    print("⚠ At least one EEG is paired with more than one spectrogram,")
    print("so deduplicating by eeg_id alone does NOT pin down the spectrogram.")
    print("\nAdd spectrogram_id to the dedup subset before relying on this approach")

=== Analysis of the mapping ===

Max spectrograms per EEG: 1
Spectrograms shared by multiple EEGs: 2380

Example: Spectrogram 1219001 is paired with 3 different EEGs:
  EEG IDs: [2175806584, 2529955608, 1626798710]...

=== So what does this mean for deduplication? ===
Since each EEG has exactly ONE spectrogram paired with it,
deduplicating by eeg_id automatically handles the spectrogram!

✓ Your current approach (dedup by eeg_id + votes) is CORRECT


In [10]:
# Final verification: Check the deduplicated dataset
print("=== Deduplicated Dataset Statistics ===\n")
print(f"Unique patients in df_uniq: {df_uniq['patient_id'].nunique()}")
print(f"Unique EEG IDs in df_uniq: {df_uniq['eeg_id'].nunique()}")
print(f"Unique Spectrogram IDs in df_uniq: {df_uniq['spectrogram_id'].nunique()}")
print(f"Total rows in df_uniq: {len(df_uniq)}")

# df_uniq was built by dropping duplicates on eeg_id + the six vote columns,
# so that combination (not eeg_id alone) is the invariant to verify.
dedup_keys = ['eeg_id', 'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
assert not df_uniq.duplicated(subset=dedup_keys).any(), \
    "df_uniq still contains duplicate (eeg_id, votes) rows"

# An eeg_id MAY repeat here: one row is kept per distinct vote pattern, so
# this count reports how many patterns the most varied EEG has.
eeg_counts = df_uniq['eeg_id'].value_counts()
max_eeg_count = eeg_counts.max()
print(f"\nMax occurrences of any eeg_id: {max_eeg_count}")

if max_eeg_count == 1:
    print("✓ Each EEG appears exactly once - perfect!")
else:
    print(f"⚠ Some EEGs appear multiple times (max: {max_eeg_count})")
    print("This could happen if same EEG has different vote patterns")

# Show the vote combinations of the most frequent repeated eeg_id (if any);
# value_counts() sorts descending, so index[0] is the EEG with the most rows.
dup_eegs = eeg_counts[eeg_counts > 1]
if len(dup_eegs) > 0:
    print(f"\nEEG IDs appearing multiple times: {len(dup_eegs)}")
    example_eeg = dup_eegs.index[0]
    print(f"\nExample - EEG {example_eeg} appears {dup_eegs.iloc[0]} times:")
    display(df_uniq.loc[df_uniq['eeg_id'] == example_eeg, dedup_keys])

=== Deduplicated Dataset Statistics ===

Unique patients in df_uniq: 1950
Unique EEG IDs in df_uniq: 17089
Unique Spectrogram IDs in df_uniq: 11138
Total rows in df_uniq: 20183

Max occurrences of any eeg_id: 43
⚠ Some EEGs appear multiple times (max: 43)
This could happen if same EEG has different vote patterns

EEG IDs appearing multiple times: 1807

Example - EEG 188361788 appears 43 times:


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
77694,188361788,3,1,0,0,0,0
77695,188361788,2,1,0,0,0,0
77696,188361788,0,2,0,0,0,0
77698,188361788,0,2,2,0,0,0
77704,188361788,0,2,4,0,0,0
77707,188361788,1,2,0,0,0,0
77710,188361788,0,12,4,0,0,0
77711,188361788,0,2,1,0,0,0
77712,188361788,0,0,1,0,0,1
77713,188361788,0,1,1,0,0,0


⚠️ However, the data shows something interesting:

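The same eeg_id can have multiple different vote patterns (different expert annotations!)
Example: EEG 188361788 appears 43 times in df_uniq, each time with a different vote combination
This means the separately labelled segments of one EEG recording (rows that differ in eeg_sub_id / eeg_label_offset_seconds) did not all receive the same expert votes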
What this means:
Your deduplication is keeping all unique vote patterns for each EEG, which is actually the right thing to do because: