In [2]:
import pandas as pd

# Load the train / test CSVs and report their shapes.
train_df = pd.read_csv('cats-and-dogs-plus-plus/train.csv')
test_df = pd.read_csv('cats-and-dogs-plus-plus/test.csv')

print("Train CSV shape:", train_df.shape)
print("Test CSV shape:", test_df.shape)

# Label columns of train.csv; the code below treats them as 0/1 flags.
category_columns = ['Cat', 'Dog', 'Moris', 'Motya', 'Biatrix']

# Number of positive rows per label column.
category_counts = {label: train_df[label].sum() for label in category_columns}

# Rows where every label column equals 0 are reported under 'Other',
# but only when at least one such row exists.
all_zeros_mask = train_df[category_columns].eq(0).all(axis=1)
n_all_zero = all_zeros_mask.sum()
if n_all_zero > 0:
    category_counts['Other'] = n_all_zero

# Share of the training set per category. A row can have several labels
# set, so the percentages are not guaranteed to add up to 100%.
total_rows = len(train_df)
print("\nCategory distribution:")
for category, count in category_counts.items():
    percentage = count / total_rows * 100
    print(f"{category}: {count} ({percentage:.1f}%)")

print(f"\nTotal samples: {len(train_df)}")

Train CSV shape: (12260, 6)
Test CSV shape: (18390, 1)

Category distribution:
Cat: 5077 (41.4%)
Dog: 5025 (41.0%)
Moris: 56 (0.5%)
Motya: 42 (0.3%)
Biatrix: 41 (0.3%)
Other: 2162 (17.6%)

Total samples: 12260


In [9]:
def get_label_combination(row, columns=None):
    """Describe which label columns are active on a single row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name, e.g. the Series that
        ``DataFrame.apply(..., axis=1)`` passes in, or a plain dict.
        A label is considered active when ``row[col] == 1``.
    columns : iterable of str, optional
        Label columns to inspect. When omitted, the notebook-level
        ``category_columns`` list is used, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    str
        The active label names sorted alphabetically and joined with
        " + ", or "None (Other)" when no label is active.
    """
    if columns is None:
        # Fall back to the notebook global so `apply(get_label_combination, axis=1)`
        # keeps working without passing extra arguments.
        columns = category_columns

    active_labels = sorted(col for col in columns if row[col] == 1)
    if not active_labels:
        return "None (Other)"
    return " + ".join(active_labels)

# Tag every training row with its label combination (stored as a new
# 'label_combination' column on train_df) and count each distinct combination.
train_df['label_combination'] = train_df[category_columns].apply(get_label_combination, axis=1)
combination_counts = train_df['label_combination'].value_counts()

# Size the label column to the longest combination (never narrower than the
# previous fixed width of 25) so that long names such as
# "Biatrix + Cat + Dog + Moris" no longer push the counts out of alignment.
label_width = max(25, max(map(len, combination_counts.index), default=0))

total_rows = len(train_df)
for combination, count in combination_counts.items():
    percentage = (count / total_rows) * 100
    print(f"{combination:<{label_width}} {count:>5} ({percentage:>5.2f}%)")


Cat                        4989 (40.69%)
Dog                        4984 (40.65%)
None (Other)               2162 (17.63%)
Cat + Moris                  42 ( 0.34%)
Biatrix + Dog                37 ( 0.30%)
Cat + Motya                  32 ( 0.26%)
Cat + Moris + Motya          10 ( 0.08%)
Biatrix + Cat + Dog + Moris     4 ( 0.03%)
