In [1]:
import pandas as pd
import numpy as np

# Parameters
input_csv = "train_keypoints.csv"  # Input CSV file
label_column_index = 0  # Index of the label column

# Read the dataset without header
df = pd.read_csv(input_csv, header=None)

# Use the first column as the label
label_col = df.columns[label_column_index]

# Count the samples in each class
group_counts = df[label_col].value_counts()
min_count = group_counts.min()
max_count = group_counts.max()
mean_count = int(round(group_counts.mean()))

print("Class counts:")
print(group_counts)
print(f"Minimum: {min_count}, Maximum: {max_count}, Mean: {mean_count}")

# 1. Undersampling
undersampled = (
    df.groupby(label_col, group_keys=False)
    .apply(lambda x: x.sample(n=min_count, random_state=42))
    .reset_index(drop=True)
)
undersampled.to_csv("normalized_cut_names.csv", index=False, header=False)
print("Saved undersampled data to 'normalized_cut.csv'")

# 2. Oversampling
oversampled = (
    df.groupby(label_col, group_keys=False)
    .apply(lambda x: x.sample(n=max_count, replace=True, random_state=42))
    .reset_index(drop=True)
)
oversampled.to_csv("normalized_upscale_names.csv", index=False, header=False)
print("Saved oversampled data to 'normalized_upscale.csv'")

# 3. Mean sampling (middle ground)
mean_sampled = (
    df.groupby(label_col, group_keys=False)
    .apply(lambda x: x.sample(n=mean_count, replace=(len(x) < mean_count), random_state=42))
    .reset_index(drop=True)
)
mean_sampled.to_csv("normalized_mean_names.csv", index=False, header=False)
print("Saved mean-normalized data to 'normalized_mean.csv'")


Class counts:
0
2     425
0     302
4     268
3     249
6     231
11    225
7     178
12    174
1     167
10    160
8     134
5     111
9      88
Name: count, dtype: int64
Minimum: 88, Maximum: 425, Mean: 209
Saved undersampled data to 'normalized_cut.csv'


  .apply(lambda x: x.sample(n=min_count, random_state=42))
  .apply(lambda x: x.sample(n=max_count, replace=True, random_state=42))


Saved oversampled data to 'normalized_upscale.csv'
Saved mean-normalized data to 'normalized_mean.csv'


  .apply(lambda x: x.sample(n=mean_count, replace=(len(x) < mean_count), random_state=42))
