In [3]:
import pandas as pd
import numpy as np

# Build a class-balanced label table by down-sampling, then save it as CSV.
#
# For each of the two label columns the rows that do not belong to the rarest
# class are randomly down-sampled to the size of the rarest class. The two
# balanced subsets are then inner-joined on 'Filename', shuffled and written
# to disk.

# Configuration: tune these instead of editing the pipeline below.
LABELS_CSV = '/home/sandyap/DM project/labels.csv'
BALANCED_CSV = 'balanced_labels.csv'
SEED = 42  # fixes every random draw so Restart & Run All reproduces the file

# Read the labels.csv file
labels_df = pd.read_csv(LABELS_CSV)

# Separate the data by feature
facial_features_df = labels_df[['Filename', 'Recognizable-Facial-Feature']]
brain_voxels_df = labels_df[['Filename', 'Brain-Feature-Loss']]

# Get the number of samples for each class in each feature
# (value_counts() ignores missing labels by default).
facial_features_counts = facial_features_df['Recognizable-Facial-Feature'].value_counts()
brain_voxels_counts = brain_voxels_df['Brain-Feature-Loss'].value_counts()

# Determine which class has fewer samples for each feature
facial_features_minority_class = facial_features_counts.idxmin()
brain_voxels_minority_class = brain_voxels_counts.idxmin()

# Determine the number of samples in the minority class for each feature
facial_features_min_count = facial_features_counts.min()
brain_voxels_min_count = brain_voxels_counts.min()

# Sample the larger group to get the same number of samples as the smaller group for each feature.
# Rows with a missing label are excluded from the larger group: they were not
# counted above, and `!=` alone would let them be sampled into the result.
facial_features_label = facial_features_df['Recognizable-Facial-Feature']
facial_features_majority_class_df = facial_features_df[
    facial_features_label.notna() & (facial_features_label != facial_features_minority_class)
]
facial_features_balanced_df = pd.concat(
    [
        facial_features_df[facial_features_label == facial_features_minority_class],
        facial_features_majority_class_df.sample(n=facial_features_min_count, random_state=SEED),
    ],
    axis=0,
)

brain_voxels_label = brain_voxels_df['Brain-Feature-Loss']
brain_voxels_majority_class_df = brain_voxels_df[
    brain_voxels_label.notna() & (brain_voxels_label != brain_voxels_minority_class)
]
brain_voxels_balanced_df = pd.concat(
    [
        brain_voxels_df[brain_voxels_label == brain_voxels_minority_class],
        brain_voxels_majority_class_df.sample(n=brain_voxels_min_count, random_state=SEED),
    ],
    axis=0,
)

# Merge the two balanced features and shuffle the data
# NOTE(review): the inner join keeps only files selected in BOTH balanced
# subsets, so the merged table is not guaranteed to stay balanced on either
# label - confirm this is the intended behaviour.
balanced_df = pd.merge(facial_features_balanced_df, brain_voxels_balanced_df, on='Filename', how='inner')
balanced_df = balanced_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

balanced_df.to_csv(BALANCED_CSV, index=False)