In [1]:
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split
from PIL import Image

In [2]:
# Resize a partition's images into per-label folders and write a manifest of them
def write_partition_data_file(data, destination_directory, txt_file_path, image_size=(28, 28)):
    """Write resized copies of a partition's images plus a manifest text file.

    For every row of ``data`` the image at ``"input_data/" + sample_path`` is
    opened, resized to ``image_size`` and saved as
    ``<destination_directory>/<label_id>/<file name>``. The path of each saved
    image is then written to ``txt_file_path``, one per line, with a leading
    ``data/`` prefix removed.

    The source images are only read: the resized image is written straight to
    the destination, so the files under ``input_data/`` keep their original
    size and content.

    Args:
        data: DataFrame with ``sample_path`` and ``label_id`` columns.
        destination_directory: Root folder that receives one sub-folder per label.
        txt_file_path: Path of the manifest file to (over)write. Its parent
            directory is created if it does not exist.
        image_size: ``(width, height)`` passed to ``Image.resize``.

    Note:
        Images are stored under their base file name only, so two samples that
        share a file name and a label end up at the same destination path.
    """
    # Create the folder that will hold the manifest, wherever the caller put it.
    manifest_directory = os.path.dirname(txt_file_path)
    if manifest_directory:
        os.makedirs(manifest_directory, exist_ok=True)

    with open(txt_file_path, 'w') as txt_file:
        for sample_path, label_id in zip(data['sample_path'], data['label_id']):
            source_path = "input_data/" + sample_path
            label_directory = os.path.join(destination_directory, str(label_id))

            # Create label directory if it doesn't exist
            os.makedirs(label_directory, exist_ok=True)

            output_path = os.path.join(label_directory, os.path.basename(source_path))

            # The context manager closes the source file handle; saving to the
            # destination leaves the input image untouched.
            with Image.open(source_path) as img:
                img.resize(image_size).save(output_path)

            # Strip only a leading "data/" so that the same substring occurring
            # later in the path is preserved.
            relative_path = output_path
            if relative_path.startswith("data/"):
                relative_path = relative_path[len("data/"):]
            txt_file.write(f"{relative_path}\n")

In [3]:
# Load the sample index CSV into a DataFrame; the cells below read its
# client_id, sample_path and label_id columns.
csv_file_path = 'input_data/femnist.csv'
df = pd.read_csv(csv_file_path)

# Fail fast with a clear message if a column used downstream is absent,
# instead of hitting a KeyError part-way through writing partitions.
_missing_columns = {'client_id', 'sample_path', 'label_id'} - set(df.columns)
if _missing_columns:
    raise ValueError(
        f"{csv_file_path} is missing required columns: {sorted(_missing_columns)}"
    )

In [4]:
# Collect every distinct client id in the dataset (order of first appearance);
# no filtering to a single client happens here.
clients = pd.unique(df['client_id'])

In [5]:
# Root folders that receive the resized images of each split
train_directory = 'data/train'
test_directory = 'data/test'

# Split settings: fraction of each client's samples held out for testing, and
# the seed that makes the shuffle reproducible across runs.
TEST_SIZE = 0.3
RANDOM_STATE = 42

# Group once up front so each client's rows are looked up directly instead of
# re-scanning the whole frame with a boolean mask on every iteration.
samples_by_client = df.groupby('client_id', sort=False)

for client_id in clients:
    client_data = samples_by_client.get_group(client_id)

    # train_test_split cannot build a non-empty train and test set from fewer
    # than two samples; skip such a client rather than abort the whole run.
    if len(client_data) < 2:
        print(f"Skipping client {client_id}: only {len(client_data)} sample(s), cannot split")
        continue

    train_data, test_data = train_test_split(
        client_data, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    train_txt_file_path = f'data/partition_{client_id}_train.txt'
    test_txt_file_path = f'data/partition_{client_id}_test.txt'
    write_partition_data_file(train_data, train_directory, train_txt_file_path)
    write_partition_data_file(test_data, test_directory, test_txt_file_path)