In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# Expand the dataset with synthetic subsets. Each new subset is built by
# pooling the rows of two randomly chosen existing subsets, shuffling the
# pool, and keeping `subset_size` rows of it.

# Load the original dataset
original_data_path = '2-27_synds_100_sub_60_qce.csv'
data = pd.read_csv(original_data_path)

# Group by 'subset' to handle each subset separately
grouped = data.groupby('subset')

# Generation parameters
num_new_subsets = 900  # number of new subsets to create
subset_size = 26       # rows every subset must have in the final dataset
existing_subsets = data['subset'].unique()
num_existing_subsets = len(existing_subsets)

# List to collect new generated data
new_data = []

for i in range(1, num_new_subsets + 1):
    # Choose two distinct existing subsets to mix
    selected_subsets = np.random.choice(existing_subsets, size=2, replace=False)
    mixed_subset = pd.concat([grouped.get_group(x) for x in selected_subsets])

    # Shuffle the pooled rows once, then keep only `subset_size` of them.
    # Without the trim the pool holds the rows of BOTH source subsets, and
    # the positional renumbering below would split it into extra subsets.
    # Slicing past the end is harmless, so smaller pools are kept whole.
    mixed_subset = shuffle(mixed_subset).iloc[:subset_size].reset_index(drop=True)

    # Provisional subset number (overwritten by the renumbering below)
    mixed_subset['subset'] = i + num_existing_subsets

    # Append to the new data list
    new_data.append(mixed_subset)

# Concatenate all new data
expanded_data = pd.concat(new_data, ignore_index=True)

# Concatenate the original data with the new expanded data
final_data = pd.concat([data, expanded_data], ignore_index=True)

# Renumber subset IDs consecutively from 1, one ID per `subset_size` rows.
# NOTE(review): this is purely positional -- it assumes the original rows are
# stored in contiguous blocks of `subset_size` rows per subset; confirm
# against the input file.
final_data['subset'] = (final_data.index // subset_size) + 1

# Check for correctness BEFORE saving: the expected number of subsets, each
# with exactly `subset_size` rows.
subset_counts = final_data['subset'].value_counts()
check_correctness = subset_counts.unique()
expected_subsets = len(data) // subset_size + num_new_subsets
if len(subset_counts) != expected_subsets or list(check_correctness) != [subset_size]:
    raise ValueError(
        f"Expanded dataset is malformed: expected {expected_subsets} subsets "
        f"of {subset_size} rows, got {len(subset_counts)} subsets with row "
        f"counts {sorted(check_correctness)}"
    )

# Save the new expanded dataset
expanded_file_path = 'expanded_2-27_synds_100_sub_60_qce.csv'
final_data.to_csv(expanded_file_path, index=False)

expanded_file_path, check_correctness


('expanded_2-27_synds_100_sub_60_qce.csv', array([26]))