In [None]:
import pandas as pd
import numpy as np
import random
import os

# **Test Train Dev Split**

In [None]:
# Load your data from an Excel file.
# header=None makes pandas treat every row of the sheet (including the first)
# as data. The path is built with os.path.join so the same relative location
# resolves on Windows and on POSIX systems alike.
input_file = os.path.join('Dataset', 'excels', 'filtered_file.xlsx')  # Replace with your input file name
df = pd.read_excel(input_file, header=None)

# Manually set the expected columns. The assignment fails if the sheet does
# not contain exactly as many columns as are listed here.
expected_columns = ["ID", "Signer", "Sequence", "Phrase", "ID_Duplicate"]
df.columns = expected_columns

# Function to split data based on unique entries
def split_data(df, seed=None):
    """Split rows into dev, test and train sets, grouped by ID prefix.

    The prefix of a row is the part of its ``ID`` value before the first
    ``_``. For every distinct prefix the matching rows are shuffled; the
    first shuffled row goes to the dev set, the second (if there is one) to
    the test set, and all remaining rows to the train set.

    Args:
        df: DataFrame with an ``ID`` column whose values are strings.
            The frame passed in is left unmodified.
        seed: Optional integer. When given, both the prefix order and the
            per-prefix row shuffles are derived from it, so repeated calls
            return identical splits. When ``None`` (the default) the global
            ``random`` state and pandas' default sampling randomness are
            used.

    Returns:
        A tuple ``(dev_df, test_df, train_df)`` of new DataFrames. Each has
        the columns of ``df`` plus the derived ``Prefix`` column.
    """
    # With no seed, keep using the module-level generator so that an earlier
    # random.seed(...) call by the caller still takes effect.
    rng = random if seed is None else random.Random(seed)

    # Work on a copy so the caller's frame does not silently gain a column.
    tagged = df.copy()
    tagged['Prefix'] = tagged['ID'].apply(lambda x: x.split('_', 1)[0])

    # A plain list is the sequence type random.shuffle is specified for.
    unique_prefixes = list(tagged['Prefix'].unique())
    rng.shuffle(unique_prefixes)

    # Group once up front instead of re-filtering the whole frame per prefix.
    groups = dict(iter(tagged.groupby('Prefix', sort=False)))

    dev_set = []
    test_set = []
    train_set = []

    for prefix in unique_prefixes:
        # Derive a per-group sampling seed only when the caller asked for a
        # reproducible split; otherwise let pandas pick its own randomness.
        sample_seed = None if seed is None else rng.randrange(2 ** 32)
        prefix_entries = (
            groups[prefix]
            .sample(frac=1, random_state=sample_seed)  # Shuffle the entries for each prefix
            .values.tolist()
        )

        if prefix_entries:
            dev_set.append(prefix_entries.pop(0))

        if prefix_entries:
            test_set.append(prefix_entries.pop(0))

        train_set.extend(prefix_entries)

    dev_df = pd.DataFrame(dev_set, columns=tagged.columns)
    test_df = pd.DataFrame(test_set, columns=tagged.columns)
    train_df = pd.DataFrame(train_set, columns=tagged.columns)

    return dev_df, test_df, train_df

# Split the data
dev_df, test_df, train_df = split_data(df)

# Save to Excel files. The target folder is assembled with os.path.join so
# the files land in Dataset/excels on every platform instead of depending on
# Windows-style backslash separators.
excel_dir = os.path.join('Dataset', 'excels')
dev_df.to_excel(os.path.join(excel_dir, 'dev.xlsx'), index=False)
test_df.to_excel(os.path.join(excel_dir, 'test.xlsx'), index=False)
train_df.to_excel(os.path.join(excel_dir, 'train.xlsx'), index=False)

print("Files have been created successfully.")

In [None]:
# Locate the folder that contains the Dataset directory. `__file__` only
# exists when this code runs as a script; in a notebook kernel it is
# undefined, so fall back to the current working directory rather than
# failing with NameError.
try:
    base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    base_dir = os.getcwd()

file_path = os.path.join(base_dir, 'Dataset', 'excels', 'train.xlsx')  # Replace with your file path
df = pd.read_excel(file_path)

# Shuffle the rows of the DataFrame
shuffled_df = df.sample(frac=1).reset_index(drop=True)

# Split the DataFrame into 20 roughly equal parts. The row positions are
# split (a plain ndarray) and then used to slice the frame, which yields the
# same chunk sizes as array_split on the frame itself without relying on the
# DataFrame being accepted by numpy's axis-swapping internals.
num_files = 20
row_chunks = np.array_split(np.arange(len(shuffled_df)), num_files)
split_dfs = [shuffled_df.iloc[rows] for rows in row_chunks]

# Get the directory of the input file
output_dir = os.path.dirname(file_path)

# Save each part into a separate Excel file in the same directory as the input file
for i, split_df in enumerate(split_dfs):
    output_path = os.path.join(output_dir, f'split_file_{i+1}.xlsx')
    split_df.to_excel(output_path, index=False)