In [1]:
import os
import pandas as pd
import numpy as np

def _split_one_class(class_df, train_ratio, random_state):
    """Randomly split the rows of a single class into (train, test) frames."""
    # int() truncates, so any remainder row ends up in the test part.
    train_size = int(len(class_df) * train_ratio)
    train_part = class_df.sample(n=train_size, random_state=random_state)
    # Everything not drawn for training becomes test data; this relies on the
    # unique default index that read_csv assigns.
    test_part = class_df.drop(train_part.index)
    return train_part, test_part


def split_tsv_train_test(data_dir, train_dir, test_dir, train_ratio=0.8, random_state=None):
    """
    Splits TSV files in a directory into training and test sets, maintaining class proportions.

    Every regular file directly inside ``data_dir`` is read as a header-less,
    tab-separated file with two columns (sentiment, text). Rows with sentiment 1
    and rows with sentiment 0 are split separately with the same ratio, and the
    results are written to ``<train_dir>/<filename>.task.train`` and
    ``<test_dir>/<filename>.task.test`` (tab-separated, no header, no index).
    In each output file the positive rows come first, followed by the negative
    rows. Rows whose sentiment is neither 0 nor 1 are left out of both outputs;
    a warning with their count is printed.

    Files that are empty or fail to process are reported on stdout and skipped,
    so one bad file does not stop the remaining files from being split.

    Args:
        data_dir: Path to the directory containing the original TSV files.
        train_dir: Path to the directory to store the training TSV files.
        test_dir: Path to the directory to store the test TSV files.
        train_ratio: The proportion of data to use for training (default: 0.8).
        random_state: Seed (or random state object) forwarded to
            ``DataFrame.sample``. With an integer seed the split of each file
            is reproducible; the default ``None`` gives a different random
            split on every call.

    Raises:
        ValueError: If ``train_ratio`` is not between 0 and 1 (inclusive).
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    os.makedirs(train_dir, exist_ok=True)  # Create directories if they don't exist
    os.makedirs(test_dir, exist_ok=True)

    for filename in os.listdir(data_dir):
        filepath = os.path.join(data_dir, filename)
        if not os.path.isfile(filepath):  # only files, not subdirectories
            continue

        try:
            df = pd.read_csv(filepath, sep='\t', header=None, names=['sentiment', 'text'])

            # Separate positive and negative examples:
            df_pos = df[df['sentiment'] == 1]
            df_neg = df[df['sentiment'] == 0]

            # Make the loss of rows with any other label visible instead of silent.
            ignored = len(df) - len(df_pos) - len(df_neg)
            if ignored:
                print(f"Warning: ignoring {ignored} row(s) in {filename} whose sentiment is not 0 or 1.")

            # Split each class on its own so both sets keep the class distribution:
            train_pos, test_pos = _split_one_class(df_pos, train_ratio, random_state)
            train_neg, test_neg = _split_one_class(df_neg, train_ratio, random_state)

            # Concatenate positive and negative examples for training and test:
            df_train = pd.concat([train_pos, train_neg])
            df_test = pd.concat([test_pos, test_neg])

            # Save the training and test sets to TSV files:
            train_filepath = os.path.join(train_dir, filename + ".task.train")
            test_filepath = os.path.join(test_dir, filename + ".task.test")
            df_train.to_csv(train_filepath, sep='\t', header=False, index=False)
            df_test.to_csv(test_filepath, sep='\t', header=False, index=False)
            print(f"Split {filename} into training ({train_filepath}) and test ({test_filepath}) sets.")

        except pd.errors.EmptyDataError:  # Handle empty files
            print(f"Skipping empty file: {filepath}")
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and move on.
            print(f"An error occurred processing {filename}: {e}")




# Example usage: split every corpus found in the preprocessed-data directory.
# Training and test files are written to the same output directory.
data_dir = "/home/shadi2/phd/code/anbera-univ/large-arabic-sentiment-analysis-resouces/preprocess_urls_hashtags"
train_dir = test_dir = "../data/arabic/all"

split_tsv_train_test(data_dir, train_dir, test_dir)

Split tead into training (../data/arabic/all/tead.task.train) and test (../data/arabic/all/tead.task.test) sets.
Split tsac into training (../data/arabic/all/tsac.task.train) and test (../data/arabic/all/tsac.task.test) sets.
Split att into training (../data/arabic/all/att.task.train) and test (../data/arabic/all/att.task.test) sets.
Split labr into training (../data/arabic/all/labr.task.train) and test (../data/arabic/all/labr.task.test) sets.
Split res1 into training (../data/arabic/all/res1.task.train) and test (../data/arabic/all/res1.task.test) sets.
Split prod into training (../data/arabic/all/prod.task.train) and test (../data/arabic/all/prod.task.test) sets.
Split hard into training (../data/arabic/all/hard.task.train) and test (../data/arabic/all/hard.task.test) sets.
Split astd into training (../data/arabic/all/astd.task.train) and test (../data/arabic/all/astd.task.test) sets.
Split arsas into training (../data/arabic/all/arsas.task.train) and test (../data/arabic/all/arsas.