In [None]:
import os
import pandas as pd

# --- Locations -------------------------------------------------------------
# Source directory holding the original datasets, and the destination
# directory that receives the down-sampled copies.
data_dir = "../data/mtl-dataset"
small_data_dir = "../data/small-data"

# --- File-name suffixes ----------------------------------------------------
# A file is treated as a train or test split according to how its name ends.
train_suffix, test_suffix = ".task.train", ".task.test"

# --- Sample sizes ----------------------------------------------------------
# Number of rows sampled from each train file and each test file.
train_sample_size, test_sample_size = 100, 20

# Make sure the destination exists; an already existing directory is fine.
os.makedirs(small_data_dir, exist_ok=True)


# Build a down-sampled copy of every train/test file found in ``data_dir``.
#
# Each matching file is read as a headerless, tab-separated table, a fixed-seed
# random sample of its rows is taken, and the sample is written under the same
# file name into ``small_data_dir`` (tab-separated, no header, no index).
# Failures on one file are reported and do not stop the remaining files.
#
# os.listdir returns entries in arbitrary order; sorting keeps the progress
# log stable from run to run.
for filename in sorted(os.listdir(data_dir)):
    filepath = os.path.join(data_dir, filename)
    if not os.path.isfile(filepath):  # Skip anything that is not a regular file (e.g., subdirectory)
        continue

    # Choose the sample size and the word used in the progress message from
    # the file-name suffix; files matching neither suffix are ignored.
    if filename.endswith(train_suffix):
        sample_size, label = train_sample_size, "training"
    elif filename.endswith(test_suffix):
        sample_size, label = test_sample_size, "test"
    else:
        continue

    small_filepath = os.path.join(small_data_dir, filename)
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, encoding='utf-8')  # Read TSV, no header
        # DataFrame.sample raises ValueError when n exceeds the number of rows
        # (sampling without replacement). Cap n so that a file smaller than the
        # requested size is kept in full instead of being dropped with an error.
        n_rows = min(sample_size, len(df))
        small_df = df.sample(n=n_rows, random_state=42)  # Fixed seed -> reproducible sample
        small_df.to_csv(small_filepath, sep='\t', header=False, index=False)  # Save as TSV, no header/index
    except pd.errors.EmptyDataError:  # Raised by read_csv when the file has no data to parse
        print(f"Skipping empty file: {filepath}")
    except Exception as e:
        # Deliberate best-effort: report the problem and move on to the next file.
        print(f"An error occurred processing {filename}: {e}")
    else:
        print(f"Created small {label} dataset: {small_filepath}")

Created small test dataset: ../data/small-data/music.task.test
Created small test dataset: ../data/small-data/health_personal_care.task.test
Created small test dataset: ../data/small-data/apparel.task.test
Created small test dataset: ../data/small-data/imdb.task.test
Created small test dataset: ../data/small-data/camera_photo.task.test
Created small training dataset: ../data/small-data/toys_games.task.train
Created small test dataset: ../data/small-data/video.task.test
Created small training dataset: ../data/small-data/kitchen_housewares.task.train
Created small test dataset: ../data/small-data/books.task.test
Created small training dataset: ../data/small-data/baby.task.train
Created small test dataset: ../data/small-data/electronics.task.test
Created small training dataset: ../data/small-data/magazines.task.train
Created small training dataset: ../data/small-data/video.task.train
Created small test dataset: ../data/small-data/dvd.task.test
Created small test dataset: ../data/small-dat