In [None]:
import pandas as pd
import os

def filter_and_recount_sentences(
    input_tsv_path: str,
    output_tsv_path: str,
    min_words: int = 6,
    max_words: int = 55,
):
    """
    Filters a TSV file based on the word count of its first column.

    The file is read with pandas' default header handling, so its first line
    is treated as column names. Rows whose first-column value has fewer than
    ``min_words`` or more than ``max_words`` whitespace-separated words are
    removed; every other row is written unchanged (all original columns
    kept) to ``output_tsv_path``. Missing values count as zero words.

    Errors are reported on stdout rather than raised, so a caller that is
    looping over many files can keep going after a bad one.

    Args:
        input_tsv_path (str): The path to the input TSV file.
        output_tsv_path (str): The path where the filtered TSV file will be saved.
        min_words (int): Smallest word count (inclusive) a row may have to be kept.
        max_words (int): Largest word count (inclusive) a row may have to be kept.
    """
    try:
        # Read the input TSV file.
        df = pd.read_csv(input_tsv_path, sep='\t')

        # Check if the DataFrame has any columns.
        if df.columns.empty:
            print(f"Warning: The file {input_tsv_path} is empty or has no columns. Skipping.")
            return

        # The first column is always the one whose words are counted.
        column_to_filter = df.columns[0]
        column = df[column_to_filter]

        # Word counts live in a standalone Series instead of a temporary
        # DataFrame column, so an input column that happens to share the
        # temporary name can never be overwritten or dropped from the output.
        word_counts = column.astype(str).str.split().str.len()
        # astype(str) may render a missing value as the literal word "nan";
        # force those rows to zero words so they are never counted as text.
        word_counts = word_counts.where(column.notna(), 0)

        # Keep only the rows whose word count lies inside the inclusive range.
        keep_mask = (word_counts >= min_words) & (word_counts <= max_words)
        initial_rows = len(df)
        filtered_df = df[keep_mask]
        final_rows = len(filtered_df)

        # Save the filtered rows with exactly the columns that were read.
        filtered_df.to_csv(output_tsv_path, sep='\t', index=False)

        print(f"Filtering complete based on the first column: '{column_to_filter}'. ✅")
        print(f"Initial number of sentences: {initial_rows}")
        print(f"Number of sentences removed: {initial_rows - final_rows}")
        print(f"Final number of sentences: {final_rows}")
        print(f"Filtered data saved to: {output_tsv_path}")

    except FileNotFoundError:
        print(f"Error: The file at {input_tsv_path} was not found.")
    except pd.errors.EmptyDataError:
        # read_csv raises this for a file with no content at all, so the
        # "empty file" case has to be reported from here.
        print(f"Warning: The file {input_tsv_path} is empty or has no columns. Skipping.")
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and let the
        # caller continue with the next file.
        print(f"An unexpected error occurred while processing {input_tsv_path}: {e}")


def process_directory(input_dir: str, output_dir: str):
    """
    Recursively processes every file in a directory structure.

    The directory tree under ``input_dir`` is recreated under ``output_dir``
    and each file found is passed to filter_and_recount_sentences, which
    writes the filtered copy to the matching location. Files are not
    selected by extension: anything that cannot be handled as a TSV is
    reported by filter_and_recount_sentences and skipped.
    The filtering is always based on the word count of the first column in each file.

    If ``output_dir`` is located inside ``input_dir`` it is excluded from the
    traversal, so results written by this (or an earlier) run are not
    processed again.

    Args:
        input_dir (str): The path to the parent input directory.
        output_dir (str): The path to the parent output directory where the
                          filtered structure will be saved.
    """
    print(f"Starting to process directory: {input_dir}")
    output_abs = os.path.abspath(output_dir)

    # os.walk traverses the directory tree top-down, which allows pruning
    # sub-directories by editing `dirnames` in place before they are visited.
    for dirpath, dirnames, filenames in os.walk(input_dir):
        # Never descend into the output directory itself; doing so would
        # re-filter previous results and nest them one level deeper.
        dirnames[:] = [
            name for name in dirnames
            if os.path.abspath(os.path.join(dirpath, name)) != output_abs
        ]

        # Create a corresponding directory structure in the output directory.
        relative_path = os.path.relpath(dirpath, input_dir)
        if relative_path == os.curdir:
            # The root maps to output_dir itself, avoiding a stray "/." suffix.
            output_subdir = output_dir
        else:
            output_subdir = os.path.join(output_dir, relative_path)
        os.makedirs(output_subdir, exist_ok=True)

        for filename in filenames:
            # Construct the full path for input and output files.
            input_file = os.path.join(dirpath, filename)
            output_file = os.path.join(output_subdir, filename)

            print(f"\nProcessing file: {input_file}")
            # Apply the filtering function to the current file.
            filter_and_recount_sentences(input_file, output_file)

    print("\nDirectory processing complete.")


if __name__ == '__main__':
    # --- CONFIGURATION ---
    # INPUT_DIRECTORY:  root folder whose files will be filtered.
    # OUTPUT_DIRECTORY: root folder that receives the filtered copies.
    # Edit both values to match your own data before running.
    INPUT_DIRECTORY = "Domain_Wise_Arranged_Parallel"
    OUTPUT_DIRECTORY = "test_filtered2_copy"

    # --- Optional sample data (disabled) ---
    # Uncomment the lines below to generate a small input tree for a trial run.
    #
    # os.makedirs(INPUT_DIRECTORY, exist_ok=True)
    # os.makedirs(os.path.join(INPUT_DIRECTORY, "en-fr"), exist_ok=True)
    #
    # # Sample file in the top-level input directory.
    # data1 = {
    #     'english_sentence': [
    #         "This is a short sentence.",  # 5 words -> removed
    #         "This is a perfectly acceptable sentence for our use case.",  # 10 words -> kept
    #         "One two three four five six.",  # 6 words -> kept
    #         "This sentence is just right."  # 5 words -> removed
    #     ],
    #     'score': [0.9, 0.95, 0.88, 0.92]
    # }
    # df1 = pd.DataFrame(data1)
    # df1.to_csv(os.path.join(INPUT_DIRECTORY, "data_file_1.tsv"), sep='\t', index=False)
    #
    # # Sample file in the "en-fr" sub-directory.
    # data2 = {
    #     'french_version': [
    #         "Ceci est une phrase parfaite.",  # 5 words -> removed
    #         "Un deux trois quatre cinq six sept huit neuf dix.",  # 10 words -> kept
    #         "Cette phrase contient exactement six mots pour le test.",  # 9 words -> kept
    #         "trop court"  # 2 words -> removed
    #     ],
    #     'quality_metric': [0.85, 0.91, 0.89, 0.7]
    # }
    # df2 = pd.DataFrame(data2)
    # df2.to_csv(os.path.join(INPUT_DIRECTORY, "en-fr", "data_file_2.tsv"), sep='\t', index=False)
    #
    # print("--- Created dummy files and directories for testing. ---")

    # Refuse to start when the configured input folder is missing;
    # otherwise filter the whole tree and report where the results went.
    if not os.path.isdir(INPUT_DIRECTORY):
        print(f"Error: Input directory not found at '{INPUT_DIRECTORY}'")
        print("Please update the INPUT_DIRECTORY variable before running the script.")
    else:
        process_directory(INPUT_DIRECTORY, OUTPUT_DIRECTORY)
        print("\n--- Script finished. ---")
        print(f"Check the '{OUTPUT_DIRECTORY}' folder for the results.")



Error: Input directory not found at 'path/to/your/input_data'
Please update the INPUT_DIRECTORY variable before running the script.
