In [1]:
import pandas as pd
import os

In [2]:
def split_and_save_file(filepath, output_root_dir, split_size=1000, filename_prefix="subfile"):
    """
    Split the tab-separated file at 'filepath' into chunks of at most 'split_size'
    rows and save each chunk in its own subdirectory of 'output_root_dir'.

    Chunk number k (1-based) is written to
    '<output_root_dir>/<filename_prefix>_dir_<k>/<filename_prefix>_from_<first>_to_<last>.txt',
    where <first> and <last> are the inclusive 0-based row positions of the chunk
    within the input. Each output file is tab-separated, repeats the header row
    and omits the DataFrame index. An input without data rows produces no chunk
    files. The path of every 100th chunk is printed as a progress indicator.

    Args:
    - filepath: Path to the tab-separated input file.
    - output_root_dir: The root directory where subdirectories will be created;
      it is created if it does not exist yet.
    - split_size: Maximum number of rows per split (must be >= 1). Default is 1000.
    - filename_prefix: Prefix for the output directory and file names.

    Raises:
    - ValueError: If 'split_size' is smaller than 1.
    """
    # Validate before the (potentially slow) read so a bad argument fails fast
    # instead of surfacing as a ZeroDivisionError or a silent no-op afterwards.
    if split_size < 1:
        raise ValueError(f"split_size must be a positive integer, got {split_size!r}")

    data = pd.read_table(filepath, sep='\t')
    print('Read the file...')

    total_rows = len(data)

    if not os.path.isdir(output_root_dir):
        # exist_ok avoids a crash if the directory appears between check and creation.
        os.makedirs(output_root_dir, exist_ok=True)
        print(f"Created root directory: {output_root_dir}")

    # Stepping the range by split_size yields the start row of every chunk,
    # including the final, possibly shorter, one.
    for chunk_no, start_idx in enumerate(range(0, total_rows, split_size), start=1):
        end_idx  = min(start_idx + split_size, total_rows)  # exclusive upper bound
        sub_data = data.iloc[start_idx:end_idx]

        dir_name  = os.path.join(output_root_dir, f"{filename_prefix}_dir_{chunk_no}")
        # The file name carries the inclusive row range, hence end_idx - 1.
        file_name = f"{filename_prefix}_from_{start_idx}_to_{end_idx - 1}.txt"
        full_path = os.path.join(dir_name, file_name)

        os.makedirs(dir_name, exist_ok=True)

        sub_data.to_csv(full_path, sep='\t', index=False)

        # Report only every 100th chunk to keep the output short.
        if chunk_no % 100 == 0:
            print(f"File saved: {full_path}")

In [3]:
# Path of the input file to split, relative to the working directory.
filepath = 'data/chembl/chembl_33_chemreps.txt'
# Root directory under which the per-chunk subdirectories are written.
output_root_dir = './input'

In [4]:
# Load the whole tab-separated input into memory so it can be inspected.
data = pd.read_csv(filepath, sep='\t')

In [5]:
# Number of data rows in the loaded table.
data.shape[0]

2372674

In [6]:
%%time

split_and_save_file(filepath, output_root_dir, split_size=1187, filename_prefix="chembl_split")

Read the file...
Created root directory: ./input
File saved: ./input/chembl_split_dir_100/chembl_split_from_117513_to_118699.txt
File saved: ./input/chembl_split_dir_200/chembl_split_from_236213_to_237399.txt
File saved: ./input/chembl_split_dir_300/chembl_split_from_354913_to_356099.txt
File saved: ./input/chembl_split_dir_400/chembl_split_from_473613_to_474799.txt
File saved: ./input/chembl_split_dir_500/chembl_split_from_592313_to_593499.txt
File saved: ./input/chembl_split_dir_600/chembl_split_from_711013_to_712199.txt
File saved: ./input/chembl_split_dir_700/chembl_split_from_829713_to_830899.txt
File saved: ./input/chembl_split_dir_800/chembl_split_from_948413_to_949599.txt
File saved: ./input/chembl_split_dir_900/chembl_split_from_1067113_to_1068299.txt
File saved: ./input/chembl_split_dir_1000/chembl_split_from_1185813_to_1186999.txt
File saved: ./input/chembl_split_dir_1100/chembl_split_from_1304513_to_1305699.txt
File saved: ./input/chembl_split_dir_1200/chembl_split_from_142