In [9]:
import os
import pandas as pd

def get_protein_names(protein_folder):
    """
    Get the names of protein files without their .pdb extension.

    Args:
    - protein_folder: Folder containing protein files.

    Returns:
    - List of protein names without the .pdb extension.
    """
    protein_files = os.listdir(protein_folder)
    protein_names = [os.path.splitext(file)[0] for file in protein_files if file.endswith('.pdb')]
    print(protein_names)
    return protein_names

def split_and_save_file(filepath, output_root_dir, protein_folder, split_size=1000, filename_prefix="subfile"):
    """
    Split the file at 'filepath' into smaller files of 'split_size' rows each,
    save them in directories within 'output_root_dir'. Also create subdirectories
    for each protein in 'protein_folder'.

    Args:
    - filepath: Path to the input file.
    - output_root_dir: The root directory where subdirectories will be created.
    - protein_folder: Folder containing protein files.
    - split_size: Number of rows per split. Default is 1000.
    - filename_prefix: Prefix for the output file names.
    """

    # Get protein names
    proteins = get_protein_names(protein_folder)
    print('Protein names:', proteins)

    count = 0
    data = pd.read_table(filepath, sep='\t')
    print('Read the file...')

    num_splits = len(data) // split_size + (1 if len(data) % split_size != 0 else 0)

    if not os.path.exists(output_root_dir):
        os.makedirs(output_root_dir)
        print(f"Created root directory: {output_root_dir}")

    for i in range(num_splits):
        count += 1
        start_idx = i * split_size
        end_idx = min((i + 1) * split_size, len(data))
        sub_data = data.iloc[start_idx:end_idx]

        # Create subdirectory for split files
        split_dir_name = os.path.join(output_root_dir, f"{filename_prefix}_dir_{i + 1}")
        if not os.path.exists(split_dir_name):
            os.makedirs(split_dir_name)
            print(f"Created directory: {split_dir_name}")
            # Create subdirectories for each protein within the split directory
            for protein in proteins:
                protein_dir = os.path.join(split_dir_name, protein)
                if not os.path.exists(protein_dir):
                    os.makedirs(protein_dir)
                    print(f"Created protein subdirectory: {protein_dir}")

        file_name = f"{filename_prefix}_from_{start_idx}_to_{end_idx - 1}.txt"
        full_path = os.path.join(split_dir_name, file_name)
        sub_data.to_csv(full_path, sep='\t', index=False)
        
        if (count % 100 == 0):
            print(f"File saved: {full_path}")

In [4]:
filepath        = 'data/chembl_33_chemreps.txt'  
output_root_dir = './input'  

In [5]:
data = pd.read_table(filepath, sep='\t')

In [7]:
len(data)

2372674

In [11]:
%%time

split_and_save_file(filepath, output_root_dir, './proteins', split_size=1187, filename_prefix="chembl_split")

['apol1', 'slc6a19', 'adcy5']
Protein names: ['apol1', 'slc6a19', 'adcy5']
Read the file...
Created root directory: ./input
Created directory: ./input/chembl_split_dir_1
Created protein subdirectory: ./input/chembl_split_dir_1/apol1
Created protein subdirectory: ./input/chembl_split_dir_1/slc6a19
Created protein subdirectory: ./input/chembl_split_dir_1/adcy5
Created directory: ./input/chembl_split_dir_2
Created protein subdirectory: ./input/chembl_split_dir_2/apol1
Created protein subdirectory: ./input/chembl_split_dir_2/slc6a19
Created protein subdirectory: ./input/chembl_split_dir_2/adcy5
Created directory: ./input/chembl_split_dir_3
Created protein subdirectory: ./input/chembl_split_dir_3/apol1
Created protein subdirectory: ./input/chembl_split_dir_3/slc6a19
Created protein subdirectory: ./input/chembl_split_dir_3/adcy5
Created directory: ./input/chembl_split_dir_4
Created protein subdirectory: ./input/chembl_split_dir_4/apol1
Created protein subdirectory: ./input/chembl_split_dir_4