# Data Preprocessing

### Process mentioned in paper

we apply several filtering steps such as excluding
structures with only one atom per nucleotide, removing protein
residues from RNA structures, extracting contiguous sequences
from experimental structures to address mismatches between the
provided FASTA sequences and the experimental 3D structures
for preserving base pairing integrity, and excluding sequences
lacking any base pairs in their corresponding native structures.
Finally, we only retain sequences with a minimum length of 30
and a maximum length of 200 to ensure efficient training. This
results in a clean training set consisting of 573 RNA sequences
and 52 non-redundant test sequences (excluding component
0) for benchmarking our method development

The model require "fasta file and 3 base pair maps" as inputs

Dataset - RNA3DB (mmcif files)
Base pair maps extracted from RNAview, MCannotate, DSSR

In [None]:
conda activate RNAbpFlow

In [5]:
#Get Dataset
wget https://github.com/marcellszi/rna3db/releases/download/incremental-update/rna3db-mmcifs.v2.tar.xz
tar -xzf rna3db-mmcifs.v2.tar.xz
#unzip and remove component 0 folder from test set

## Remove structures with only one atom per nucleotide

Filter a set of files, removing any file containing at least one nucleotide represented by only a single atom.  Retain only those structures where every nucleotide is represented by more than one atom.

In [1]:
import os
import re

def has_any_single_atom_nucleotide(file_path):
    try:
        with open(file_path, 'r') as f:
            content = f.read()
        
        # Extract the atom_site loop section
        atom_section_match = re.search(r"loop_\s+_atom_site\..*?(?=\n#|\Z)", content, re.DOTALL)
        if not atom_section_match:
            return False  # No atom section found
        
        atom_section = atom_section_match.group(0)
        
        # Get all atom lines
        atom_lines = re.findall(r"ATOM\s+\d+.*", atom_section)
        
        # Count atoms per residue
        residue_atom_count = {}
        for line in atom_lines:
            parts = line.split()
            if len(parts) >= 20:  # Ensure we have enough columns
                res_id = parts[8]  # label_seq_id
                res_name = parts[5]  # label_comp_id
                res_key = f"{res_name}_{res_id}"
                
                if res_key in residue_atom_count:
                    residue_atom_count[res_key] += 1
                else:
                    residue_atom_count[res_key] = 1
        
        # Check if at least one residue has exactly one atom
        return any(count == 1 for count in residue_atom_count.values())
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return False

def remove_files_with_any_single_atom_nucleotide(root_folder):
    removed_files = 0

    # Walk through all directories and subdirectories
    for dirpath, dirnames, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.cif'):
                file_path = os.path.join(dirpath, filename)
                if has_any_single_atom_nucleotide(file_path):
                    try:
                        os.remove(file_path)
                        removed_files += 1
                        print(f"Removed: {file_path}")
                    except Exception as e:
                        print(f"Error removing {file_path}: {e}")

    print(f"Total files removed: {removed_files}")

# Paths to the root folders
test_folder = r'C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set'
train_folder = r'C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set'

# Remove files from both test and train sets
remove_files_with_any_single_atom_nucleotide(test_folder)
remove_files_with_any_single_atom_nucleotide(train_folder)


Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_109\4v5z_BN\4v5z_BN.cif
Total files removed: 1
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1e8s_C\1e8s_C.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1emi_B\1emi_B.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1ob5_B\1qzb_B.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1ob5_B\3ep2_Y.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1ob5_B\3eq3_Y.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1qrt_B\1gsg_T.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1qzc_A\1qzc_A.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1r2x_C\1r2w_C.cif
Re

## Remove NON-RNA sequences

Parse through all files and eliminate those containing any non-RNA residues, such as protein residues or unknown values.  Retain only files consisting exclusively of RNA residues.

In [10]:
import os
from Bio.PDB import MMCIFParser

# Define standard RNA nucleotide residues
RNA_NUCLEOTIDES = {"A", "U", "G", "C"}  

def has_non_rna_residues(mmcif_file):
    """Check if an mmCIF file contains residues that are NOT RNA nucleotides."""
    parser = MMCIFParser(QUIET=True)

    try:
        structure = parser.get_structure("Structure", mmcif_file)
        for model in structure:
            for chain in model:
                for residue in chain:
                    res_name = residue.get_resname().strip()
                    if res_name not in RNA_NUCLEOTIDES:
                        return True  # Found a non-RNA residue
    except Exception as e:
        print(f"Error processing {mmcif_file}: {e}")

    return False  # All residues are RNA nucleotides

def remove_non_rna_mmcif_files(root_folder):
    """Recursively scan and remove mmCIF files containing non-RNA residues."""
    removed_files = 0

    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith(".cif"):
                filepath = os.path.join(dirpath, filename)
                if has_non_rna_residues(filepath):
                    try:
                        os.remove(filepath)
                        removed_files += 1
                        print(f"Removed: {filepath}")
                    except Exception as e:
                        print(f"Error removing {filepath}: {e}")

    print(f"Total non-RNA mmCIF files removed: {removed_files}")

# Paths to the root folders
test_folder = r"C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set"
train_folder = r"C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set"

# Remove non-RNA mmCIF files from both test and train sets
remove_non_rna_mmcif_files(test_folder)
remove_non_rna_mmcif_files(train_folder)


Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_114\8bvh_A\8bvh_A.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_116\8dej_N\8dej_N.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_124\7k16_P\7k16_P.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_48\8svf_I\8svf_I.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_48\8svf_J\8svf_J.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_53\7oqc_I\7oqc_I.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_53\7oqe_I\7oqe_I.cif
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_81\7uwh_B\7uwh_B.cif
Total non-RNA mmCIF files removed: 8
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set\component_1\1ob5_B

## Remove files outside length range (30 to 200)

Parse through all files, get sequence length and eliminate files that doesn't fall under the range (30 to 200)

In [11]:
import os
import re

def get_sequence_length(file_path):
    """Extracts the sequence length from an mmCIF file."""
    try:
        with open(file_path, 'r') as f:
            content = f.read()

        # Look for the one-letter sequence code
        seq_match = re.search(r"_entity_poly\.pdbx_seq_one_letter_code_can\s+(\S+)", content)
        if seq_match:
            sequence = seq_match.group(1)
            return len(sequence)

        # If not found, try to count from entity_poly_seq loop
        poly_seq_match = re.search(r"loop_\s+_entity_poly_seq\.entity_id\s+_entity_poly_seq\.num\s+_entity_poly_seq\.mon_id\s+_entity_poly_seq\.heter(.*?)(?=\n#|\Z)", content, re.DOTALL)
        if poly_seq_match:
            poly_seq_section = poly_seq_match.group(1)
            seq_lines = re.findall(r"\d+\s+\d+\s+\w+\s+\w+", poly_seq_section)
            return len(seq_lines)

        return None  # Couldn't determine sequence length

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def remove_files_outside_sequence_range(root_folder, min_length=30, max_length=200):
    """Scans and removes mmCIF files with sequence lengths outside the range."""
    removed_files = 0
    skipped_files = 0
    total_files = 0

    # Walk through all directories and subdirectories
    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.cif'):
                total_files += 1
                file_path = os.path.join(dirpath, filename)
                sequence_length = get_sequence_length(file_path)

                if sequence_length is None:
                    skipped_files += 1
                    continue

                if sequence_length < min_length or sequence_length > max_length:
                    try:
                        os.remove(file_path)
                        removed_files += 1
                        print(f"Removed: {file_path} (Length: {sequence_length})")
                    except Exception as e:
                        print(f"Error removing {file_path}: {e}")

                # Print progress every 100 files
                if total_files % 100 == 0:
                    print(f"Processed {total_files} files...")

    print(f"Total .cif files processed: {total_files}")
    print(f"Files removed due to sequence length outside range: {removed_files}")
    print(f"Files skipped (couldn't determine length): {skipped_files}")

# Paths to the test and train folders
test_folder = r'C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set'
train_folder = r'C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set'

# Remove unwanted files from both folders
remove_files_outside_sequence_range(test_folder)
remove_files_outside_sequence_range(train_folder)


Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_103\8toc_R\8toc_R.cif (Length: 4269)
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_107\7qgr_x\7qgh_x.cif (Length: 692)
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_107\7qgr_x\7qgr_x.cif (Length: 692)
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_125\8bu8_A\8bu8_A.cif (Length: 354)
Processed 100 files...
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_35\5lqw_9\5lqw_9.cif (Length: 572)
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_35\6j6h_B\6j6g_B.cif (Length: 679)
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_35\6j6h_B\6j6h_B.cif (Length: 679)
Removed: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_35\6j6h_B\7oqb_I.cif (Length: 3

## Remove non-contagious sequences

Parse through all files and check for sequence interruptions as indicated by residue numbering. Remove any file where such an interruption is detected. Retain only files containing uninterrupted sequences.

In [14]:
import os
from Bio.PDB import MMCIFParser

# Define standard RNA nucleotides
RNA_NUCLEOTIDES = {"A", "U", "G", "C"}

def has_non_contiguous_rna_sequences(mmcif_file):
    """Check if an mmCIF file contains non-contiguous RNA sequences."""
    parser = MMCIFParser(QUIET=True)
    
    try:
        structure = parser.get_structure("RNA", mmcif_file)
        
        for model in structure:
            for chain in model:
                previous_residue_id = None
                
                for residue in chain:
                    res_name = residue.get_resname().strip()
                    res_id = residue.get_id()[1]  # Extract residue number
                    
                    if res_name in RNA_NUCLEOTIDES:
                        if previous_residue_id is not None and res_id != previous_residue_id + 1:
                            # Found a non-contiguous sequence
                            return True
                        
                        previous_residue_id = res_id
    
    except Exception as e:
        print(f"Error processing {mmcif_file}: {e}")
        return False  # Treat errors as files to ignore
    
    return False  # File has only contiguous sequences

def remove_files_with_non_contiguous_rna(root_folder):
    """Delete mmCIF files containing non-contiguous RNA sequences."""
    removed_files = 0
    total_files = 0
    
    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith(".cif"):
                total_files += 1
                file_path = os.path.join(dirpath, filename)
                
                if has_non_contiguous_rna_sequences(file_path):
                    try:
                        os.remove(file_path)
                        removed_files += 1
                        print(f"Deleted: {file_path}")
                    except Exception as e:
                        print(f"Error deleting {file_path}: {e}")

                # Print progress every 100 files
                if total_files % 100 == 0:
                    print(f"Processed {total_files} files...")
    
    print(f"Total .cif files processed: {total_files}")
    print(f"Files removed due to non-contiguous RNA sequences: {removed_files}")

# Paths to test and train folders
test_folder = r"C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set"
train_folder = r"C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\train_set"

# Process and remove files from both folders
remove_files_with_non_contiguous_rna(test_folder)
remove_files_with_non_contiguous_rna(train_folder)


Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_101\4v2s_Q\4v2s_Q.cif
Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_102\8uta_N\8uiw_N.cif
Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_102\8uta_N\8uta_N.cif
Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_117\8e8m_R\8e74_R.cif
Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_117\8e8m_R\8e8m_R.cif
Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_120\4rum_A\4rum_A.cif
Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_126\6gov_R\6gov_R.cif
Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_29\6jdv_B\6jdv_B.cif
Deleted: C:\Users\nikhi\Desktop\RNA\RNAbpFlow\complex\rna3db-mmcifs\test_set\component_29\8hj4_B\6jdq_B.cif
Deleted: C:\Users\nik

## RNAVIEW Base Pairing

In [None]:
#Get RNAview and set path to access on terminal
#Refer to RNAview github for installation
#RNAview : https://github.com/rcsb/RNAView
wget https://github.com/rcsb/RNAView/archive/refs/tags/RNAView-v2.0.0.tar.gz
zcat  RNAVIEW.tar.gz | tar xvf -
cd /?/?/?/RNAVIEW/
make
RNAVIEW=/?/?/?/RNA/RNAVIEW; export RNAVIEW
PATH="/?/?/?/RNAVIEW/bin:"$PATH; export PATH


In [1]:
import sys
import os

# Get the directory where rnaview is located 
rnaview_dir = "/mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/rnaview"  

if rnaview_dir not in sys.path:
    sys.path.append(rnaview_dir)

!rnaview -h


Usage: rnaview -f filename 
--------------------------------------------------------------
        Options of the rnaview program
+-------------------------------------------------------------+
| (1) If no [option] is given, it only generate the fully     |
|     annotated base pair lists.                              |
|     Example:    rnaview  --pdb pdbfile_name                 |
|                                                             |
| (2) [option] -p to generate fully annotated 2D structure in |
|     postscript format. Detailed information is given in XML |
|     format(RNAML)                                           |
|     Example:    rnaview  -p --pdb pdbfile_name              |
|                                                             |
| (3) [option] -v to generate a 3D structure in VRML format.  |
|     It can be displayed on internet (with VRML plug in).    |
|     Example:    rnaview  -v --pdb pdbfile_name              |
|                   

### Total cif files remaining before base pairing

In [2]:
import os

def count_cif_files(root_folder):
    """Counts all .cif files in the specified folder and its subdirectories."""
    cif_count = 0
    
    for dirpath, _, filenames in os.walk(root_folder):
        cif_count += sum(1 for filename in filenames if filename.endswith(".cif"))
    
    print(f"Total .cif files in {root_folder}: {cif_count}")
    return cif_count

# Define root folders
test_folder = "/mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set"
train_folder = "/mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set"

# Count CIF files in both folders
test_count = count_cif_files(test_folder)
train_count = count_cif_files(train_folder)

total_count = test_count + train_count
print(f"Total .cif files in all folders: {total_count}")


Total .cif files in /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set: 230
Total .cif files in /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set: 4916
Total .cif files in all folders: 5146


### Base pairing using RNAview and remove files with no base pairs

Process all CIF files by executing the RNAview command, generating multiple output files for each (filename.cif.out, filename.cif.ps, filename.cif.xml, filename.cif_sort.cif, filename.cif_patt.out, filename.cif_torsion.out, filename.cif.tmp.pdb). Parse each corresponding "filename.cif.out" file. If the base pair count within this file is zero, delete the original CIF file and all associated RNAview output files. Otherwise, retain the CIF file and all generated output files.

In [3]:
import subprocess
import os
import multiprocessing
from concurrent.futures import ThreadPoolExecutor

def run_rnaview(mmcif_file):
    """Runs rnaview on an mmCIF file and returns the generated .cif.out file."""
    command = f"rnaview -p --cif --label {mmcif_file}"
    subprocess.run(command, shell=True, capture_output=True, text=True)
    
    # Generate the expected .cif.out filename
    out_file = f"{mmcif_file}.out"
    return mmcif_file, out_file

def has_zero_base_pairs(out_file):
    """Checks if the output file contains 'The total base pairs = 0'."""
    try:
        with open(out_file, 'r') as f:
            content = f.read()
            return 'The total base pairs =   0' in content
    except FileNotFoundError:
        return False

def delete_files(file_path):
    """Deletes the given CIF file and all its associated output files."""
    base_dir = os.path.dirname(file_path)
    base_name = os.path.basename(file_path).replace(".cif", "")
    
    try:
        os.remove(file_path)
        deleted_files = [file_path]
        
        # Find and delete all associated output files
        for file in os.listdir(base_dir):
            if file.startswith(base_name) and file.endswith(".out"):
                file_to_delete = os.path.join(base_dir, file)
                os.remove(file_to_delete)
                deleted_files.append(file_to_delete)
        
        print(f"Deleted: {', '.join(deleted_files)}")
        return 1
    except FileNotFoundError as e:
        print(f"Error: {e}. File might have been moved or already deleted.")
        return 0

def process_and_delete(root_folder):
    """Checks mmCIF files, deletes those whose .cif.out file indicates 0 base pairs."""
    cif_files = []
    
    # Collect all .cif files first
    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith(".cif"):
                cif_files.append(os.path.join(dirpath, filename))
    
    total_checked = len(cif_files)
    total_deleted = 0
    
    # Use multiprocessing to speed up processing
    with multiprocessing.Pool(processes=os.cpu_count()) as pool:
        results = pool.map(run_rnaview, cif_files)
    
    # Filter files with zero base pairs
    files_to_delete = [file_path for file_path, out_file in results if has_zero_base_pairs(out_file)]
    
    # Use ThreadPoolExecutor for efficient file deletion
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        deletion_results = executor.map(delete_files, files_to_delete)
        total_deleted = sum(deletion_results)
    
    print(f"\nSummary for {root_folder}:")
    print(f"Total CIF files checked: {total_checked}")
    print(f"Total CIF files deleted: {total_deleted}\n")

# Define root folders
test_folder = "/mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set"
train_folder = "/mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set"

# Process both test and train folders
process_and_delete(test_folder)
process_and_delete(train_folder)


Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_108/8x5d_O/8x5d_O.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_108/8x5d_O/8x5d_O.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_108/8x5d_O/8x5d_O.cif_torsion.out
Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_113/6yl5_L/6yl5_L.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_113/6yl5_L/6yl5_L.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_113/6yl5_L/6yl5_L.cif_torsion.out
Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_118/7cxm_J/7cxm_J.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_118/7cxm_J/7cxm_J.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_118/7cxm_J/7cxm_J

Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_67/5gip_H/5gip_Q.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_67/5gip_H/5gip_Q.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_67/5gip_H/5gip_Q.cif_torsion.out
Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_89/7n2c_mR/7n2u_mR.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_89/7n2c_mR/7n2u_mR.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_89/7n2c_mR/7n2u_mR.cif_torsion.out
Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_94/7okx_P/7oky_P.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_94/7okx_P/7oky_P.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set/component_94/7okx_P/7oky_P.ci

Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_15/3aoh_Q/3aoi_Q.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_15/3aoh_Q/3aoi_Q.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_15/3aoh_Q/3aoi_Q.cif_torsion.out
Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_16/8y6o_J/3siv_F.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_16/8y6o_J/3siv_F.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_16/8y6o_J/3siv_F.cif_torsion.out
Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_16/8y6o_J/3siv_C.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_16/8y6o_J/3siv_C.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_16/8y6o_J/3siv_C

Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_8/8urb_I/8urb_I.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_8/8urb_I/8urb_I.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_8/8urb_I/8urb_I.cif_torsion.out
Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_9/5xlp_K/5xlp_K.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_9/5xlp_K/5xlp_K.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_9/5xlp_K/5xlp_K.cif_torsion.out
Deleted: /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_8/8urb_J/8urb_J.cif, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_8/8urb_J/8urb_J.cif.out, /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set/component_8/8urb_J/8urb_J.cif_tors

### Total cif files remaining after base pairing

In [4]:
import os

def count_cif_files(root_folder):
    """Counts all .cif files in the specified folder and its subdirectories."""
    cif_count = 0
    
    for dirpath, _, filenames in os.walk(root_folder):
        cif_count += sum(1 for filename in filenames if filename.endswith(".cif"))
    
    print(f"Total .cif files in {root_folder}: {cif_count}")
    return cif_count

# Define root folders
test_folder = "/mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set"
train_folder = "/mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set"

# Count CIF files in both folders
test_count = count_cif_files(test_folder)
train_count = count_cif_files(train_folder)

total_count = test_count + train_count
print(f"Total .cif files in all folders: {total_count}")


Total .cif files in /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/test_set: 169
Total .cif files in /mnt/c/Users/nikhi/Desktop/RNA/RNAbpFlow/complex/rna3db-mmcifs/train_set: 4794
Total .cif files in all folders: 4963


With the new outputs we can generate base pair maps as ".npy" files. RNAview base pairing map is just 1 of 3 maps .            
To Download and access "DSSR", It is required to request access from [https://inventions.techventures.columbia.edu/technologies/dssr-an-integrated--CU20391/licenses/188].    
"MCannotate" - [https://github.com/major-lab/MC-Annotate]