# --- Notebook Setup ---

In [1]:
from datetime import datetime
import pytz
# Log the wall-clock start time of the run in the Asia/Singapore time zone.
print('LOGGING TIME OF START:',  datetime.now(pytz.timezone('Asia/Singapore')).strftime("%Y-%m-%d %H:%M:%S"))


# Probe for Biopython. Only a missing module is tolerated: a bare `except:`
# would also swallow KeyboardInterrupt/SystemExit and unrelated errors.
try:
    import Bio
except ImportError:
    # Biopython is not installed. It was needed for drfold2; to install it,
    # run one of the following in a cell:
    #   !pip install biopython
    #   !pip install /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
    pass

# NOTE: nothing is installed by this cell; the message only marks that the
# dependency-check step finished.
print('PIP INSTALL OK !!!!')

LOGGING TIME OF START: 2025-04-28 18:41:45
PIP INSTALL OK !!!!


In [None]:
import os,sys

import pandas as pd
# Display settings: show at most 20 columns, and do not wrap the repr of wide
# DataFrames across multiple "pages".
pd.options.display.max_columns = 20
pd.options.display.expand_frame_repr = False

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from timeit import default_timer as timer
import re
from pathlib import Path
import matplotlib 
import matplotlib.pyplot as plt

print('IMPORT OK!!!')

# --- Constants ---

In [None]:

# Directory where all .pdb files are stored
PDB_DIR = Path("/kaggle/input/rna-folding-top-data")
TEST_SEQ_FILE = "/kaggle/input/stanford-rna-3d-folding/test_sequences.csv"

# Structure files to read, as plain string paths built from PDB_DIR so the
# directory is defined in exactly one place. The list order is significant:
# the i-th file supplies the x_i / y_i / z_i columns of the submission.
PDB_FILES = [
    (PDB_DIR / file_name).as_posix()
    for file_name in (
        "R1261TS481_2-vfold-0.786.pdb",
        "R1262TS481_2-vfold-0.791.pdb",
        "R1263TS481_1-vfold-0.977.pdb",
        "R1264TS481_1-vfold-0.887.pdb",
        "R1291TS481_1-vfold-0.834.pdb",
    )
]

# Where the final submission CSV is written
SUBMISSION_FILE = "/kaggle/working/submission.csv"

In [None]:
# --- Load Test Sequences ---
test_seqs = pd.read_csv(TEST_SEQ_FILE)

# --- Expand Test Sequences to per-residue Rows ---
# One record per character of each sequence. Residue numbers are 1-based and
# restart at 1 for every target; the ID is "<target_id>_<residue number>".
expanded_rows = [
    {
        'ID': f"{target_id}_{position}",
        'target_id': target_id,
        'resname': nucleotide,
        'resid': position,
    }
    for target_id, sequence in zip(test_seqs['target_id'], test_seqs['sequence'])
    for position, nucleotide in enumerate(sequence, start=1)
]

In [None]:
# Build the per-residue frame. The column set is given explicitly so that an
# empty `expanded_rows` still produces a frame with the expected columns
# (a column-less frame would make the later test_df['ID'] lookup raise KeyError).
test_df = pd.DataFrame(expanded_rows, columns=['ID', 'target_id', 'resname', 'resid'])

# --- PDB Parsing Functions ---
def extract_c1prime_coords_with_resid(pdb_path):
    """Read the C1' atom position of every residue from a PDB file.

    Only lines starting with "ATOM" whose atom-name field (columns 13-16) is
    "C1'" are used. The residue number is taken from columns 23-26 and the
    x/y/z coordinates from the three 8-character fields in columns 31-54.

    Args:
        pdb_path: Path of the PDB file to read.

    Returns:
        dict mapping residue number (int) to ``[x, y, z]`` (list of floats).
        If a residue number occurs more than once, the last matching line in
        the file wins.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a matching line has a non-numeric residue number or
            coordinate field.
    """
    coords_by_resid = {}
    with open(pdb_path, 'r') as handle:
        for record in handle:
            if not record.startswith("ATOM"):
                continue
            if record[12:16].strip() != "C1'":
                continue
            residue_number = int(record[22:26])
            # x, y and z occupy consecutive fixed-width (8 character) fields.
            coords_by_resid[residue_number] = [
                float(record[start:start + 8]) for start in (30, 38, 46)
            ]
    return coords_by_resid

# --- Load Coordinates from PDB Models ---
# One {resid: [x, y, z]} mapping per entry of PDB_FILES, in the same order.
model_coords = [
    extract_c1prime_coords_with_resid(pdb_path) for pdb_path in PDB_FILES
]

# --- Align Models to Expanded Test Data ---
num_residues = len(test_df)
# Start the submission from an independent copy of the identifying columns;
# the per-model coordinate columns are added to it afterwards.
submission = test_df[['ID', 'resname', 'resid']].copy()

In [None]:
# Add one x_k / y_k / z_k column triple per loaded model (k is 1-based).
# NOTE(review): the lookup below uses only the residue number, not the
# target_id, so every target receives the same model coordinates for a given
# residue number - confirm this is intended.
for model_number, residue_coords in enumerate(model_coords, start=1):
    # Coordinates chosen for each row of test_df, in row order.
    chosen_xyz = []
    for residue_id in test_df['resid']:
        xyz = residue_coords.get(residue_id)
        if xyz is None:
            # Padding if missing: repeat the previous row's coordinates, or
            # use the origin when there is no previous row yet.
            xyz = chosen_xyz[-1] if chosen_xyz else [0.0, 0.0, 0.0]
        chosen_xyz.append(xyz)

    submission[f'x_{model_number}'] = [xyz[0] for xyz in chosen_xyz]
    submission[f'y_{model_number}'] = [xyz[1] for xyz in chosen_xyz]
    submission[f'z_{model_number}'] = [xyz[2] for xyz in chosen_xyz]

# --- Save Submission ---
# Written without the DataFrame index so the CSV holds only the data columns.
submission.to_csv(SUBMISSION_FILE, index=False)
print(f"✅ Final Submission file generated: {SUBMISSION_FILE}")

✅ Final Submission file generated: /kaggle/working/submission.csv
