In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import MinMaxScaler  # Using MinMaxScaler instead
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress warnings and info logs


# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Load data
def load_data():
    train_sequences = pd.read_csv('/kaggle/input/stanford-rna-3d-folding/train_sequences.csv')
    train_labels = pd.read_csv('/kaggle/input/stanford-rna-3d-folding/train_labels.csv')
    test_sequences = pd.read_csv('/kaggle/input/stanford-rna-3d-folding/test_sequences.csv')
    return train_sequences, train_labels, test_sequences

# Preprocessing function with fixed sequence length and scaled data
def preprocess_data(train_sequences, train_labels, test_sequences, fixed_seq_length=50):
    # One-hot encoding mapping
    nucleotides = {'A': [1,0,0,0], 'C': [0,1,0,0], 'G': [0,0,1,0], 'U': [0,0,0,1]}
    default_nuc = [0.25, 0.25, 0.25, 0.25]  # For non-standard nucleotides
    
    # Process training data
    X_train = []
    y_train = []
    valid_counts = 0
    
    for idx, row in train_sequences.iterrows():
        target_id = row['target_id']
        seq = row['sequence']
        
        # Skip sequences longer than our fixed length to avoid truncation issues
        if len(seq) > fixed_seq_length * 2:
            continue
            
        # Get labels for this sequence
        target_labels = train_labels[train_labels['ID'].str.startswith(target_id + '_')]
        
        if len(target_labels) > 0:
            # Check if target_labels has valid numeric data
            has_valid_coordinates = True
            for _, label_row in target_labels.iterrows():
                if (pd.isna(label_row['x_1']) or pd.isna(label_row['y_1']) or pd.isna(label_row['z_1'])):
                    has_valid_coordinates = False
                    break
            
            if not has_valid_coordinates:
                continue
                
            # Create fixed-length sequence representation
            seq_encoded = np.zeros((fixed_seq_length, 4))
            for i in range(min(len(seq), fixed_seq_length)):
                seq_encoded[i] = nucleotides.get(seq[i], default_nuc)

                # Add small Gaussian noise for augmentation (only during training)
                seq_encoded[i] += np.random.normal(0, 0.01, size=(4,))
            
            # Create fixed-length coordinate array
            coords = np.zeros((fixed_seq_length, 3))
            for _, label_row in target_labels.iterrows():
                resid = label_row['resid']
                if 1 <= resid <= fixed_seq_length:
                    coords[resid-1] = [
                        float(label_row['x_1']), 
                        float(label_row['y_1']), 
                        float(label_row['z_1'])
                    ]
            
            # Skip if all coordinates are zero (would cause training issues)
            if np.all(coords == 0):
                continue
                
            X_train.append(seq_encoded)
            y_train.append(coords)
            valid_counts += 1
    
    print(f"Using {valid_counts} valid training examples")
    
    # Convert to numpy arrays
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    
    # Use MinMaxScaler instead of StandardScaler for better numerical stability
    scaler = MinMaxScaler(feature_range=(-1, 1))  # Range from -1 to 1
    y_train_reshaped = y_train.reshape(-1, 3)
    y_train_normalized = scaler.fit_transform(y_train_reshaped)
    y_train = y_train_normalized.reshape(y_train.shape)
    
    # Check for any remaining NaN values and replace them
    X_train = np.nan_to_num(X_train)
    y_train = np.nan_to_num(y_train)
    
    # Process test data
    X_test = []
    test_ids = []
    test_seq_lengths = []
    
    for idx, row in test_sequences.iterrows():
        target_id = row['target_id']
        seq = row['sequence']
        test_seq_lengths.append(len(seq))
        
        # Create fixed-length sequence representation
        seq_encoded = np.zeros((fixed_seq_length, 4))
        for i in range(min(len(seq), fixed_seq_length)):
            seq_encoded[i] = nucleotides.get(seq[i], default_nuc)
        
        X_test.append(seq_encoded)
        test_ids.append(target_id)
    
    X_test = np.array(X_test)
    
    return X_train, y_train, X_test, test_ids, test_seq_lengths, scaler

# Extremely simple model to avoid NaN issues
def build_better_model(seq_length):
    model = models.Sequential([
        layers.Input(shape=(seq_length, 4)),
        layers.Conv1D(64, 5, activation='relu', padding='same'),
        layers.Conv1D(128, 5, activation='relu', padding='same'),
        layers.GlobalAveragePooling1D(),
        layers.Dense(256, activation='relu'),
        layers.Dense(seq_length * 3),
        layers.Reshape((seq_length, 3))
    ])

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=1e-3, 
        clipnorm=1.0
    )

    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber())
    return model

# Generate 5 different predictions
def generate_predictions(model, X_test, scaler, test_seq_lengths):
    # Base prediction
    base_pred = model.predict(X_test)
    
    # Create 5 different predictions
    all_preds = []
    
    # First is the base prediction
    all_preds.append(base_pred)
    
    # Add 4 variations with small noise
    for i in range(4):
        noise = np.random.normal(0, 0.02 * (i+1), base_pred.shape)
        noisy_pred = base_pred + noise
        all_preds.append(noisy_pred)
    
    # Denormalize
    all_denorm = []
    for pred in all_preds:
        pred_flat = pred.reshape(-1, 3)
        denorm_flat = scaler.inverse_transform(pred_flat)
        denorm = denorm_flat.reshape(pred.shape)
        all_denorm.append(denorm)
    
    return all_denorm

# Create submission file
def create_submission(predictions, test_sequences, test_seq_lengths):
    sample_submission = pd.read_csv('/kaggle/input/stanford-rna-3d-folding/sample_submission.csv')
    submission_rows = []
    
    for idx, target_id in enumerate(test_sequences['target_id']):
        seq = test_sequences.loc[test_sequences['target_id'] == target_id, 'sequence'].values[0]
        seq_length = len(seq)
        
        for i in range(seq_length):
            row = {
                'ID': f"{target_id}_{i+1}",
                'resname': seq[i],
                'resid': i+1
            }
            
            # Add coordinates for all 5 predictions
            for model_idx in range(5):
                pred = predictions[model_idx][idx]
                if i < len(pred):
                    row[f'x_{model_idx+1}'] = pred[i, 0]
                    row[f'y_{model_idx+1}'] = pred[i, 1] 
                    row[f'z_{model_idx+1}'] = pred[i, 2]
                else:
                    # Use last prediction for positions beyond model's sequence length
                    row[f'x_{model_idx+1}'] = pred[-1, 0]
                    row[f'y_{model_idx+1}'] = pred[-1, 1]
                    row[f'z_{model_idx+1}'] = pred[-1, 2]
            
            submission_rows.append(row)
    
    # Create DataFrame with same columns as sample submission
    submission = pd.DataFrame(submission_rows)
    submission = submission[sample_submission.columns]
    
    return submission

2025-04-28 13:40:20.037217: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745847620.226363      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745847620.277538      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Fixed sequence length - using smaller value
FIXED_SEQ_LENGTH = 50

# 1. Load data
train_sequences, train_labels, test_sequences = load_data()

# 2. Preprocess with fixed length
X_train, y_train, X_test, test_ids, test_seq_lengths, scaler = preprocess_data(
    train_sequences, train_labels, test_sequences, FIXED_SEQ_LENGTH
)

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Test data shape: {X_test.shape}")

# 3. Build and train model
model = build_better_model(FIXED_SEQ_LENGTH)
model.summary()

# Use early stopping to prevent overfitting and detect NaN
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True
)

# Reduced epochs and batch size
history = model.fit(
    X_train, y_train, 
    epochs=50, 
    batch_size=32, 
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1
)

# Check if training was successful
if np.isnan(history.history['loss'][-1]):
    print("Warning: NaN loss detected. Using fallback prediction method.")
    # Create a fallback prediction based on average coordinates
    avg_coords = np.mean(y_train, axis=0)
    base_pred = np.tile(avg_coords, (len(X_test), 1, 1))
    
    # Manually create 5 predictions with slight variations
    all_preds = [base_pred]
    for i in range(4):
        noise = np.random.normal(0, 0.05 * (i+1), base_pred.shape)
        noisy_pred = base_pred + noise
        all_preds.append(noisy_pred)
        
    # Denormalize
    predictions = []
    for pred in all_preds:
        pred_flat = pred.reshape(-1, 3)
        denorm_flat = scaler.inverse_transform(pred_flat)
        denorm = denorm_flat.reshape(pred.shape)
        predictions.append(denorm)
else:
    # Normal prediction if training succeeded
    predictions = generate_predictions(model, X_test, scaler, test_seq_lengths)

# 5. Create submission
submission = create_submission(predictions, test_sequences, test_seq_lengths)

# 6. Save submission
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

Using 491 valid training examples
Training data shape: (491, 50, 4), (491, 50, 3)
Test data shape: (12, 50, 4)


I0000 00:00:1745847653.123080      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/50


I0000 00:00:1745847656.484759      57 service.cc:148] XLA service 0x791c48005d90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745847656.485332      57 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1745847656.813163      57 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 1/14[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m54s[0m 4s/step - loss: 0.0483

I0000 00:00:1745847658.553311      57 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 142ms/step - loss: 0.0391 - val_loss: 0.0453
Epoch 2/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0131 - val_loss: 0.0291
Epoch 3/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0087 - val_loss: 0.0221
Epoch 4/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0059 - val_loss: 0.0171
Epoch 5/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0052 - val_loss: 0.0166
Epoch 6/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0051 - val_loss: 0.0173
Epoch 7/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0051 - val_loss: 0.0174
Epoch 8/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0050 - val_loss: 0.0172
Epoch 9/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [3]:
submission

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,R1107_1,G,1,-16.466221,4.980183,-27.718533,1.573927,12.680355,-16.037476,-41.938035,27.809080,-36.271541,-61.542274,-27.909282,-48.973402,-59.677660,40.859026,-25.910892
1,R1107_2,G,2,-17.194563,9.697314,-23.116400,10.483200,-1.598007,-15.474854,-49.275897,-1.254517,-33.084076,-19.186252,-37.860097,7.063231,-151.174862,91.739837,-2.268101
2,R1107_3,G,3,-16.178947,12.606621,-29.664845,-24.672972,33.540272,-40.880025,-27.351244,31.711048,-41.519940,5.753744,-22.123652,-35.290154,-37.746390,-9.933882,-3.216446
3,R1107_4,G,4,-16.399809,16.219158,-21.268335,-3.391703,8.806702,-16.033356,-27.160611,33.441669,-12.884974,-54.108489,10.221653,-44.916062,-18.944528,19.285233,-15.576215
4,R1107_5,G,5,-26.273947,13.949676,-29.064157,-8.297885,5.327002,-42.928662,-45.753452,39.818413,-11.890366,64.831576,-49.948093,-65.026374,-72.715493,-5.371003,-36.176984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2510,R1190_114,U,114,-9.295615,-0.585306,-4.526742,-14.770141,-7.299410,-10.755224,1.973270,18.844244,-15.907896,14.512626,54.027217,-23.275496,12.289430,-19.373118,-42.693316
2511,R1190_115,U,115,-9.295615,-0.585306,-4.526742,-14.770141,-7.299410,-10.755224,1.973270,18.844244,-15.907896,14.512626,54.027217,-23.275496,12.289430,-19.373118,-42.693316
2512,R1190_116,U,116,-9.295615,-0.585306,-4.526742,-14.770141,-7.299410,-10.755224,1.973270,18.844244,-15.907896,14.512626,54.027217,-23.275496,12.289430,-19.373118,-42.693316
2513,R1190_117,U,117,-9.295615,-0.585306,-4.526742,-14.770141,-7.299410,-10.755224,1.973270,18.844244,-15.907896,14.512626,54.027217,-23.275496,12.289430,-19.373118,-42.693316
