# EDA: Stanford RNA 3D Folding Part 2

Exploration of:
- Train/validation sequence length distribution
- MSA depth distribution
- Multi-chain (stoichiometry) distribution
- Ligand presence (targets with vs. without ligands)
- Temporal cutoff distribution

In [ ]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Data location: point DATA_DIR at your local copy (or the Kaggle input path).
DATA_DIR = Path("../data")
_val_csv = DATA_DIR / "validation_sequences.csv"

train_seq = pd.read_csv(DATA_DIR / "train_sequences.csv")
train_lab = pd.read_csv(DATA_DIR / "train_labels.csv")

# The validation split is optional: val_seq stays None when its CSV is absent.
if _val_csv.exists():
    val_seq = pd.read_csv(_val_csv)
else:
    val_seq = None

In [ ]:
# Sequence length distribution (train, plus validation when that split was loaded)
train_seq["seq_len"] = train_seq["sequence"].str.len()

# One (label, lengths) pair per panel; the train panel is always present.
length_panels = [("Train", train_seq["seq_len"])]
if val_seq is not None and "sequence" in val_seq.columns:
    # Computed on the fly so the validation frame is not modified.
    length_panels.append(("Validation", val_seq["sequence"].str.len()))

# squeeze=False keeps `axes` two-dimensional even with a single panel, so the
# same loop handles both the train-only and the train+validation layouts.
fig, axes = plt.subplots(
    1, len(length_panels), figsize=(8 * len(length_panels), 4), squeeze=False
)
for ax, (split_name, lengths) in zip(axes[0], length_panels):
    # Rows without a sequence have NaN length; leave them out of the histogram.
    ax.hist(lengths.dropna(), bins=50, edgecolor="black", alpha=0.7)
    ax.set_xlabel("Sequence length")
    ax.set_ylabel("Count")
    ax.set_title(f"{split_name} sequence length distribution")
plt.tight_layout()
plt.show()

In [ ]:
# Stoichiometry (multi-chain) distribution
# Guarded like the temporal-cutoff and ligand cells: when the column is absent
# the plot is skipped instead of raising KeyError.
if "stoichiometry" in train_seq.columns:
    # Number of ";" separators plus one; presumably each ";"-separated entry
    # is one chain type (matches the x-axis label below) - TODO confirm format.
    stoi_counts = train_seq["stoichiometry"].str.count(";").add(1)
    # Missing stoichiometry gives NaN and turns the counts into floats; drop
    # those rows and cast back so the bars are labelled "1", "2", ... not "1.0".
    chain_type_hist = stoi_counts.dropna().astype(int).value_counts().sort_index()
    fig, ax = plt.subplots(1, 1, figsize=(6, 4))
    chain_type_hist.plot(kind="bar", ax=ax)
    ax.set_xlabel("Number of chain types in stoichiometry")
    ax.set_ylabel("Count")
    ax.set_title("Stoichiometry complexity")
    plt.tight_layout()
    plt.show()

In [ ]:
# Temporal cutoff distribution
if "temporal_cutoff" in train_seq.columns:
    # errors="coerce" turns unparseable dates into NaT instead of raising.
    train_seq["temporal_cutoff"] = pd.to_datetime(train_seq["temporal_cutoff"], errors="coerce")
    train_seq["year"] = train_seq["temporal_cutoff"].dt.year
    # Any NaT makes the year column float, which would label bars "2021.0".
    # Drop missing years and cast to int for plotting only; the stored
    # train_seq["year"] column is left as computed above.
    year_counts = train_seq["year"].dropna().astype(int).value_counts().sort_index()
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    year_counts.plot(kind="bar", ax=ax)
    ax.set_xlabel("Year")
    ax.set_ylabel("Count")
    ax.set_title("Temporal cutoff (release date) distribution")
    plt.tight_layout()
    plt.show()

In [ ]:
# Ligand presence
if "ligand_ids" in train_seq.columns:
    ligand_col = train_seq["ligand_ids"]
    # A column with no ligand entries at all is read by pandas as float64, and
    # the .str accessor raises AttributeError on it. Casting to str first keeps
    # .str usable; notna() still excludes the missing rows.
    has_ligand = ligand_col.notna() & ligand_col.astype(str).str.len().gt(0)
    print("Targets with ligands:", has_ligand.sum())
    print("Targets without ligands:", (~has_ligand).sum())

In [ ]:
# MSA depth (only computed when the MSA directory is present)
def _count_fasta_headers(fasta_path):
    """Return the number of lines in *fasta_path* that start with ">"."""
    with fasta_path.open() as handle:
        return sum(1 for record_line in handle if record_line.startswith(">"))


MSA_DIR = DATA_DIR / "MSA"
if MSA_DIR.exists():
    msa_files = list(MSA_DIR.glob("*.MSA.fasta"))
    # Only the first 500 files are read, so this is a sample.
    depths = [_count_fasta_headers(msa_path) for msa_path in msa_files[:500]]
    if depths:
        fig, ax = plt.subplots(1, 1, figsize=(8, 4))
        ax.hist(depths, bins=40, edgecolor="black", alpha=0.7)
        ax.set_title("MSA depth distribution (sample)")
        ax.set_xlabel("MSA depth (number of sequences)")
        ax.set_ylabel("Count")
        plt.tight_layout()
        plt.show()