# Local Shuffle Pipeline
This Jupyter Notebook implements a pipeline for shuffling exonic regions in transcript sequences while preserving intronic boundaries.

It is designed to evaluate the sensitivity of splicing models like SpliceAI to sequence-level perturbations. The notebook includes:
- Reading junction metadata
- Randomizing exon content, either base by base or codon by codon
- Writing out modified sequences
- Padding each modified sequence with 5,000 `N` bases on both sides

## Reproducibility
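Ensure that the input files referenced in the code cells exist (the junction table `canonical_dataset_created.txt` and the sequence file `canonical_sequence_unflanked_unshuffled.txt`) and that the output directory is writable. The paths are hard-coded in the cells, so edit them to match your own directory structure.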

The code cells shown here import only `random` from the Python standard library. If you also want NumPy and pandas available for follow-up analysis, install them with:
```bash
pip install numpy pandas
```


In [None]:
import random

def permute_exons(sequence, exon_starts, exon_ends, seq_start):
    """
    Return `sequence` with the bases of every exon randomly reordered.

    Each (start, end) pair taken from `exon_starts` / `exon_ends` is an
    absolute, half-open interval; subtracting `seq_start` turns it into
    indices of `sequence`. Bases outside the exon intervals are never moved.
    An interval that is empty, inverted, or not fully inside the sequence is
    reported with a message and left as it is.
    """
    bases = list(sequence)
    total = len(bases)  # constant: every exon is replaced by an equal-length list
    for exon_start, exon_end in zip(exon_starts, exon_ends):
        lo = exon_start - seq_start
        hi = exon_end - seq_start
        if 0 <= lo < hi <= total:
            window = bases[lo:hi]
            random.shuffle(window)
            bases[lo:hi] = window
        else:
            print(f"Skipping invalid exon range: {exon_start}-{exon_end}")
    return ''.join(bases)

def parse_junctions_by_coordinates(file_path):
    """
    Read a tab-separated annotation file into a dict keyed by coordinates.

    Columns used (0-based): 2 = chromosome, 3 = strand, 4 = transcript start,
    5 = transcript end, 6 = comma-separated exon starts, 7 = comma-separated
    exon ends. Columns 6 and 7 are optional and default to empty lists.

    Lines with fewer than six columns, or with a non-integer coordinate, are
    skipped. Empty items in the comma-separated lists (for example the one
    left by a trailing comma, "10,20,") are ignored instead of causing the
    whole line to be dropped.

    Returns a dict mapping "chrom:tx_start-tx_end" to a dict with the keys
    "chrom", "strand", "exon_starts", "exon_ends" and "seq_start" (the
    transcript start). When several lines produce the same key, the last
    one wins.
    """
    def to_int_list(field):
        # Skip blank items so "10,20," parses as [10, 20]; any other
        # non-integer item still raises ValueError and rejects the line.
        return [int(item) for item in field.split(",") if item.strip()]

    junctions = {}
    with open(file_path, "r") as f:
        for line in f:
            # Remove only the line ending: a full strip() would also eat a
            # leading tab and shift every column when column 0 is empty.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 6:
                continue
            chrom = fields[2]
            strand = fields[3]
            try:
                tx_start = int(fields[4])
                tx_end = int(fields[5])
                exon_starts = to_int_list(fields[6]) if len(fields) > 6 else []
                exon_ends = to_int_list(fields[7]) if len(fields) > 7 else []
            except ValueError:
                continue
            coord_key = f"{chrom}:{tx_start}-{tx_end}"
            junctions[coord_key] = {
                "chrom": chrom,
                "strand": strand,
                "exon_starts": exon_starts,
                "exon_ends": exon_ends,
                "seq_start": tx_start
            }
    return junctions

import random

def permute_exons(sequence, exon_starts, exon_ends, seq_start):
    """
    Shuffle the bases inside each exon of `sequence` and leave the rest alone.

    `exon_starts` and `exon_ends` are paired up and read as absolute
    half-open intervals; `seq_start` is subtracted to obtain string indices.
    Intervals that are empty, inverted, or reach outside the sequence are
    reported and skipped. Returns the modified sequence as a new string.
    """
    chars = list(sequence)
    for first, last in zip(exon_starts, exon_ends):
        span = slice(first - seq_start, last - seq_start)
        out_of_bounds = span.start < 0 or span.stop > len(chars)
        if out_of_bounds or span.start >= span.stop:
            print(f"Skipping invalid exon range: {first}-{last}")
            continue

        # Slicing already yields an independent list, so shuffling it does
        # not touch `chars` until it is written back.
        scrambled = chars[span]
        random.shuffle(scrambled)
        chars[span] = scrambled
    return ''.join(chars)

# Example record used to try out permute_exons. The header uses the same
# "chrom:start-end" form that the flanking cells of this notebook parse.
header = "chr1:934344-935477"
# NOTE(review): presumably the sequence for the interval named in `header` — confirm.
sequence = "GCGGGCCTGGAGCCGGGATCCGCCCTAGGGGCTCGGATCGCCGCGCGCTCGCCGCTCGCCCGCCAGCCCGCCCGTGGTCCGTGGCGGCGCGCTCCACCCGGCACGGGGAGGCGCGGGGCGCACCATGGCCGCAGACACGCCGGGGAAACCGAGCGCCTCGCCGATGGCAGGAGCGCCGGCCAGCGCCAGCCGGACCCCAGACAAGCCCCGGAGCGCGGCCGAGCACCGCAAGGTGGGGTCCCGGCCGGGCGTGAGGGGGGCGACCGGGGGGCGGGAGGGACGCGGGACTCAGCCGGTGCCCGACCCGCAGTCCTCCAAGCCGGTCATGGAGAAGCGGCGCCGAGCGCGTATTAACGAGAGCCTCGCTCAGCTCAAAACCCTCATCCTGGACGCCCTCAGAAAAGAGGTAAGTCGGGGGCGAAGGCCCGAGACCCGGAGTCTGGGTCGCAGCTGACCTGGACCTCCCGCCTATCCCCGCCCCCAGAGCTCCCGCCACTCGAAGCTGGAGAAGGCGGACATCCTGGAGATGACCGTGAGACACCTGCGGAGCCTGCGTCGCGTGCAGGTGACGGGTGAGGCGCGGGCGGCGGCGGCTTGGAGGCGGGGGGAGGGCGCGGGACCCCCGGGACCCGGCACCGACCTCTCCTCCTGTGTCGCTCCCGCAGCCGCGCTCAGCGCCGACCCCGCCGTTCTGGGCAAGTACCGCGCCGGCTTCCACGAGTGTCTGGCGGAGGTGAACCGCTTCCTGGCCGGCTGCGAGGGCGTCCCGGCCGACGTGCGCTCCCGCCTGCTGGGCCAC"

# Three exon intervals in absolute coordinates; permute_exons subtracts
# seq_start from them to obtain indices into `sequence`.
exon_starts = [934812, 934993, 935167]
exon_ends   = [934906, 935072, 935246]
seq_start = 934344

shuffled = permute_exons(sequence, exon_starts, exon_ends, seq_start)

# String indices of the first exon, derived from the lists above so they
# cannot drift from the coordinates that were actually shuffled.
rel1_start = exon_starts[0] - seq_start
rel1_end   = exon_ends[0] - seq_start

# Uncomment to compare the first ten bases of exon 1 before and after shuffling.
# print("Original Exon 1:", sequence[rel1_start:rel1_start+10])
# print("Shuffled Exon 1:", shuffled[rel1_start:rel1_start+10])

In [None]:
junction_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_dataset_created.txt"
canonical_sequences_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_sequence_unflanked_unshuffled.txt"
# Written next to the inputs: the flanking cell that follows reads this file
# from the same directory, so a path relative to the working directory only
# worked when the notebook happened to be run from there.
output_file = "/mnt/lareaulab/sdahiyat/illumina/permuted_exon_sequences.txt"


# NOTE(review): no definition of permute_canonical_exons is visible in this
# notebook, so the call below raised NameError on a fresh kernel. This one
# mirrors permute_canonical_exons_by_codon but shuffles base by base with
# permute_exons. Drop it if the real definition lives elsewhere.
def permute_canonical_exons(canonical_sequences_file, output_file, junctions):
    """
    Shuffle the exons of every "header<TAB>sequence" record in a file.

    Each non-blank input line must hold exactly two tab-separated fields;
    other lines are reported and skipped. When the header is a key of
    `junctions`, the sequence is passed through permute_exons with that
    entry's "exon_starts", "exon_ends" and "seq_start". Otherwise a message
    is printed and the sequence is written unchanged. Every accepted record
    is written to `output_file` in the same two-column form.
    """
    with open(canonical_sequences_file, "r") as infile, open(output_file, "w") as outfile:
        for line in infile:
            record = line.strip()
            if not record:
                continue
            try:
                header, sequence = record.split("\t")
            except ValueError:
                print(f"Skipping malformed line: {record}")
                continue

            transcript_key = header.strip()
            exon_info = junctions.get(transcript_key)
            if exon_info is None:
                print(f"Junctions not found for: {transcript_key}")
                outfile.write(f"{header}\t{sequence}\n")
                continue

            shuffled = permute_exons(
                sequence,
                exon_info["exon_starts"],
                exon_info["exon_ends"],
                exon_info["seq_start"],
            )
            outfile.write(f"{header}\t{shuffled}\n")


junctions = parse_junctions_by_coordinates(junction_file)
permute_canonical_exons(canonical_sequences_file, output_file, junctions)

In [None]:
input_file = "/mnt/lareaulab/sdahiyat/illumina/permuted_exon_sequences.txt"
output_file = "/mnt/lareaulab/sdahiyat/illumina/local_permuted_exons_flanked.txt"

# Number of N bases added to each side of every sequence. The header
# coordinates below are widened from this same value, so changing it keeps
# headers and sequences in step.
flank_length = 5000
flanking_seq = "N" * flank_length

with open(input_file, "r") as infile, open(output_file, "w") as outfile:
    for line in infile:
        line = line.strip()
        if not line:
            continue

        try:
            header, sequence = line.split("\t")
        except ValueError:
            print(f"Skipping malformed line: {line}")
            continue

        # Expect "chrom:start-end" with integer positions. Anything else
        # (missing or repeated separators, non-numeric positions) is reported
        # and skipped instead of aborting the run.
        try:
            chrom, positions = header.split(":")
            start, end = map(int, positions.split("-"))
        except ValueError:
            print(f"Skipping improperly formatted header: {header}")
            continue

        # NOTE(review): the start moves by flank_length + 1 while the end
        # moves by exactly flank_length, matching the previous 5001 / 5000
        # constants — presumably a coordinate-convention adjustment; confirm.
        new_start = start - flank_length - 1
        new_end = end + flank_length

        new_header = f"{chrom}:{new_start}-{new_end}"

        flanked_sequence = flanking_seq + sequence + flanking_seq

        outfile.write(f"{new_header}\t{flanked_sequence}\n")

print(f"Flanked sequences written to {output_file}.")

In [None]:
import random

def parse_junctions_by_coordinates(file_path):
    """
    Build a lookup of exon coordinates from a tab-separated annotation file.

    Columns used (0-based): 2 = chromosome, 3 = strand, 4 = transcript start,
    5 = transcript end, 6 = comma-separated exon starts, 7 = comma-separated
    exon ends. Columns 6 and 7 may be absent, in which case the matching
    list is empty.

    A line is skipped when it has fewer than six columns or when any
    coordinate is not an integer. Blank items inside the comma-separated
    lists (such as the one after a trailing comma in "10,20,") are ignored
    so that they no longer cause the whole line to be discarded.

    Returns a dict whose keys look like "chrom:tx_start-tx_end" and whose
    values are dicts with the keys "chrom", "strand", "exon_starts",
    "exon_ends" and "seq_start" (the transcript start). A later line with
    the same key replaces an earlier one.
    """
    junctions = {}
    with open(file_path, "r") as f:
        for line in f:
            # Strip only the newline; strip() would also remove a leading
            # tab and misalign the columns when the first column is empty.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 6:
                continue
            chrom = fields[2]
            strand = fields[3]
            # Optional list columns; a missing column behaves like an empty one.
            starts_field = fields[6] if len(fields) > 6 else ""
            ends_field = fields[7] if len(fields) > 7 else ""
            try:
                tx_start = int(fields[4])
                tx_end = int(fields[5])
                exon_starts = [int(tok) for tok in starts_field.split(",") if tok.strip()]
                exon_ends = [int(tok) for tok in ends_field.split(",") if tok.strip()]
            except ValueError:
                continue
            coord_key = f"{chrom}:{tx_start}-{tx_end}"
            junctions[coord_key] = {
                "chrom": chrom,
                "strand": strand,
                "exon_starts": exon_starts,
                "exon_ends": exon_ends,
                "seq_start": tx_start
            }
    return junctions

def permute_exons_by_codon(sequence, exon_starts, exon_ends, seq_start):
    """
    Shuffle whole codons inside each exon, keeping one frame across exons.

    The reading frame is counted from the first base of the first valid
    exon and continues through the exons in the order given, ignoring
    everything between them. Within an exon, the leading bases that finish
    a codon begun in an earlier exon and the trailing bases of a codon that
    is completed later are left in place; only the complete codons in
    between are reordered.

    `exon_starts` / `exon_ends` are paired absolute half-open intervals, and
    `seq_start` is subtracted to obtain indices of `sequence`. Intervals
    that are empty, inverted, or not fully inside the sequence are reported,
    skipped, and do not advance the frame.

    Returns a new string of the same length as `sequence`.

    NOTE(review): strand is not an input, so the frame is always counted
    left to right — confirm this is intended for minus-strand transcripts.
    """
    sequence = list(sequence)
    bases_before = 0  # exonic bases consumed by earlier valid exons

    for abs_start, abs_end in zip(exon_starts, exon_ends):
        start = abs_start - seq_start
        end = abs_end - seq_start

        if start < 0 or end > len(sequence) or start >= end:
            print(f"Skipping invalid exon range: {abs_start}-{abs_end}")
            continue

        exon_len = end - start

        # (-n) % 3 is how many bases are still missing from the codon that
        # earlier exons left open (n % 3 is how many it already has). The
        # head is capped at the exon length so a very short exon can never
        # pull in bases from beyond its own end.
        head_len = min((-bases_before) % 3, exon_len)
        first_codon = start + head_len
        codon_count = (end - first_codon) // 3
        last_codon_end = first_codon + 3 * codon_count

        codons = [sequence[i:i + 3] for i in range(first_codon, last_codon_end, 3)]
        random.shuffle(codons)

        # Write back exactly the span that was read, so the sequence length
        # and every base outside the complete codons stay untouched.
        sequence[first_codon:last_codon_end] = [base for codon in codons for base in codon]

        bases_before += exon_len

    return ''.join(sequence)

def permute_canonical_exons_by_codon(canonical_sequences_file, output_file, junctions):
    """
    Codon-shuffle every "header<TAB>sequence" record of a file.

    Blank lines are ignored. A line that does not split into exactly two
    tab-separated fields is reported and skipped. When the stripped header
    is a key of `junctions`, the sequence is passed to
    permute_exons_by_codon with that entry's "exon_starts", "exon_ends" and
    "seq_start"; otherwise a message is printed and the sequence is written
    unchanged. Output records use the same two-column layout as the input.
    """
    with open(canonical_sequences_file, "r") as infile, open(output_file, "w") as outfile:
        for raw in infile:
            record = raw.strip()
            if not record:
                continue

            columns = record.split("\t")
            if len(columns) != 2:
                print(f"Skipping malformed line: {record}")
                continue
            header, sequence = columns

            transcript_key = header.strip()
            if transcript_key not in junctions:
                # No annotation for this record: pass it through untouched.
                print(f"Junctions not found for: {transcript_key}")
                outfile.write(f"{header}\t{sequence}\n")
                continue

            exon_info = junctions[transcript_key]
            shuffled = permute_exons_by_codon(
                sequence,
                exon_info["exon_starts"],
                exon_info["exon_ends"],
                exon_info["seq_start"],
            )
            outfile.write(f"{header}\t{shuffled}\n")

In [None]:
# Inputs: the junction table and the unshuffled sequences
# (one "header<TAB>sequence" record per line).
junction_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_dataset_created.txt"
canonical_sequences_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_sequence_unflanked_unshuffled.txt"
# Output of the codon-level shuffle; the following cell reads this same path
# to add the N flanks.
output_file = "/mnt/lareaulab/sdahiyat/illumina/permuted_exons_by_codon.txt"

# Index the junction table by "chrom:start-end", then shuffle every record
# whose header has an entry in that index.
junctions = parse_junctions_by_coordinates(junction_file)
permute_canonical_exons_by_codon(canonical_sequences_file, output_file, junctions)

In [None]:
input_file = "/mnt/lareaulab/sdahiyat/illumina/permuted_exons_by_codon.txt"
output_file = "/mnt/lareaulab/sdahiyat/illumina/local_permuted_codons_flanked.txt"

# Number of N bases added to each side of every sequence. The header
# coordinates below are widened from this same value, so changing it keeps
# headers and sequences in step.
flank_length = 5000
flanking_seq = "N" * flank_length

with open(input_file, "r") as infile, open(output_file, "w") as outfile:
    for line in infile:
        line = line.strip()
        if not line:
            continue

        try:
            header, sequence = line.split("\t")
        except ValueError:
            print(f"Skipping malformed line: {line}")
            continue

        # Extract chromosome, start, and end positions. Expect
        # "chrom:start-end" with integer positions; anything else (missing or
        # repeated separators, non-numeric positions) is reported and skipped
        # instead of aborting the run.
        try:
            chrom, positions = header.split(":")
            start, end = map(int, positions.split("-"))
        except ValueError:
            print(f"Skipping improperly formatted header: {header}")
            continue

        # NOTE(review): the start moves by flank_length + 1 while the end
        # moves by exactly flank_length, matching the previous 5001 / 5000
        # constants — presumably a coordinate-convention adjustment; confirm.
        new_start = start - flank_length - 1
        new_end = end + flank_length

        new_header = f"{chrom}:{new_start}-{new_end}"

        flanked_sequence = flanking_seq + sequence + flanking_seq

        outfile.write(f"{new_header}\t{flanked_sequence}\n")

print(f"Flanked sequences written to {output_file}.")