## A very important aspect of this project is making sure all the datasets are aligned with the hypotheses we are testing. I coded this test to ensure that the introns remained unshuffled and the exons were shuffled as expected. I always crosscheck the outputted intronic and exonic sequences with UCSC Genome Browswer. Paste the 'chr x: txstart-txend' header into the browser, and it will output the canonical sequence : https://genome.ucsc.edu/cgi-bin/hgc?hgsid=2527996562_ZzJQQA4gTz8Jemae9meIu3kk5aZ8&o=895963&g=getDna&i=mixed&c=chr1&l=895963&r=901099&db=hg19

## Immediately below is the sanity check for unshuffled sequences from which all other shuffled sequences are created.

In [6]:
shuffled_codons_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_sequence_unflanked_unshuffled.txt"
canonical_dataset_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_dataset_created.txt"

# Define the target transcript, this is just one random example in my dataset
target_transcript = "chr1:901862-911245"

# Step 1: Read the shuffled codons file to extract the sequence
sequences = {}
with open(shuffled_codons_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  
        try:
            header, sequence = line.split("\t")
            sequences[header] = sequence
        except ValueError:
            print(f"Skipping malformed line: {line}")

# Step 2: Parse junctions from the canonical dataset
junctions = {}
with open(canonical_dataset_file, "r") as f:
    for line in f:
        fields = line.strip().split("\t")
        if len(fields) < 8: 
            continue

        chrom = fields[2]  # Chromosome
        start = int(fields[4])  # Start position
        end = int(fields[5])    # End position
        exon_starts = list(map(int, fields[6].split(","))) if fields[6] else []
        exon_ends = list(map(int, fields[7].split(","))) if fields[7] else []

        transcript_key = f"{chrom}:{start}-{end}"
        junctions[transcript_key] = (exon_starts, exon_ends)

# Step 3: Extract exons and introns using the junctions
if target_transcript in sequences and target_transcript in junctions:
    full_sequence = sequences[target_transcript]  # gets shuffled sequence
    exon_starts, exon_ends = junctions[target_transcript]  #gets junctions
    seq_start = int(target_transcript.split(":")[1].split("-")[0])  
    exon_starts = [s - seq_start for s in exon_starts]
    exon_ends = [e - seq_start for e in exon_ends]

    print(f"Debugging Adjusted Junctions for {target_transcript}")
    print(f"Adjusted Exon Starts: {exon_starts}")
    print(f"Adjusted Exon Ends: {exon_ends}")
    print(f"Sequence Length: {len(full_sequence)}")

    exons = []
    introns = []
    last_pos = 0  

    for exon_start, exon_end in zip(exon_starts, exon_ends):
        
        if 0 <= exon_start < len(full_sequence) and 0 <= exon_end <= len(full_sequence):
            introns.append(full_sequence[last_pos:exon_start]) 
            exons.append(full_sequence[exon_start:exon_end]) 
        else:
            print(f" Warning: Adjusted Exon range ({exon_start}-{exon_end}) out of bounds.")
        
        last_pos = exon_end

    introns.append(full_sequence[last_pos:])  

    import pandas as pd
    exon_intron_data = pd.DataFrame({
        "Type": ["Exon"] * len(exons) + ["Intron"] * len(introns),
        "Sequence": exons + introns
    })


    print(f"Exon and Intron Sequences {exon_intron_data}")

else:
    print(f"No sequence data found for {target_transcript}.")


Debugging Adjusted Junctions for chr1:901862-911245
Adjusted Exon Starts: [132, 321, 3941, 4119, 4276, 4524, 4726, 4922, 5668, 5942, 6528, 6844, 7158, 7569, 7882]
Adjusted Exon Ends: [222, 3795, 4039, 4204, 4397, 4631, 4842, 5593, 5806, 6379, 6704, 7018, 7351, 7834, 7960]
Sequence Length: 9384
Exon and Intron Sequences       Type                                           Sequence
0     Exon  GGTGAGCGGGGCGTGGGTGCGGCCACCTGGGCGCAGGGCTCCCCCA...
1     Exon  GGTGAGCGCGGCGTGCACGGTGGCTGTGGTCTGGGAGCGTGGCTCT...
2     Exon  GGTGGGGGCCGGGCTGGGTGGAGCACGCTAAGGGTGCAGCATCCCC...
3     Exon  GGTGAGGCGGTGGGCAATGGGGTGGGGCCATGGCCGCCCTTCCCTC...
4     Exon  GGTGTTTGGGATGCTTCCCGGGCCCCCAGAGGCACTCCTGACCCAG...
5     Exon  GGTCAGTGCCGGGGACCCCACCCCCCTCCCCACCCTGATCCTCGCA...
6     Exon  GGTGGGTGGGAGGTGCGTGGGGCTGTAGGGGGATGGGAGGGGTGCA...
7     Exon  GGTGGGCCCCTCCCCACTGTGGGCCCGCCCCAGGGAGGCAGCTGTG...
8     Exon  GGTAGGGCCCTGACCCTGGTTCTGCCTCCCGCCTGGCCAGGCCATG...
9     Exon  GGTGAGGGGTCAATAGGCCCCACAGCCCAGGTCCTGGGCAGTGGTA.

## check for shuffled nucelotide exons

In [4]:
shuffled_codons_file = "/mnt/lareaulab/sdahiyat/illumina/shuffled_nt_new.txt"
canonical_dataset_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_dataset_created.txt"

target_transcript = "chr1:901862-911245"

# Step 1: Read the shuffled codons file to extract the sequence
sequences = {}
with open(shuffled_codons_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  
        try:
            header, sequence = line.split("\t")
            sequences[header] = sequence
        except ValueError:
            print(f"Skipping malformed line: {line}")

# Step 2: Parse junctions from the canonical dataset
junctions = {}
with open(canonical_dataset_file, "r") as f:
    for line in f:
        fields = line.strip().split("\t")
        if len(fields) < 8: 
            continue

        chrom = fields[2]  
        start = int(fields[4]) 
        end = int(fields[5])
        exon_starts = list(map(int, fields[6].split(","))) if fields[6] else []
        exon_ends = list(map(int, fields[7].split(","))) if fields[7] else []

        transcript_key = f"{chrom}:{start}-{end}"
        junctions[transcript_key] = (exon_starts, exon_ends)

#Step 3: Extract exons and introns using the junctions
if target_transcript in sequences and target_transcript in junctions:
    full_sequence = sequences[target_transcript]  
    exon_starts, exon_ends = junctions[target_transcript]  

    #Adjust exon coordinates relative to sequence start
    seq_start = int(target_transcript.split(":")[1].split("-")[0])  
    exon_starts = [s - seq_start for s in exon_starts]
    exon_ends = [e - seq_start for e in exon_ends]

    print(f"de bugging Adjusted Junctions for {target_transcript}")
    print(f"Adjusted Exon Starts: {exon_starts}")
    print(f"Adjusted Exon Ends: {exon_ends}")
    print(f"Sequence Length: {len(full_sequence)}")

    exons = []
    introns = []
    last_pos = 0 

    for exon_start, exon_end in zip(exon_starts, exon_ends):
        if 0 <= exon_start < len(full_sequence) and 0 <= exon_end <= len(full_sequence):
            introns.append(full_sequence[last_pos:exon_start])  
            exons.append(full_sequence[exon_start:exon_end])  
        else:
            print(f"Warning: Adjusted Exon range ({exon_start}-{exon_end}) out of bounds.")
        
        last_pos = exon_end

    introns.append(full_sequence[last_pos:])  


    import pandas as pd
    exon_intron_data = pd.DataFrame({
        "Type": ["Exon"] * len(exons) + ["Intron"] * len(introns),
        "Sequence": exons + introns
    })


    print(f"Exon and Intron Sequences {exon_intron_data}")

else:
    print(f"No sequence data found for {target_transcript}.")

de bugging Adjusted Junctions for chr1:901862-911245
Adjusted Exon Starts: [132, 321, 3941, 4119, 4276, 4524, 4726, 4922, 5668, 5942, 6528, 6844, 7158, 7569, 7882]
Adjusted Exon Ends: [222, 3795, 4039, 4204, 4397, 4631, 4842, 5593, 5806, 6379, 6704, 7018, 7351, 7834, 7960]
Sequence Length: 9384
Exon and Intron Sequences       Type                                           Sequence
0     Exon  TTAGTATGATCACATGGTGTGTAGAAGTTTGTTTTATAATTTTAAA...
1     Exon  CGACGGGCGTGGTTCGTGAAGATCCTAAGTTACCATGGCTACTTTG...
2     Exon  ATAACTGCGTGCAAGATCTTTACTGTATTAGTAGGGCGTACGTGAT...
3     Exon  GCCCTAATATTCTTTAGCTCATTCCGGAATCAATTTAACTGGAGGT...
4     Exon  CAATACTACTGATCTATAGAATAAAAGCTACGTTATTGCTCCGGTA...
5     Exon  TACCGTTGAATTCAAAAAGGTTTTAGACTATTCGAAGGCAGATAGG...
6     Exon  GAATACGATAGGGCAAGGATCCGTTTATCGAAGTGTTCATCATACG...
7     Exon  GTTCGCTTAACGTAGACGAATATAGCTTTATCCCCCTGGAACGAAC...
8     Exon  TGTACGGACTATTAGATTAACTACTAGGAGATAGCGTAGCAATAGT...
9     Exon  TCTATGTATTATTACGCTTTTTAAAAGTGTATCGTGCCGATTCAAG

In [5]:
shuffled_codons_file = "/mnt/lareaulab/sdahiyat/illumina/shuffled_codons_from_codon_freq.txt"
canonical_dataset_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_dataset_created.txt"

# define the target transcript
target_transcript = "chr1:901862-911245"

# Step 1: Read the shuffled codons file to extract the sequence
sequences = {}
with open(shuffled_codons_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # Skip empty lines
        try:
            header, sequence = line.split("\t")
            sequences[header] = sequence
        except ValueError:
            print(f"Skipping malformed line: {line}")

# Step 2: Parse junctions from the canonical dataset
junctions = {}
with open(canonical_dataset_file, "r") as f:
    for line in f:
        fields = line.strip().split("\t")
        if len(fields) < 8:  
            continue

        chrom = fields[2]  # Chromosome
        start = int(fields[4])  # Start position
        end = int(fields[5])    # End position
        exon_starts = list(map(int, fields[6].split(","))) if fields[6] else []
        exon_ends = list(map(int, fields[7].split(","))) if fields[7] else []

        transcript_key = f"{chrom}:{start}-{end}"
        junctions[transcript_key] = (exon_starts, exon_ends)

# Step 3: Extract exons and introns using the junctions
if target_transcript in sequences and target_transcript in junctions:
    full_sequence = sequences[target_transcript] 
    exon_starts, exon_ends = junctions[target_transcript]  

    seq_start = int(target_transcript.split(":")[1].split("-")[0])  
    exon_starts = [s - seq_start for s in exon_starts]
    exon_ends = [e - seq_start for e in exon_ends]

    print(f"Debugging Adjusted Junctions for {target_transcript}")
    print(f"Adjusted Exon Starts: {exon_starts}")
    print(f"Adjusted Exon Ends: {exon_ends}")
    print(f"Sequence Length: {len(full_sequence)}")

    exons = []
    introns = []
    last_pos = 0 

    for exon_start, exon_end in zip(exon_starts, exon_ends):
        if 0 <= exon_start < len(full_sequence) and 0 <= exon_end <= len(full_sequence):
            introns.append(full_sequence[last_pos:exon_start])  
            exons.append(full_sequence[exon_start:exon_end])  
        else:
            print(f"Warning: Adjusted Exon range ({exon_start}-{exon_end}) out of bounds.")
        
        last_pos = exon_end

    introns.append(full_sequence[last_pos:])  

    import pandas as pd
    exon_intron_data = pd.DataFrame({
        "Type": ["Exon"] * len(exons) + ["Intron"] * len(introns),
        "Sequence": exons + introns
    })

    

    print(f"Exon and Intron Sequences {exon_intron_data}")

else:
    print(f"No sequence data found for {target_transcript}.")


Debugging Adjusted Junctions for chr1:901862-911245
Adjusted Exon Starts: [132, 321, 3941, 4119, 4276, 4524, 4726, 4922, 5668, 5942, 6528, 6844, 7158, 7569, 7882]
Adjusted Exon Ends: [222, 3795, 4039, 4204, 4397, 4631, 4842, 5593, 5806, 6379, 6704, 7018, 7351, 7834, 7960]
Sequence Length: 9384
Exon and Intron Sequences       Type                                           Sequence
0     Exon  TCACTAAAGCACCAGACGCAGGTGTCATTGAAATTCCATATCTTGA...
1     Exon  TACCACGATATATCAAAGTTGTTGTCTCAAGTTGATTGGAAGAAAC...
2     Exon  ACCAAGGAAAACACTCACGTGCTCAAAAACGAAAAATGAACAACCG...
3     Exon  CAGCCACAGGACTTTTGTTTGTGTTGGTTCCACAAGCTGATCGCAT...
4     Exon  GTTCACCAGACAAGGTGGCTATTACTGTTTGGTATCAGGATGAGTG...
5     Exon  AGGCCAAGCTCTTGATCAATATGAAGGGAAGGTCTGACCCTTCTGC...
6     Exon  TACTATCCCCTCCCTCTCAGTTGTGTAGTATCGAGTCTTTGATAAA...
7     Exon  TTTATTAGAGGAAGTAAACAACGAACCAGATAAAATTTTGCTAAAA...
8     Exon  CACGGTGCTTGGTGAAGGACAAAATCCCTGTTGCCTGGAATGTTAA...
9     Exon  TAAATGACACTTCCCAGCACATGGACTTGTAGAACGGATCAAGTAA.

In [3]:
shuffled_codons_file = "/mnt/lareaulab/sdahiyat/datasets/srsf1_matches_shuffled.txt"
canonical_dataset_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_dataset_filtered_motifs.txt"

# fefine the target transcript
target_transcript = "chr1:901862-911245"

# Step 1: Read the shuffled codons file to extract the sequence
sequences = {}
with open(shuffled_codons_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # Skip empty lines
        try:
            header, sequence = line.split("\t")
            sequences[header] = sequence
        except ValueError:
            print(f"Skipping malformed line: {line}")

# Step 2: Parse junctions from the canonical dataset
junctions = {}
with open(canonical_dataset_file, "r") as f:
    for line in f:
        fields = line.strip().split("\t")
        if len(fields) < 8:  
            continue

        chrom = fields[2]  # Chromosome
        start = int(fields[4])  # Start position
        end = int(fields[5])    # End position
        exon_starts = list(map(int, fields[6].split(","))) if fields[6] else []
        exon_ends = list(map(int, fields[7].split(","))) if fields[7] else []

        transcript_key = f"{chrom}:{start}-{end}"
        junctions[transcript_key] = (exon_starts, exon_ends)

# Step 3: Extract exons and introns using the junctions
if target_transcript in sequences and target_transcript in junctions:
    full_sequence = sequences[target_transcript] 
    exon_starts, exon_ends = junctions[target_transcript]  

    seq_start = int(target_transcript.split(":")[1].split("-")[0])  
    exon_starts = [s - seq_start for s in exon_starts]
    exon_ends = [e - seq_start for e in exon_ends]

    print(f"Debugging Adjusted Junctions for {target_transcript}")
    print(f"Adjusted Exon Starts: {exon_starts}")
    print(f"Adjusted Exon Ends: {exon_ends}")
    print(f"Sequence Length: {len(full_sequence)}")

    exons = []
    introns = []
    last_pos = 0 

    for exon_start, exon_end in zip(exon_starts, exon_ends):
        if 0 <= exon_start < len(full_sequence) and 0 <= exon_end <= len(full_sequence):
            introns.append(full_sequence[last_pos:exon_start])  
            exons.append(full_sequence[exon_start:exon_end])  
        else:
            print(f"Warning: Adjusted Exon range ({exon_start}-{exon_end}) out of bounds.")
        
        last_pos = exon_end

    introns.append(full_sequence[last_pos:])  

    import pandas as pd
    exon_intron_data = pd.DataFrame({
        "Type": ["Exon"] * len(exons) + ["Intron"] * len(introns),
        "Sequence": exons + introns
    })


    print(f"Exon and Intron Sequences {exon_intron_data}")

else:
    print(f"No sequence data found for {target_transcript}.")


Debugging Adjusted Junctions for chr1:901862-911245
Adjusted Exon Starts: [132, 321, 3941, 4119, 4276, 4524, 4726, 4922, 5668, 5942, 6528, 6844, 7158, 7569, 7882]
Adjusted Exon Ends: [222, 3795, 4039, 4204, 4397, 4631, 4842, 5593, 5806, 6379, 6704, 7018, 7351, 7834, 7960]
Sequence Length: 9384
Exon and Intron Sequences       Type                                           Sequence
0     Exon  GGTGAGCGGGGCGTGGGTGCGGCCACCTGGGCGCAGGGCTCCCCCA...
1     Exon  GGTGAGCGCGGCGTGCACGGTGGCTGTGGTCTGGGAGCGTGGCTCT...
2     Exon  GGTGGGGGCCGGGCTGGGTGGAGCACGCTAAGGGTGCAGCATCCCC...
3     Exon  GGTGAGGCGGTGGGCAATGGGGTGGGGCCATGGCCGCCCTTCCCTC...
4     Exon  GGTGTTTGGGATGCTTCCCGGGCCCCCAGAGGCACTCCTGACCCAG...
5     Exon  GGTCAGTGCCGGGGACCCCACCCCCCTCCCCACCCTGATCCTCGCA...
6     Exon  GGTGGGTGGGAGGTGCGTGGGGCTGTAGGGGGATGGGAGGGGTGCA...
7     Exon  GGTGGGCCCCTCCCCACTGTGGGCCCGCCCCAGGGAGGCAGCTGTG...
8     Exon  GGTAGGGCCCTGACCCTGGTTCTGCCTCCCGCCTGGCCAGGCCATG...
9     Exon  GGTGAGGGGTCAATAGGCCCCACAGCCCAGGTCCTGGGCAGTGGTA.