In [None]:
import math
from collections import defaultdict

# Position weight matrix for SRSF1 (Matrix ID 126): one row per nucleotide,
# one column per motif position (7 columns).
pwm = {
    'A': [0.00031, 0.00031, 0.49969, 0.49969, 0.00031, 0.00031, 0.99906],
    'C': [0.99906, 0.49969, 0.49969, 0.49969, 0.49969, 0.49969, 0.00031],
    'G': [0.00031, 0.49969, 0.00031, 0.00031, 0.49969, 0.49969, 0.00031],
    'T': [0.00031, 0.00031, 0.00031, 0.00031, 0.00031, 0.00031, 0.00031],
}

motif_length = 7

# Log-odds form of the PWM: log2(p / 0.25) for every cell, i.e. each
# probability is compared against a uniform background frequency of 0.25.
log_odds_pwm = {
    nucleotide: [math.log2(pwm[nucleotide][pos] / 0.25) for pos in range(motif_length)]
    for nucleotide in 'ACGT'
}


def score_sequence(seq):
    """Calculate the PWM log-odds score for one motif-length window.

    The score is the sum over positions of ``log_odds_pwm[base][position]``.

    Args:
        seq: Window to score. It is expected to be exactly ``motif_length``
            characters long and to use the same letter case as the keys of
            ``log_odds_pwm``.

    Returns:
        The summed log-odds score, or ``None`` when the window cannot be
        scored: its length differs from ``motif_length`` or it contains a
        character that is not a key of ``log_odds_pwm`` (e.g. ``N``).
    """
    # A shorter window would silently yield a partial score that is not
    # comparable with full-length scores, and a longer one would index past
    # the end of the matrix, so both are treated as unscorable.
    if len(seq) != motif_length:
        return None

    score = 0
    for i, base in enumerate(seq):
        if base not in log_odds_pwm:
            return None
        score += log_odds_pwm[base][i]
    return score


def scan_sequence(seq, threshold):
    """Slide a ``motif_length`` window over ``seq`` and collect scoring hits.

    Every window start is examined, so overlapping hits are all reported.
    Windows for which ``score_sequence`` returns ``None`` are ignored.

    Args:
        seq: Sequence to scan.
        threshold: Minimum score (inclusive) a window needs to be reported.

    Returns:
        A list of ``(start, end, score, window)`` tuples in order of
        increasing start, where ``window == seq[start:end]``. The list is
        empty when ``seq`` is shorter than ``motif_length``.
    """
    n_windows = len(seq) - motif_length + 1
    hits = []
    for start in range(n_windows):
        end = start + motif_length
        window = seq[start:end]
        window_score = score_sequence(window)
        if window_score is None:
            continue
        if window_score >= threshold:
            hits.append((start, end, window_score, window))
    return hits


def scan_fasta(input_file, output_file, threshold):
    """Scan every sequence in a tab-separated file and write motif hits as TSV.

    Despite the name, the input is read as one record per line in the form
    ``transcript_id<TAB>sequence``, not as FASTA. Blank lines are ignored and
    lines that do not split into exactly two tab-separated fields are reported
    with a message and skipped. Each sequence is upper-cased before being
    handed to ``scan_sequence``.

    Args:
        input_file: Path of the ``transcript_id<TAB>sequence`` file to read.
        output_file: Path of the TSV to write. It is opened in write mode, so
            an existing file is overwritten. One row is written per hit with
            the columns transcript_id, start, end, score (3 decimals), window.
        threshold: Minimum score, passed through to ``scan_sequence``.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for raw_line in infile:
            line = raw_line.strip()
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                print(f"Skipping malformed line: {line}")
                continue
            transcript_id, seq = fields
            for start, end, score, window in scan_sequence(seq.upper(), threshold):
                outfile.write(f"{transcript_id}\t{start}\t{end}\t{score:.3f}\t{window}\n")


#example usage:
# scan_fasta(
#     input_file='/mnt/lareaulab/sdahiyat/illumina/canonical_sequence_unflanked_unshuffled.txt',
#     output_file='/mnt/lareaulab/sdahiyat/datasets/srsf1_motif_hits.tsv',
#     threshold=5.0  #adjust this value depending on the score distribution
# )


In [2]:
import pandas as pd

# Load the motif-hit table. The TSV has no header row, so pandas first assigns
# integer column labels, which are then replaced with descriptive names.
hit_columns = ["transcript_id", "start", "end", "score", "motif"]
df = pd.read_csv(
    "/mnt/lareaulab/sdahiyat/datasets/srsf1_motif_hits.tsv",
    sep="\t",
    header=None,
)
df.columns = hit_columns
df.head()


Unnamed: 0,transcript_id,start,end,score,motif
0,chr1:65419-71585,1717,1724,8.993,CCAACCA
1,chr1:65419-71585,1721,1728,8.993,CCAACCA
2,chr1:65419-71585,1988,1995,8.993,CCAAGCA
3,chr1:65419-71585,2762,2769,8.993,CCCAGGA
4,chr1:65419-71585,5088,5095,8.993,CCCACCA


In [3]:
import math

# Same SRSF1 PWM as in the scanning cell, repeated so this cell runs on its own.
pwm = {
    'A': [0.00031, 0.00031, 0.49969, 0.49969, 0.00031, 0.00031, 0.99906],
    'C': [0.99906, 0.49969, 0.49969, 0.49969, 0.49969, 0.49969, 0.00031],
    'G': [0.00031, 0.49969, 0.00031, 0.00031, 0.49969, 0.49969, 0.00031],
    'T': [0.00031, 0.00031, 0.00031, 0.00031, 0.00031, 0.00031, 0.00031],
}

motif_length = 7
background = 0.25

# The highest attainable score is the sum, over positions, of the largest
# log2(p / background) among the four bases at that position.
best_per_position = [
    max(math.log2(pwm[base][i] / background) for base in 'ACGT')
    for i in range(motif_length)
]

# Accumulate left to right, one position at a time.
max_score = 0
for position_best in best_per_position:
    max_score += position_best

print(f"Maximum possible PWM score: {max_score:.3f}")


Maximum possible PWM score: 8.993


In [4]:
import pandas as pd
# Reload the hit table (the TSV has no header row) and name its columns.
df = pd.read_csv(
    "/mnt/lareaulab/sdahiyat/datasets/srsf1_motif_hits.tsv",
    sep="\t",
    header=None,
)
df.columns = ["transcript_id", "start", "end", "score", "motif"]

# Keep hits scoring at least `threshold` and collect their distinct transcript IDs.
threshold = 5.0
hits_above = df.loc[df["score"] >= threshold]
transcripts_with_hits = set(hits_above["transcript_id"].unique())
print(f"Found {len(transcripts_with_hits)} transcripts with motif hits above threshold.")

input_sequences = "/mnt/lareaulab/sdahiyat/illumina/canonical_sequence_unflanked_unshuffled.txt"
output_sequences = "/mnt/lareaulab/sdahiyat/datasets/motif_high_scoring_transcripts.txt"

# Copy through every `transcript_id<TAB>sequence` record whose ID has a hit.
# Blank lines are ignored; lines without exactly two fields are reported and skipped.
with open(input_sequences, "r") as infile, open(output_sequences, "w") as outfile:
    for raw_line in infile:
        line = raw_line.strip()
        if not line:
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            print(f"Skipping malformed line: {line}")
            continue
        transcript_id, sequence = fields
        if transcript_id in transcripts_with_hits:
            outfile.write(f"{transcript_id}\t{sequence}\n")

print(f"wrote high scoring transcript sequences to: {output_sequences}")


Found 6085 transcripts with motif hits above threshold.
Wrote high-scoring transcript sequences to: /mnt/lareaulab/sdahiyat/datasets/motif_high_scoring_transcripts.txt


In [8]:
import random
import re

# Exact SRSF1 motif strings whose occurrences will be shuffled; append further
# 7-mers here if they become relevant.
srsf1_motifs = ["CCACGGA", "CGACGGA"]

# One compiled alternation that matches any of the listed motifs.
# NOTE(review): matching is case-sensitive, whereas the PWM scan upper-cases
# sequences first -- confirm the input sequences are upper-case.
motif_alternation = '|'.join(srsf1_motifs)
motif_pattern = re.compile(motif_alternation)

def shuffle_string(s):
    """Return the characters of ``s`` in a random order, joined into a string.

    ``s`` may be a ``str`` or any iterable of strings (e.g. a list of
    characters). It is copied before shuffling, so the argument itself is not
    modified. Randomness comes from the module-level ``random`` generator.
    """
    chars = [*s]
    random.shuffle(chars)
    return ''.join(chars)

def shuffle_motif_hits(sequence, motifs_regex, max_attempts=100):
    """Shuffle the characters inside every motif match of ``sequence``.

    Each non-overlapping match of ``motifs_regex`` is replaced, in place, by a
    random permutation of its own characters, so the sequence length and all
    characters outside the matches are unchanged.

    A random permutation can reproduce the original motif (or another string
    the regex accepts), which would leave an intact motif in the output. Each
    segment is therefore re-shuffled until it no longer fully matches
    ``motifs_regex``.

    Args:
        sequence: Sequence string to process.
        motifs_regex: Compiled regular expression matching the motifs.
        max_attempts: Upper bound on shuffles per match. It guarantees
            termination for segments that cannot be disrupted (e.g. all
            characters identical); such a segment is kept as last shuffled.

    Returns:
        The sequence with its motif matches shuffled, or ``None`` when the
        sequence contains no match (callers use this to skip the record).
    """
    matches = list(motifs_regex.finditer(sequence))
    if not matches:
        return None  # No motifs found; None signals that the record is skipped

    chars = list(sequence)
    for match in matches:
        start, end = match.span()
        segment = chars[start:end]
        for _ in range(max_attempts):
            random.shuffle(segment)
            if motifs_regex.fullmatch(''.join(segment)) is None:
                break  # the permutation no longer reads as a motif
        # Same-length replacement, so the spans of the other matches stay valid.
        chars[start:end] = segment
    return ''.join(chars)

input_file = "/mnt/lareaulab/sdahiyat/datasets/motif_high_scoring_transcripts.txt"
output_file = "/mnt/lareaulab/sdahiyat/datasets/srsf1_matches_shuffled.txt"

# Seed the module-level RNG so that re-running this cell regenerates exactly
# the same shuffled sequences; without a seed every run writes a different file.
shuffle_seed = 0
random.seed(shuffle_seed)

num_written = 0

# Input records are `header<TAB>sequence`. Only sequences that contain at least
# one motif match are written, with those matches shuffled.
with open(input_file, "r") as infile, open(output_file, "w") as outfile:
    for line in infile:
        line = line.strip()
        if not line:
            continue
        try:
            header, sequence = line.split("\t")
        except ValueError:
            print(f"Skipping malformed line: {line}")
            continue

        shuffled_seq = shuffle_motif_hits(sequence, motif_pattern)
        if shuffled_seq is None:
            continue  # no motif in this sequence, so nothing to write
        outfile.write(f"{header}\t{shuffled_seq}\n")
        num_written += 1

print(f"Wrote {num_written} sequences containing motifs to: {output_file}")


Done! Wrote 1930 sequences containing motifs to: /mnt/lareaulab/sdahiyat/datasets/srsf1_matches_shuffled.txt


In [6]:
input_file = "/mnt/lareaulab/sdahiyat/datasets/srsf1_matches_shuffled.txt"
output_file = "/mnt/lareaulab/sdahiyat/datasets/srsf1_matches_shuffled_flanked.txt"

# Each sequence is padded on both sides with `flank_length` N characters, and
# the header coordinates are widened by the same amount.
flank_length = 5000
flanking_seq = "N" * flank_length

with open(input_file, "r") as infile, open(output_file, "w") as outfile:
    for line in infile:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        try:
            header, sequence = line.split("\t")
        except ValueError:
            print(f"Skipping malformed line: {line}")
            continue

        # Headers are expected to look like "chrom:start-end". Anything that
        # does not parse into one chromosome and two integers is reported and
        # skipped instead of aborting the run with a half-written output file.
        try:
            chrom, positions = header.split(":")
            start, end = map(int, positions.split("-"))
        except ValueError:
            print(f"Skipping improperly formatted header: {header}")
            continue

        # Offsets are derived from flank_length so the header always stays in
        # step with the padding actually added. The start moves one base more
        # than the end (5001 vs 5000 for the default flank); presumably this
        # converts a 1-based start into a 0-based one -- TODO confirm.
        new_start = start - flank_length - 1
        new_end = end + flank_length
        new_header = f"{chrom}:{new_start}-{new_end}"
        flanked_sequence = flanking_seq + sequence + flanking_seq
        outfile.write(f"{new_header}\t{flanked_sequence}\n")

print(f"Flanked sequences written to {output_file}.")


Flanked sequences written to /mnt/lareaulab/sdahiyat/datasets/srsf1_matches_shuffled_flanked.txt.


## Below is the process of creating a filtered version of the canonical dataset so that it only contains relevant transcripts with motif hits.

In [9]:
# Step 1: collect the transcript keys (first tab-separated field, e.g.
# "chr1:65419-71585") of every non-blank line in the shuffled-motif file.
motif_file = "/mnt/lareaulab/sdahiyat/datasets/srsf1_matches_shuffled.txt"

with open(motif_file, "r") as f:
    motif_keys = {
        stripped.split("\t")[0]
        for stripped in (raw.strip() for raw in f)
        if stripped
    }

# Step 2: Filter canonical_dataset_created.txt down to the rows whose
# "chrom:start-end" key (built from columns 3, 5 and 6) is in motif_keys.
input_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_dataset_created.txt"
output_file = "/mnt/lareaulab/sdahiyat/illumina/canonical_dataset_filtered_motifs.txt"

num_written = 0
with open(input_file, "r") as infile, open(output_file, "w") as outfile:
    for line in infile:
        parts = line.strip().split("\t")
        if len(parts) >= 6:
            chrom = parts[2]
            try:
                start, end = int(parts[4]), int(parts[5])
            except ValueError:
                # Rows with non-integer coordinates are silently skipped.
                continue
            coord_key = f"{chrom}:{start}-{end}"
            if coord_key in motif_keys:
                # Matching rows are copied through unchanged.
                outfile.write(line)
                num_written += 1

print(f"wrote {num_written} matching entries to: {output_file}")


wrote 1930 matching entries to: /mnt/lareaulab/sdahiyat/illumina/canonical_dataset_filtered_motifs.txt
