## Generate binding scores based on a best-configuration spaced k-mer model
### (Figure S6)

In [1]:
import os
import numpy as np
import random
import pandas as pd
from Bio import SeqIO
import itertools
from sklearn.metrics import roc_auc_score, average_precision_score

### helper functions

In [2]:
def get_kmer_probs(sequences, k):
    """Estimate probabilities of gapped 6-mers (3 + 3 flanks) over `sequences`.

    Every k-mer returned by ``get_kmers_sequence(seq, k=k)`` is reduced to its
    first three and last three characters; the resulting 6-character key is
    counted in a table of size ``4**6``.

    Parameters
    ----------
    sequences : iterable
        Sequences handed one by one to ``get_kmers_sequence``.
    k : int
        Length of the k-mers to extract. For ``k < 6`` the two 3-character
        flanks overlap (or are shorter than 3), so ``k >= 6`` is expected.

    Returns
    -------
    numpy.ndarray
        Array of length ``4**6`` with pseudo-count-smoothed, normalised
        frequencies (sums to 1).
    """
    # Only the 6-mer lookup is needed: counts are always keyed by the
    # 3 + 3 flanks, regardless of k.
    # NOTE(review): presumably maps each 6-mer string to an index in
    # [0, 4**6) -- confirm against generate_kmer_inx.
    flank_index = generate_kmer_inx(l=6)

    counts = np.zeros(4**6)

    for seq in sequences:
        for kmer in get_kmers_sequence(seq, k=k):
            flank_key = kmer[:3] + kmer[-3:]
            counts[flank_index[flank_key]] += 1

    # Pseudo-count of 1 keeps unseen keys at non-zero probability, so the
    # logarithm taken by callers stays finite.
    counts = counts + 1

    # Normalise counts to a probability distribution.
    return counts / np.sum(counts)

In [3]:
def get_kmer_ll(ps_seqs, bg_seqs, k):
    """Return the element-wise log-ratio of k-mer probabilities.

    Probabilities are estimated with ``get_kmer_probs`` separately for
    `ps_seqs` and `bg_seqs`; the result is ``log(p_ps) - log(p_bg)`` for
    every entry of the probability table.
    """
    log_ps = np.log(get_kmer_probs(ps_seqs, k=k))
    log_bg = np.log(get_kmer_probs(bg_seqs, k=k))
    log_ratio = log_ps - log_bg
    return log_ratio
    

In [4]:
def get_best_ll(sequence, ll, kmer_inx6, k):
    """Return the highest score in `ll` over all k-mers of `sequence`.

    Each k-mer from ``get_kmers_sequence(sequence, k)`` is reduced to its
    first three plus last three characters, looked up in `kmer_inx6` to get
    a position in `ll`, and the maximum of those values is returned.
    """
    best = max(
        ll[kmer_inx6[window[:3] + window[-3:]]]
        for window in get_kmers_sequence(sequence, k)
    )
    return best

In [5]:
def rank_sequences_by_ll(sequences, ll, k):
    """Score every sequence with its best value from `ll`.

    Returns a list, in the order of `sequences`, holding the result of
    ``get_best_ll`` for each sequence.
    """
    # The 6-mer lookup is built once and shared across all sequences.
    lookup = generate_kmer_inx(l=6)

    scores = []
    for seq in sequences:
        scores.append(get_best_ll(seq, ll, lookup, k))
    return scores

In [None]:
# The factor table and the FASTA directory do not depend on k, so they are
# set up once instead of on every iteration of the k loop.
file_table = pd.read_csv('../../../cbscratch/selex_taipale/ps_and_bg_file_table_first_and_last.txt', sep='\t', header=None)
fasta_path = '../../../cbscratch/selex_taipale/fasta_split_first_and_last/'

for k in range(6, 12):
    # One output directory per k-mer length.
    pred_dir = f'param/selex_nodist0_1000/selex_spaced_{k}mers'
    os.makedirs(pred_dir, exist_ok=True)

    # The first column of the table holds the factor name used in file names.
    for factor in file_table.iloc[:, 0]:
        # Load the four FASTA files of this factor, keyed by their suffix.
        seqs = {}
        for suffix in ('train_ps', 'train_bg', 'test_ps', 'test_bg'):
            fasta_file = os.path.join(fasta_path, f'{factor}_{suffix}.fasta')
            seqs[suffix] = parse_fasta(fasta_file)

        # Fit the log-ratio table on the training split only.
        ll = get_kmer_ll(seqs['train_ps'], seqs['train_bg'], k=k)

        # Score the held-out sequences.
        ps_ll = rank_sequences_by_ll(seqs['test_ps'], ll, k=k)
        bg_ll = rank_sequences_by_ll(seqs['test_bg'], ll, k=k)

        # Scores are written with all ps sequences first, then all bg ones.
        y_scores = np.hstack([ps_ll, bg_ll])

        output_prefix = os.path.join(pred_dir, factor)
        np.savetxt(fname=f'{output_prefix}.predictions', X=y_scores)