Prepare data from the following publication:

Slutskin, Ilya Vainberg, Adina Weinberger, and Eran Segal. "Sequence determinants of polyadenylation-mediated regulation." Genome research 29.10 (2019): 1635-1647.

# Make FASTA

Make a FASTA file from the sequences in Supplementary Table 9.

In [1]:
import numpy as np
import pandas as pd
from textwrap import wrap

In [2]:
# Root directory holding the supplementary tables and the generated FASTA file.
# It must end with '/' because paths below are built by plain string concatenation.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/slutskin_2019/'

In [3]:
# Supplemental Table 9: tab-separated, first line skipped, `Fold` kept as text
# so that numeric fold labels and 'Test' share a single string dtype.
supt9 = pd.read_csv(
    data_dir + 'supl/Supplemental_Table_9.tab',
    sep='\t',
    skiprows=1,
    dtype={'Fold': str},
)

In [4]:
# Destination FASTA file; the sequences written to it are reverse-complemented.
output_fasta = data_dir + 'fasta_reversecompl.fa'

In [5]:
def reverse_complement(seq):
    '''
    Return the reverse complement of a DNA sequence.

    A/C/G/T are complemented in both upper and lower case, and the case of
    each base is preserved (so soft-masked input stays soft-masked).
    Any other symbol (e.g. 'N') is copied through unchanged.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.

    Returns
    -------
    str
        Complemented sequence in reversed order; '' for empty input.
    '''
    compl_dict = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
        # lowercase bases were previously passed through uncomplemented
        'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
    }
    complemented = [compl_dict.get(base, base) for base in seq]
    return ''.join(reversed(complemented))

In [6]:
# Dump every (ID, Sequence) pair of Supplemental Table 9 into the FASTA file,
# storing the reverse complement of each sequence.
with open(output_fasta, 'w') as fasta_handle:
    for record_id, sequence in supt9[['ID','Sequence']].values:
        fasta_handle.write(f'>id_{record_id}:Homo_sapiens\n')
        # standard FASTA layout: at most 80 characters per sequence line
        chunks = wrap(reverse_complement(sequence), 80)
        fasta_handle.writelines(chunk + '\n' for chunk in chunks)

# Remember to index the resulting file afterwards with `samtools faidx`!

In [7]:
# Supplemental Tables 2 and 4 are read with the same settings as Table 9.
supt2, supt4 = [
    pd.read_csv(data_dir + table_file, sep='\t', skiprows=1, dtype={'Fold': str})
    for table_file in (
        'supl/Supplemental_Table_2.tab',
        'supl/Supplemental_Table_4.tab',
    )
]

In [16]:
# `Pos` distribution in Table 4, restricted to IDs whose Table 2 `Source` is not 'K562'.
supt4.loc[
    supt4.ID.isin(supt2.loc[supt2.Source != 'K562', 'ID']),
    'Pos',
].value_counts()

Pos
WT    572
Name: count, dtype: int64

In [6]:
# Show the Table 2 row(s) for a single construct ID.
supt2.loc[supt2.ID == 36277]

Unnamed: 0,ID,Source,Name,Expression
4486,36277,K562,RID6668,-8.169925


In [8]:
# Table 9 rows whose ID is listed in Table 2 with `Source` equal to 'K562'.
df = supt9.loc[supt9.ID.isin(supt2.loc[supt2.Source == 'K562', 'ID'])]

In [None]:
# NOTE(review): this unexecuted cell defines the same two names as the cell
# that follows it; one of the two copies can be removed.
wild_type_id = supt4[supt4.Pos=='WT'].ID
wild_type_seq = supt9[supt9.ID.isin(wild_type_id)][['ID','Fold','Sequence']]

In [16]:
# IDs marked 'WT' in the `Pos` column of Table 4 ...
wild_type_id = supt4.loc[supt4.Pos == 'WT', 'ID']
# ... and the matching Table 9 rows, reduced to the columns used downstream.
wild_type_seq = supt9.loc[supt9.ID.isin(wild_type_id), ['ID', 'Fold', 'Sequence']]

In [63]:
# For every Table 9 sequence in the listed folds, store in `supt9['group']` the
# smallest `is_wt` score obtained against any wild-type sequence
# (10000 if there are no wild-type sequences at all).
#
# NOTE: `is_wt` must already be defined when this cell runs; in the current
# cell order its definition appears further down the notebook.

# The wild-type lookup does not depend on the fold, so build it once.
# Selecting the column explicitly (instead of `.squeeze()`) keeps it a Series
# even when there is only a single wild-type row.
wild_type_fold = wild_type_seq.set_index('ID')['Sequence']

for fold in ('0','1','2','3','4','5','6','7','8','9','10','Test'):
    fold_seqs = supt9.loc[supt9.Fold == fold, 'Sequence']
    for idx, seq in fold_seqs.items():
        min_diff = 10000  # sentinel: larger than any score compared below
        for wt_seq in wild_type_fold:
            min_diff = min(is_wt(seq, wt_seq), min_diff)
        supt9.at[idx, 'group'] = min_diff

In [59]:
def is_wt(seq, wt_seq):
    '''
    Score how `seq` differs from `wt_seq`, compared position by position.

    Positions are paired with `zip`, so the comparison stops at the end of
    the shorter sequence.

    Returns
    -------
    int
        0 if all compared positions match; the number of mismatching
        positions if they form one uninterrupted run; 1000 as soon as a
        second, separate run of mismatches is encountered.
    '''
    mismatches = 0
    run_closed = False  # True once a match follows at least one mismatch

    for base, wt_base in zip(seq, wt_seq):
        if base == wt_base:
            if mismatches > 0:
                run_closed = True
            continue
        if run_closed:
            # a mismatch after the first run has ended: more than one run
            return 1000
        mismatches += 1

    return mismatches

In [52]:
# Ad-hoc call of is_wt on whatever `seq` and `wt_seq` are currently bound to.
# NOTE(review): presumably leftovers from the loop cell — the result depends on
# execution order; confirm or replace with explicit inputs.
is_wt(seq,wt_seq)

129


False

In [65]:
# Number of Table 9 rows whose `group` value equals 20.
supt9['group'].eq(20).sum()

0