Siegel, David A., et al. "Massively parallel analysis of human 3′ UTRs reveals that AU-rich element length and registration predict mRNA destabilization." G3 12.1 (2022): jkab404.

In [20]:
import numpy as np
import pandas as pd
from textwrap import wrap

In [2]:
# Root folder holding the Siegel et al. (2022) data. The trailing slash is
# required: later cells build file paths by plain string concatenation.
data_dir = "/s/project/mll/sergey/effect_prediction/MLM/siegel_2022/"

In [3]:
# Cell line to process; set to either 'Beas2B' or 'Jurkat'.
cell_type = "Jurkat"

In [4]:
# Load the sequence-level table of the selected cell type from the
# supplementary-data folder.
input_csv = data_dir + f'supl/sequence_level_data_{cell_type}.csv'
df = pd.read_csv(input_csv)

In [5]:
# Keep only rows that name a parent control oligo; a variant without one has
# no known reference to be paired with.
has_reference = df['parent_control_oligo'].notna()
df = df[has_reference]

In [6]:
# Split the table into control (reference) and non-control (alternative) rows.
# Controls are indexed by their id so they can be looked up by label.
ref_df = df[df.iscontrol == 1].set_index('ids')
alt_df = df[df.iscontrol == 0]

# For every alternative row, in the same order, fetch the control named in its
# parent_control_oligo column; reset_index() moves the id index back into a
# regular column.
ref_df = ref_df.loc[alt_df.parent_control_oligo].reset_index()

# Row i of ref_df must be the reference of row i of alt_df. A duplicated
# control id would make .loc return extra rows and break that pairing, so the
# invariant is checked here rather than surfacing later as an unrelated error.
assert len(ref_df) == len(alt_df), (
    f'expected one reference per alternative, got {len(ref_df)} references '
    f'for {len(alt_df)} alternatives'
)

In [7]:
# Prepare the two tables for interleaving: alternatives take the even index
# values 0, 2, 4, ... and references the odd values 1, 3, 5, ...
alt_df.index = range(0, 2 * len(alt_df), 2)
ref_df.index = range(1, 2 * len(ref_df) + 1, 2)

In [8]:
# Give each reference row the issnp value of the alternative at the same
# position. .values drops the index, so the copy is positional (row i -> row i).
ref_df['issnp'] = alt_df['issnp'].values

In [9]:
# Stack alternatives and references, then order by index so that alternative
# and reference sequences alternate row by row.
paired_rows = pd.concat([alt_df, ref_df])
df = paired_rows.sort_index()

In [10]:
# Save the combined table as <cell_type>.tsv in data_dir, tab-separated.
# index=False is the documented boolean for leaving the row index out of the
# file (None merely happens to be treated as falsy).
df.to_csv(data_dir + cell_type + '.tsv', sep='\t', index=False)

In [21]:
# Destination of the FASTA file written below; the name carries no cell-type
# suffix.
fasta_name = 'fasta.fa'
output_fasta = data_dir + fasta_name

In [22]:
# Write every sequence in df.seq as one FASTA record (sequences are identical
# for both cell types). Records are named by their position in df:
# id_0, id_1, ...
FASTA_LINE_WIDTH = 80  # standard FASTA line width

with open(output_fasta, 'w') as f:
    for seq_id, seq in enumerate(df.seq):
        f.write(f'>id_{seq_id}:Homo_sapiens\n')
        # Cut the sequence into fixed-width lines by slicing. textwrap.wrap is
        # a prose wrapper that also breaks at hyphens and whitespace, which
        # could yield uneven line lengths that samtools faidx cannot index.
        for start in range(0, len(seq), FASTA_LINE_WIDTH):
            f.write(seq[start:start + FASTA_LINE_WIDTH] + '\n')

#don't forget to index with samtools faidx!

In [24]:
# Shell command: delete sequences.tsv from data_dir.
# NOTE(review): no cell visible in this notebook writes sequences.tsv —
# presumably a leftover from an earlier version; confirm it is still needed.
# data_dir already ends with '/', so the resulting path has a double slash.
!rm {data_dir}/sequences.tsv