In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
from Bio import SeqIO

In [2]:
# Load the TE-insertion table and keep only the rows whose 'VI' flag is True.
tei_df = pd.read_csv('/home/alatypova/At_paper_2025/TEI_S1_pseudoref_ins.tsv', sep='\t')
# .copy() detaches the filtered rows from the frame returned by read_csv, so the
# column assignments below write to tei_df itself and not to a temporary.
tei_df = tei_df[tei_df['VI']==True].copy()

# Prefer the corrected position and fall back to 'TEIcoord' where it is missing.
# The result is assigned back explicitly: an in-place fillna on a selected column
# is chained assignment and is not guaranteed to reach the parent frame.
tei_df['Corrected pos'] = tei_df['Corrected pos'].fillna(tei_df['TEIcoord'])

# Positions are parsed as '<chr>:<start>..<end>'; start and end stay strings here.
corrected_positions = tei_df['Corrected pos'].values
coord_ranges = [pos.split(':')[1].split('..') for pos in corrected_positions]
# The first 11 characters are taken as the chromosome id.
# NOTE(review): presumably the length of the sequence id before ':' - confirm.
tei_df['TEIcoord_chr'] = [pos[:11] for pos in corrected_positions]
tei_df['TEIcoord_start'] = [bounds[0] for bounds in coord_ranges]
tei_df['TEIcoord_end'] = [bounds[1] for bounds in coord_ranges]

In [3]:
# Cast the parsed coordinates to integers, then order the insertions by sample,
# chromosome and start position (all ascending).
tei_df = (
    tei_df
    .astype({'TEIcoord_start': int, 'TEIcoord_end': int})
    .sort_values(['sample', 'TEIcoord_chr', 'TEIcoord_start'], ascending=True)
)

In [4]:
# Look up the assembled sequence of every insertion in tei_df.
# Each origin has its own FASTA file; records are keyed '<Corrected pos>-<Zygoticity>'.
# A missing key raises KeyError.
origin_records = {}  # origin -> {record id: SeqRecord}; each FASTA is parsed only once
ins_list = []
len_list = []
for pos, zygo, origin in tei_df[['Corrected pos', 'Zygoticity', 'Origin']].itertuples(index=False):
    if origin not in origin_records:
        # Parsing inside the row loop without this cache re-read the whole file per row.
        origin_records[origin] = SeqIO.to_dict(SeqIO.parse(f"assembled_te/{origin}_insertions.fasta", "fasta"))
    ins = origin_records[origin][pos+'-'+str(zygo)].seq
    ins_list.append(ins)
    len_list.append(len(ins))

In [6]:
# Store each insertion's sequence and its length as new columns of tei_df.
for column_name, column_values in (('Insertion_seq', ins_list), ('Insertion_len', len_list)):
    tei_df[column_name] = column_values

In [None]:
# Build one pseudoreference FASTA per sample: after every TEIcoord range the
# insertion sequence plus a second copy of that range are spliced into the
# reference sequence.
pseudoref_dir = "/home/alatypova/At_paper_2025/pseudoreference"
# Create the output directory up front so SeqIO.write cannot fail on a missing path.
os.makedirs(pseudoref_dir, exist_ok=True)

# Only the record ids of this copy are used below; the name is kept as a notebook-level variable.
fasta = SeqIO.to_dict(SeqIO.parse("/home/alatypova/At_paper_2025/TAIR10_1.fna", "fasta"))

for sample in tei_df['sample'].unique():
    # Records are modified below, so every sample starts from a freshly parsed reference.
    sample_fasta = SeqIO.to_dict(SeqIO.parse("/home/alatypova/At_paper_2025/TAIR10_1.fna", "fasta"))
    # Insertions of this sample, in tei_df row order. The shift bookkeeping below
    # relies on tei_df already being sorted by start within each chromosome.
    positions = tei_df[tei_df['sample']==sample][['TEIcoord_chr', 'TEIcoord_start', 'TEIcoord_end', 'Insertion_seq']].copy()

    for chr_name in fasta:
        record = sample_fasta[chr_name]
        chr_insertions = positions[positions['TEIcoord_chr']==chr_name][['TEIcoord_start', 'TEIcoord_end', 'Insertion_seq']].to_numpy()

        seq = record.seq
        shift = 0  # bases added so far on this chromosome
        for start, end, ins_seq in chr_insertions:
            # Translate the coordinates into the partially edited sequence.
            start, end = int(start) + shift, int(end) + shift
            # Inserted block: the insertion followed by a copy of seq[start:end].
            insertion = ins_seq + seq[start:end]
            seq = seq[:end] + insertion + seq[end:]
            shift += len(insertion)
        record.seq = seq

    SeqIO.write(sample_fasta.values(), f"{pseudoref_dir}/TAIR10_1_{sample}.fasta", "fasta")

In [9]:
# Shift the gene/transposon annotation into each sample's pseudoreference
# coordinates and export the coordinates of the inserted TEs per sample.
pseudoref_dir = "/home/alatypova/At_paper_2025/pseudoreference"
# Create the output directory up front so the writes below cannot fail on a missing path.
os.makedirs(pseudoref_dir, exist_ok=True)

gff = pd.read_csv('/home/alatypova/At_paper_2025/Araport11_GTF_genes_transposons.current.fixed.sorted.gff', sep='\t', skiprows=10, 
            names=['chr', 'source', 'type', 'start', 'end', 'val', 'strand', 'e', 'descr'])
# Drop the '###' lines; .copy() so the dtype assignment below writes to an independent frame.
gff = gff[gff['chr'] != '###'].copy()
gff[['start', 'end']] = gff[['start', 'end']].astype(int)

for sample in tei_df['sample'].unique():
    sample_gff = gff.copy()
    # Insertions of this sample, in tei_df row order. The shift bookkeeping below
    # relies on tei_df already being sorted by start within each chromosome.
    positions = tei_df[tei_df['sample']==sample][['TEIcoord_chr', 'TEIcoord_start', 'TEIcoord_end', 'Insertion_len']].copy()
    novel_tes = positions.copy()
    # 'def' keeps the original '<chr>:<start>..<end>' label of every insertion.
    novel_tes['def'] = novel_tes['TEIcoord_chr'].astype(str) + ':' + novel_tes['TEIcoord_start'].astype(str) + '..' + novel_tes['TEIcoord_end'].astype(str)

    # Iterate over the chromosomes that carry insertions, so every novel_tes row
    # is updated even when its chromosome has no feature in the GFF.
    for chr_name, chr_positions in positions.groupby('TEIcoord_chr', sort=False):
        on_chr = sample_gff['chr'] == chr_name  # constant while coordinates are shifted
        shift = 0  # bases added so far on this chromosome

        for row_label, tsd_start, tsd_end, te_len in chr_positions[['TEIcoord_start', 'TEIcoord_end', 'Insertion_len']].itertuples(index=True, name=None):
            tsd_start, tsd_end, te_len = int(tsd_start), int(tsd_end), int(te_len)
            insertion_point = tsd_end + shift

            # Update this insertion by its row label. Matching rows by coordinate
            # value could hit a row that had already been rewritten, or several
            # rows sharing a coordinate.
            novel_tes.at[row_label, 'TEIcoord_start'] = insertion_point
            novel_tes.at[row_label, 'TEIcoord_end'] = insertion_point + te_len

            # Added length: the insertion plus the length of the TEIcoord range.
            insertion = te_len + tsd_end - tsd_start

            # Move every feature coordinate that lies beyond the insertion point.
            sample_gff.loc[on_chr & sample_gff['end'].gt(insertion_point), 'end'] += insertion
            sample_gff.loc[on_chr & sample_gff['start'].gt(insertion_point), 'start'] += insertion

            shift += insertion

    sample_gff.to_csv(f"{pseudoref_dir}/Araport11_GTF_genes_transposons_{sample}.gff", index=False, header=False, sep='\t')
    novel_tes[['TEIcoord_chr', 'TEIcoord_start', 'TEIcoord_end', 'def']].to_csv(f"{pseudoref_dir}/novel_TEI_{sample}.bed", index=False, header=False, sep='\t')

In [11]:
# Shift the transposon GFF3 annotation into each sample's pseudoreference coordinates.
# Only the shifted GFF is written here; this cell does not export a BED file.
pseudoref_dir = "/home/alatypova/At_paper_2025/pseudoreference"
# Create the output directory up front so the write below cannot fail on a missing path.
os.makedirs(pseudoref_dir, exist_ok=True)

gff = pd.read_csv('/home/alatypova/At_paper_2025/Araport11_GFF3_transposons.current.fixed.gff', sep='\t', skiprows=10, 
            names=['chr', 'source', 'type', 'start', 'end', 'val', 'strand', 'e', 'descr'])
# Drop the '###' lines; .copy() so the dtype assignment below writes to an independent frame.
gff = gff[gff['chr'] != '###'].copy()
gff[['start', 'end']] = gff[['start', 'end']].astype(int)

for sample in tei_df['sample'].unique():
    sample_gff = gff.copy()
    # Insertions of this sample, in tei_df row order. The shift bookkeeping below
    # relies on tei_df already being sorted by start within each chromosome.
    positions = tei_df[tei_df['sample']==sample][['TEIcoord_chr', 'TEIcoord_start', 'TEIcoord_end', 'Insertion_len']].copy()

    for chr_name in gff.chr.unique():
        on_chr = sample_gff['chr'] == chr_name  # constant while coordinates are shifted
        chr_insertions = positions[positions['TEIcoord_chr']==chr_name][['TEIcoord_start', 'TEIcoord_end', 'Insertion_len']].to_numpy()
        shift = 0  # bases added so far on this chromosome

        for tsd_start, tsd_end, te_len in chr_insertions:
            tsd_start, tsd_end, te_len = int(tsd_start), int(tsd_end), int(te_len)
            insertion_point = tsd_end + shift

            # Added length: the insertion plus the length of the TEIcoord range.
            insertion = te_len + tsd_end - tsd_start

            # Move every feature coordinate that lies beyond the insertion point.
            sample_gff.loc[on_chr & sample_gff['end'].gt(insertion_point), 'end'] += insertion
            sample_gff.loc[on_chr & sample_gff['start'].gt(insertion_point), 'start'] += insertion

            shift += insertion

    sample_gff.to_csv(f"{pseudoref_dir}/Araport11_GFF3_transposons_{sample}.gff", index=False, header=False, sep='\t')