## Load Python libraries and define functions
- Links to installation instructions (pip or conda) are given next to each library that is not included with conda

In [None]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqRecord import SeqRecord

import pandas as pd
import numpy as np
import os
import random
import pickle
import gzip
import re
import fastapy    # https://github.com/aziele/fastapy

import kaleido    # https://github.com/plotly/Kaleido    # for image export: conda install python-kaleido==0.1.0
import plotly     # https://plotly.com/python/getting-started/
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.templates.default = "none"

In [None]:
bases = ['A', 'T', 'G', 'C']
def reverse_complement(dna):
    """Return the reverse complement of an upper-case DNA sequence.

    Only the symbols A, C, G, T and N are recognised; any other symbol
    (including lower-case bases) raises ``KeyError``.
    """
    pairing = dict(zip('ACGTN', 'TGCAN'))
    flipped = []
    for base in reversed(dna):
        flipped.append(pairing[base])
    return ''.join(flipped)
def repeat_frames_RC(input_seq):
    """List every distinct frame of a repeat unit on both strands.

    Builds all cyclic rotations of ``input_seq`` plus the reverse complement
    of each rotation, and returns them sorted with duplicates removed.
    """
    unit_len = len(input_seq)
    doubled = ''.join(input_seq*2)
    # Sliding a unit-sized window over the doubled unit yields each rotation.
    rotations = [doubled[offset:offset + unit_len] for offset in range(unit_len)]
    other_strand = [reverse_complement(frame) for frame in rotations]
    all_frames = pd.Series(rotations + other_strand)
    return list(all_frames.sort_values().drop_duplicates())

## Generate repeat length distributions

In [None]:
# Default motif set: repeat units of length 1-4 nt.
# Each entry is expanded into its rotations and reverse complements by
# repeat_frames_RC() at counting time.
# NOTE(review): presumably one representative per rotation/reverse-complement class — confirm.
repeats_1_4 = ['A', 'C', 'AC', 'AT', 'AG', 'CG', 'AAT', 'AAG', 'AAC', 'ATC', 'ACT', 'AGG', 'AGC', 'ACG', 'ACC', 'CCG', 'AAAT', 'AAAG', 'AAAC', 'AATG', 'AATC', 'AAGT', 'AAGG', 'AAGC', 'AACT', 'AACG', 'AACC', 'AGAT', 'ACAT', 'ATCC', 'ACAG', 'ACTC', 'ACTG', 'ACCT', 'AGGG', 'AGGC', 'AGCC', 'ACGG', 'ACGC', 'ACCG', 'ACCC', 'AGCG', 'CCCG', 'AATT', 'ATGC', 'ATCG', 'AGCT', 'ACGT', 'CCGG']

In [None]:
def count_all_repeats_per_seq(input_seq, repeat_list = repeats_1_4):
    """Count repeat-run lengths and inter-run gap lengths for every motif.

    The upper-cased sequence is cut into consecutive, non-overlapping words
    of each unit length present in ``repeat_list``, always starting at
    position 0. For every repeat, all frames returned by
    ``repeat_frames_RC`` (rotations and reverse complements) are looked up
    among those words; a stretch of consecutive identical words is one run.

    Parameters
    ----------
    input_seq : str
        Nucleotide sequence (any case).
    repeat_list : list of str
        Repeat units to count.

    Returns
    -------
    pandas.DataFrame
        Indexed by length measured in words (repeat units). Columns are a
        two-level index: ``('A', repeat)`` is the number of runs of that
        length, summed over all frames of the repeat; ``('B', repeat)`` is
        the number of gaps of that length, where a gap is the number of words
        between the end of the previous run of the same frame and the start
        of the next one (for the first run: the words preceding it).
        Missing combinations are 0.

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when none of the motifs occurs in the
        sequence.

    Notes
    -----
    Progress is printed to stdout on a single, carriage-return-updated line.
    """
    input_seq = input_seq.upper()

    # One Series per unit length: index = word, value = word position.
    seq_series = dict()
    for unit in set(len(rep) for rep in repeat_list):
        if unit == 1:
            seq_series[1] = pd.Series(range(len(input_seq)), list(input_seq))
        else:
            words = re.findall('.'*unit, input_seq)
            seq_series[unit] = pd.Series(range(len(words)), index = words)

    # Expand every repeat once; the progress total used to be recomputed
    # (all rotations of all repeats) on every inner iteration.
    frames_per_repeat = [repeat_frames_RC(repeat) for repeat in repeat_list]
    total_frames = sum(len(frames) for frames in frames_per_repeat)

    counts = {'A': dict(), 'B': dict()}
    counter = 0
    for repeat, frames in zip(repeat_list, frames_per_repeat):
        run_counts = dict()
        gap_counts = dict()
        for motif in frames:
            counter += 1
            print('\r' + str(counter)+'/'+str(total_frames)+': '+motif, end = '    ')
            positions = seq_series[len(motif)]
            if motif not in positions.index:
                continue
            # .loc returns a scalar when the word occurs exactly once; force a
            # 1-D array so single occurrences are counted instead of being
            # lost to an AttributeError that was previously swallowed.
            rep_pos_index = np.atleast_1d(np.asarray(positions.loc[motif]))
            # A run starts where the previous hit is not the adjacent word,
            # and ends where the next hit is not the adjacent word.
            is_start = rep_pos_index != np.roll(rep_pos_index, 1) + 1
            is_end = rep_pos_index != np.roll(rep_pos_index, -1) - 1
            pos_list = pd.DataFrame(rep_pos_index[is_start], columns = ['start'])
            pos_list['end'] = rep_pos_index[is_end]
            pos_list['length'] = pos_list['end'] - pos_list['start'] + 1
            run_counts[motif] = pos_list['length'].value_counts()
            # Gap to the previous run of the same frame; the first run is
            # measured from the beginning of the sequence (previous end = -1).
            previous_end = pos_list['end'].shift(1).fillna(-1).astype(int)
            pos_list['length_B'] = pos_list['start'] - (previous_end + 1)
            gap_counts[motif] = pos_list['length_B'].value_counts()
        # Combine all frames of a repeat into a single count; repeats that
        # never occur are simply left out.
        if run_counts:
            counts['A'][repeat] = pd.concat(run_counts, axis=1).sum(axis=1)
        if gap_counts:
            counts['B'][repeat] = pd.concat(gap_counts, axis=1).sum(axis=1)

    counts['A'] = pd.concat(counts['A'], axis=1)
    counts['B'] = pd.concat(counts['B'], axis=1)
    counts = pd.concat(counts, axis=1)
    return counts.fillna(0)

In [None]:
def count_genome(genome_file, genomes_path = 'genomes/', output_path = 'repeat_distributions/', output_name = False, repeat_list = repeats_1_4, shuffle = False):
    """Count repeat-length distributions for a whole FASTA file and pickle them.

    Records of at least 10,000 nt are counted one by one with
    ``count_all_repeats_per_seq`` and summed per length. Shorter records are
    joined with 'N' separators and counted once as a single sequence.

    Parameters
    ----------
    genome_file : str
        File name inside ``genomes_path`` (read with ``fastapy.parse``).
    genomes_path, output_path : str
        Directory prefixes; they are concatenated verbatim with the file
        names, so they must end with a path separator.
    output_name : str or False
        Stem of the output pickle. When unset, ``genome_file`` minus its last
        three characters (e.g. ``.gz``) is used.
    repeat_list : list of str
        Repeat units to count.
    shuffle : bool
        When true, each record of >= 10,000 nt is randomly permuted before
        counting. The concatenated short records are not shuffled.

    Output
    ------
    Writes ``output_path + output_name + '.pickle'``. If short records exist
    the frame has an outer index level with keys 'long' and/or 'short';
    otherwise it is the summed frame of the long records alone.

    Raises
    ------
    ValueError
        If the FASTA file contains no records.
    """
    if not output_name:
        output_name = genome_file[:-3]

    records = list(fastapy.parse(genomes_path+genome_file))
    if not records:
        raise ValueError('no FASTA records found in ' + genomes_path + genome_file)

    long_records = [rec for rec in records if len(rec) >= 10000]
    short_records = [rec for rec in records if len(rec) < 10000]
    # max(..., 1) only guards the percentage against division by zero.
    record_sum = max(sum(len(rec) for rec in records), 1)
    complete_nt = 0

    genome_counts = dict()
    for record in long_records:
        current_record = record.seq
        if shuffle:
            current_record = ''.join(random.sample(current_record, len(current_record)))
        genome_counts[record.id] = count_all_repeats_per_seq(current_record, repeat_list = repeat_list)
        complete_nt += len(record)
        print('\r' + '                     finished ' + str(round((complete_nt* 100)/record_sum, 2))+ '%', end = '         ')

    genome_counts_sub10k = None
    if short_records:
        short_seq = 'N'.join([rec.seq for rec in short_records]).upper()
        genome_counts_sub10k = count_all_repeats_per_seq(short_seq, repeat_list = repeat_list)
        # Count real nucleotides only, so the 'N' separators cannot push the
        # reported progress above 100 %.
        complete_nt += sum(len(rec) for rec in short_records)
        print('\r' + '                     finished ' + str(round((complete_nt* 100)/record_sum, 2))+ '%', end = '         ')

    # Assemble the output. pd.concat on an empty dict raises, so the 'long'
    # part is only built when at least one record reached 10 kb; previously a
    # genome made of short records only crashed here after all the counting.
    parts = dict()
    if genome_counts:
        parts['long'] = pd.concat(genome_counts).groupby(level=[1]).sum()
    if genome_counts_sub10k is not None and len(genome_counts_sub10k) > 0:
        parts['short'] = genome_counts_sub10k

    if 'short' in parts:
        result = pd.concat(parts)
    else:
        result = parts['long']
    result.to_pickle(output_path+output_name+'.pickle')

#### T2T-CHM13 genome
- T2T v2.0
- download "hs1.fa.gz" from http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/
- also available from https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_009914755.1/

In [None]:
count_genome('hs1.fa.gz', output_name = 'CHM13_counts')

In [None]:
# shuffled genome distribution
count_genome('hs1.fa.gz', output_name = 'random_counts', shuffle = True)

### CHM13 repeat length distribution plots

In [None]:
CHM13_counts = pd.read_pickle('repeat_distributions/CHM13_counts.pickle').sort_index()
random_counts = pd.read_pickle('repeat_distributions/random_counts.pickle').sort_index()

In [None]:
# Group the CHM13 run-length counts by repeat unit length (1-4 nt) and add an
# 'all' column with the per-length total over the motifs of that unit length.
# NOTE(review): columns[:-1] leaves out the last motif column — confirm this is intended.
counts_by_unit_length = {i: dict() for i in range(1,5)}
for motif in CHM13_counts['A'].columns[:-1]:
    counts_by_unit_length[len(motif)][motif] = CHM13_counts['A'][motif]
for i in range(1,5):
    merged = pd.concat(counts_by_unit_length[i], axis=1)
    merged['all'] = merged.sum(axis=1)
    counts_by_unit_length[i] = merged

In [None]:
# Same grouping as for CHM13, applied to the shuffled-genome counts.
# NOTE(review): columns[:-1] leaves out the last motif column — confirm this is intended.
counts_random_by_unit_length = {i: dict() for i in range(1,5)}
for motif in random_counts['A'].columns[:-1]:
    counts_random_by_unit_length[len(motif)][motif] = random_counts['A'][motif]
for i in range(1,5):
    merged = pd.concat(counts_random_by_unit_length[i], axis=1)
    merged['all'] = merged.sum(axis=1)
    counts_random_by_unit_length[i] = merged

In [None]:
# Fig. S1a (by nt)
# One line per motif, coloured by unit length; only the first motif of each
# unit length gets a legend entry.
fig_starting_counts = go.Figure()
legendnames = []
for motif in CHM13_counts['A'].columns[:-1]:
    unit = len(motif)
    fig_starting_counts.add_trace(
        go.Scatter(
            x = CHM13_counts['A'][motif].index * unit,
            y = CHM13_counts['A'][motif].replace(0, np.nan),
            name = 'unit length = '+str(unit),
            text = motif,
            legendgroup = unit,
            showlegend = unit not in legendnames,
            line = dict(color = plotly.colors.DEFAULT_PLOTLY_COLORS[unit-1]),
            mode = 'lines',
            opacity = 0.95/(1+0.5*unit),
        )
    )
    legendnames.append(unit)
grid_style = dict(gridcolor = 'rgba(0,0,0,0.15)', gridwidth = 1)
fig_starting_counts.update_xaxes(type = 'log', title = 'repeat length (nt)', range = [0,2.1], **grid_style)
fig_starting_counts.update_yaxes(type = 'log', title = 'repeat counts', tickformat = '1.0e', dtick = 2, range = [0,10], **grid_style)
fig_starting_counts.update_layout(
    font = dict(family = 'Arial', size = 14),
    legend = dict(yanchor="top", y=0.99, xanchor="right", x=1.2),
    height = 300,
    width = 580,
    margin = {'t':40,'l':80,'b':40,'r':60},
)
fig_starting_counts.show()

In [None]:
fig_starting_counts.write_image('plots/figS1a.svg')
fig_starting_counts.write_image('plots/figS1a.pdf')

In [None]:
# Fig. 1a
# Solid lines: CHM13 totals per unit length; dashed lines: shuffled genome.
fig_starting_counts = go.Figure()
for i in range(1,5):
    observed = counts_by_unit_length[i]
    fig_starting_counts.add_trace(
        go.Scatter(
            x = observed.index * i,
            y = observed['all'].replace(0,np.nan),
            connectgaps = True,
            legendgroup = i,
            name = 'unit length = '+str(i),
            line = dict(width = 4),
            mode = 'lines',
            opacity = 0.75,
        )
    )
for i in range(1,5):
    shuffled = counts_random_by_unit_length[i]
    fig_starting_counts.add_trace(
        go.Scatter(
            x = shuffled.index * i,
            y = shuffled['all'].replace(0,np.nan),
            connectgaps = True,
            legendgroup = i,
            showlegend = False,
            name = 'random, unit length = '+str(i),
            line = dict(width = 3, dash = '5'),
            mode = 'lines',
            opacity = 0.55,
        )
    )

grid_style = dict(gridcolor = 'rgba(0,0,0,0.15)', gridwidth = 1)
fig_starting_counts.update_xaxes(type = 'log', title = 'repeat length (nt)', range = [0,3], **grid_style)
fig_starting_counts.update_yaxes(type = 'log', title = 'counts', tickformat = '1.0e', dtick = 2, **grid_style)
fig_starting_counts.update_layout(
    font = dict(family = 'Arial', size = 16),
    legend = dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    height = 300,
    width = 500,
    margin = {'t':40,'l':80,'b':40,'r':10},
    colorway = plotly.colors.DEFAULT_PLOTLY_COLORS[:4],
)
fig_starting_counts.show()

In [None]:
fig_starting_counts.write_image('plots/fig1a.svg')
fig_starting_counts.write_image('plots/fig1a.pdf')

### bootstrap counting

In [None]:
def count_genome_bootstrap(genome_file, genomes_path = 'genomes/', output_path = 'repeat_distributions/', output_name = False, repeat_list = repeats_1_4, chunk_size = 1000000):
    """Count repeats separately in fixed-size chunks of every record.

    Each record is cut into consecutive, non-overlapping chunks of
    ``chunk_size`` nt; the incomplete chunk at the end of a record is
    discarded. Every full chunk is counted with
    ``count_all_repeats_per_seq`` and stored under the key
    ``'<record id>_<chunk number>'``.

    Parameters
    ----------
    genome_file : str
        File name inside ``genomes_path`` (read with ``fastapy.parse``).
    genomes_path, output_path : str
        Directory prefixes, concatenated verbatim with the file names.
    output_name : str or False
        Stem of the output pickle; defaults to ``genome_file`` minus its last
        three characters.
    repeat_list : list of str
        Repeat units to count.
    chunk_size : int
        Chunk length in nt (default 1,000,000, the previous fixed value).

    Output
    ------
    Writes ``output_path + output_name + '_bootstrap.pickle'`` containing the
    per-chunk frames concatenated with the chunk key as outer index level.

    Raises
    ------
    ValueError
        If ``chunk_size`` is < 1 or no record is long enough to yield a
        complete chunk.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    if not output_name:
        output_name = genome_file[:-3]

    records = list(fastapy.parse(genomes_path+genome_file))
    # max(..., 1) only guards the percentage against division by zero.
    record_sum = max(sum(len(rec) for rec in records), 1)
    complete_nt = 0

    genome_counts = dict()
    for record in records:
        sequence = record.seq
        # Slice full chunks directly instead of materialising every chunk
        # (including the discarded tail) with a regular expression.
        for counter in range(len(sequence) // chunk_size):
            chunk = sequence[counter * chunk_size:(counter + 1) * chunk_size]
            genome_counts[record.id + '_' + str(counter)] = count_all_repeats_per_seq(chunk, repeat_list = repeat_list)
        complete_nt += len(record)
        print('\r' + '                     finished ' + str(round((complete_nt* 100)/record_sum, 2))+ '%', end = '         ')

    if not genome_counts:
        raise ValueError('no record contains a complete chunk of ' + str(chunk_size) + ' nt')
    genome_counts = pd.concat(genome_counts)
    genome_counts.to_pickle(output_path+output_name+'_bootstrap.pickle')

In [None]:
count_genome_bootstrap('hs1.fa.gz', output_name = 'CHM13_counts')

In [None]:
bootstrap_CHM13_counts = pd.read_pickle('repeat_distributions/CHM13_counts_bootstrap.pickle').sort_index()

In [None]:
# Bootstrap the genome-wide run-length distribution from the per-chunk counts.
# NOTE(review): 3117 chunks plus a 275501-nt remainder presumably add up to the
# genome size in 1-Mb chunks — confirm.
chunk_counts = bootstrap_CHM13_counts['A']
# One entry per chunk. The raw level-0 values repeat each chunk id once per
# row, which made chunks with more distinct lengths more likely to be drawn.
subindex = chunk_counts.index.get_level_values(0).unique()
bootstrap_counts = dict()
for i in range(1000):
    print('\r' + str(i), end = '   ')
    sample_ids = np.random.choice(subindex, 3117, replace=True)
    bootstrap_main = chunk_counts.loc[list(sample_ids)].groupby(level=[1]).sum().replace(0, np.nan).dropna(how = 'all', axis=0)
    # Use an independently drawn chunk for the remainder; the drawn id used to
    # be discarded and the first chunk of sample_ids was reused instead.
    remainder_id = np.random.choice(subindex, 1, replace=True)[0]
    remainder = chunk_counts.loc[remainder_id].copy().replace(0, np.nan).dropna(how = 'all', axis=0)      # remove this later?
    # Rescale each motif column so that sum(length * count) equals 275501.
    remainder = (remainder * (275501 / remainder.mul(remainder.index, axis=0).sum())).round()   # remove this later?
    bootstrap_counts[i] = bootstrap_main.add(remainder, fill_value = 0)
bootstrap_counts = pd.concat(bootstrap_counts, axis=1).sort_index()
bootstrap_counts.to_pickle('repeat_distributions/bootstrap_counts_1000.pickle')

In [None]:
bootstrap_counts = pd.read_pickle('repeat_distributions/bootstrap_counts_1000.pickle')

### Mammalian genomes
- download link: https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=40674&reference_only=true&typical_only=true&assembly_level=2:3&release_year=2019:2023
- contains 213 genomes as of 5/2024 (includes human hg38)
- retrieve two additional human genome assemblies: GCA_015074485.1_ASM1507448v1_genomic.fna, GCA_003112815.1_ASM311281v1_genomic.fna
- to save space, we recommend downloading the fasta files, extracting them from the .zip container, and then re-compressing each fasta file individually with gzip

In [None]:
# note: replace file paths where necessary
genomes_path_mammals = '../../non_human_genomes/mammalian_reference_genomes/compressed/'
genomes = os.listdir(genomes_path_mammals)
output_path_mammals = 'repeat_distributions/mammalian_reference_genomes/'

In [None]:
# Count every genome that does not yet have an output pickle, so an
# interrupted run can be resumed by re-executing this cell.
counter = 0
for genome_file in genomes:
    genome_name = genome_file[:-3]
    print(str(counter) + '/'+str(len(genomes)) + ': ' + genome_name)
    already_counted = (genome_name+'.pickle') in os.listdir(output_path_mammals)
    if not already_counted:
        count_genome(genome_file, genomes_path = genomes_path_mammals, output_path = output_path_mammals)
    counter +=1

#### load distributions

In [None]:
completed = [file for file in os.listdir('repeat_distributions/mammalian_reference_genomes/') if file.startswith('GCA')]
len(completed), len(genomes)

In [None]:
# Load the run-length counts ('A') of every completed genome, keyed by the
# file name minus its last 15 characters. When a pickle is split into
# 'long'/'short' parts, only the 'long' part is kept.
counts_all = dict()
for file in completed:
    genome_key = file[:-15]
    if genome_key in counts_all:
        continue
    run_counts = pd.read_pickle('repeat_distributions/mammalian_reference_genomes/'+file)['A']
    run_counts = run_counts.replace(0, np.nan).dropna(how = 'all')
    if 'long' in run_counts.index.get_level_values(0):
        run_counts = run_counts.loc['long']
    counts_all[genome_key] = run_counts
counts_all = pd.concat(counts_all)

In [None]:
counts_all

#### genome info
- note: "data_summary.tsv" file included with download of multiple genome files from NCBI
- in my experience, some info was missing and had to be filled in
- download taxonomy file from: https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip

In [None]:
genome_info = pd.read_csv('repeat_distributions/mammalian_reference_genomes/data_summary.tsv', sep = '\t')
# Fill in some missing data
# NOTE(review): the names are assigned positionally to the rows whose common
# name is missing, so the list must match those rows in number and order.
genome_info.loc[genome_info['Organism Common Name'].isna(), 'Organism Common Name'] = ['oldfield mouse', 'Masai giraffe', 'Nubian giraffe', 'South-central black rhinoceros', 'Marco Polo sheep (hybrid)', 'Indian elephant', 'Tibetan macaque', 'Fringe-lipped bat', 'Brazilian porcupine', 'Marco Polo sheep', 'Ground cuscus', 'East African Hippopotamus', 'Intermediate roundleaf bat', 'Eastern spiny mouse']

# Literal replacement: '+' is a regex metacharacter and the default of the
# `regex` argument differs between pandas versions, so state it explicitly.
genome_info['Assembly Name'] = genome_info['Assembly Name'].str.replace(' ', '_', regex = False).str.replace('+', '_', regex = False)
# NOTE(review): hard-coded row label 129 — verify against the current data_summary.tsv.
genome_info.loc[129, 'Assembly Name'] = 'mEubGla1.1.hap2._XY'

In [None]:
# Load the NCBI ranked lineage table, keeping only the columns selected by
# usecols and naming them below.
# NOTE(review): the skipped odd-numbered columns are presumably the '|'
# separators of the .dmp format, and column 4 (not loaded) presumably the
# species field — confirm against the taxdump readme.
taxonomy = pd.read_csv('repeat_distributions/mammalian_reference_genomes/NCBI_taxonomy/rankedlineage.dmp', sep = '\t', usecols = [0,2,6,8,10,12,14,16,18], header = None, low_memory=False)
taxonomy.columns = ['tax_id', 'tax_name', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom']
# One taxonomy row per genome_info row, in genome_info order (rows repeat
# when several genomes share a taxonomy id).
taxonomy_present = taxonomy.set_index(['tax_id']).reindex(genome_info['Taxonomy id'])
# Lineage table: taxonomy id and its lineage id string.
lineage = pd.read_csv('repeat_distributions/mammalian_reference_genomes/NCBI_taxonomy/taxidlineage.dmp', sep = '\t', header = None, usecols = [0,2])
lineage.columns = ['tax_id', 'lineage_ids']

# Maps each taxonomic rank to the next finer rank column.
subgroup_list = pd.Series(['order', 'family', 'genus', 'tax_name'], index = ['class', 'order', 'family', 'genus'])

In [None]:
# Index the genome table by '<accession>_<assembly name>', append the
# taxonomy columns looked up by taxonomy id, and keep the rows in the order
# of the genomes present in counts_all.
filename_keys = genome_info['Assembly Accession'] + '_' + genome_info['Assembly Name']
genome_info_by_filename = genome_info.set_index(filename_keys)
taxonomy_by_filename = taxonomy_present.drop_duplicates().reindex(genome_info_by_filename['Taxonomy id'])
taxonomy_by_filename = taxonomy_by_filename.set_index(genome_info_by_filename.index)
genome_info_by_filename = pd.concat([genome_info_by_filename, taxonomy_by_filename], axis=1)
genome_info_by_filename = genome_info_by_filename.reindex(counts_all.index.levels[0])

In [None]:
genome_info_by_filename[:40]

#### Plots

In [None]:
def plot_counts_grouped(motif = 'A', group_list = ['class'], select_list = ['Mammalia'], x_nt = True, show = False, cutoff_n = 30, show_median = 1):
    """Plot normalised repeat-length distributions of one motif per taxon group.

    For every ``(group, select)`` pair, the genomes whose ``group`` column in
    ``genome_info_by_filename`` equals ``select`` are drawn as thin lines,
    each truncated at the first length whose count is not above ``cutoff_n``
    and divided by its own sum. Afterwards one thick line per group shows the
    median across genomes of the sum-normalised counts for lengths 1-1000.

    Parameters
    ----------
    motif : str
        Column of ``counts_all`` to plot.
    group_list, select_list : list of str
        Parallel lists of taxonomy column names and the value to select.
    x_nt : bool
        Scale the x axis to nucleotides (length * len(motif)) instead of units.
    show : bool
        Display the figure before returning it.
    cutoff_n : int
        Count threshold used to truncate the per-genome lines.
    show_median : float
        Opacity of the median lines.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    palette = plotly.colors.DEFAULT_PLOTLY_COLORS

    def _group_table(group, select):
        # Genomes of the group plus their count table (lengths x genomes);
        # genomes without any count for this motif are dropped.
        members = genome_info_by_filename.loc[genome_info_by_filename[group] == select].copy()
        table = counts_all[motif].unstack().transpose().reindex(members.index, axis=1)
        table = table.replace(0, np.nan).dropna(how = 'all', axis=1).fillna(0)
        return members, table

    def _x_values(lengths):
        return lengths * len(motif) if x_nt == True else lengths

    fig_counts = go.Figure()

    # Pass 1: one thin line per genome.
    for counter, (group, select) in enumerate(zip(group_list, select_list)):
        current_set, current_counts = _group_table(group, select)
        # Mask counts <= 1, then put a single 1 back per genome.
        current_counts = current_counts[current_counts >1].fillna(1,limit=1, axis=0)
        # Trailing NaN row guarantees idxmin() below finds a stopping point.
        current_counts.loc[len(current_counts)+1] = np.nan
        for col in current_counts:
            current_col = current_counts[col].copy()
            stop = (current_col > cutoff_n).idxmin()
            current_col = current_col.reindex(list(range(1, stop)))
            current_col = current_col.div(current_col.sum())
            hover = ('L=%{x}, n=%{y}' + '<br>' + '<b>'+ current_set['Organism Common Name'].loc[col] + '</b><br>'
                     + 'Order: ' + current_set['order'].loc[col] + '<br>'
                     + 'Family: ' + current_set['family'].loc[col] + '<br>'
                     + current_set['Organism Scientific Name'].loc[col] + '<br>' + "<extra></extra>")
            fig_counts.add_trace(
                go.Scatter(
                    x = _x_values(current_col.index),
                    y = current_col,
                    connectgaps = False,
                    name = select,
                    hovertemplate = hover,
                    opacity = min(0.25, 15/len(current_set.index)),
                    line = dict(color = palette[counter], width = 1),
                    legendgroup = counter,
                    showlegend = False,
                )
            )

    # Pass 2: group medians, added last so they are drawn on top.
    for counter, (group, select) in enumerate(zip(group_list, select_list)):
        current_set, current_counts = _group_table(group, select)
        current_counts = current_counts.reindex(list(range(1,1001)))
        current_med = (current_counts / current_counts.sum()).median(axis=1)
        current_med = current_med.loc[current_med > 1e-9]
        fig_counts.add_trace(
            go.Scatter(
                x = _x_values(current_med.index),
                y = current_med,
                connectgaps = True,
                opacity = show_median,
                name = select,
                line = dict(color = palette[counter], width = 3),
                legendgroup = counter,
            )
        )

    grid_style = dict(gridcolor = 'rgba(0,0,0,0.15)', gridwidth = 1)
    x_title = 'repeat length (nt)' if x_nt == True else 'repeat length (units)'
    fig_counts.update_xaxes(type = 'log', title = x_title, range = [0,3], **grid_style)
    fig_counts.update_yaxes(type = 'log', tickformat = '1.0e', dtick = 2, title = 'frequency', **grid_style)
    fig_counts.update_layout(font=dict(family = 'Arial', size = 16), height = 300, width = 500, margin={'t':40,'l':80,'b':40,'r':10})
    if show == True:
        fig_counts.show()
    return fig_counts

In [None]:
# Fig. 1b
fig_grouped = plot_counts_grouped(cutoff_n = 30, group_list = ['class', 'order', 'family'], select_list = ['Mammalia', 'Primates', 'Hominidae'], show = False)
fig_grouped.update_xaxes(range = [0,2.5])
fig_grouped.update_layout(legend = dict(yanchor="top", y=0.99, xanchor="right", x=0.99))

In [None]:
fig_grouped.write_image('plots/fig1b.pdf')
fig_grouped.write_image('plots/fig1b.svg')

In [None]:
def plot_counts_grouped_multi(n_rows = 1, n_cols = 4, motif_list = repeats_1_4[:4], x_nt = True, group_list = ['class'], select_list = ['Mammalia'], chrom = 'only_CM', show = False, cutoff_n = 10
                            , show_median = 1):
    """Plot grouped repeat-length distributions for several motifs as subplots.

    Motifs fill an ``n_rows`` x ``n_cols`` grid row by row. Within each
    panel, every genome of each ``(group, select)`` pair is drawn as a thin
    line (truncated at the first length whose count is not above
    ``cutoff_n`` and divided by its own sum); all group medians over lengths
    1-1000 are added afterwards so they lie on top.

    Parameters
    ----------
    n_rows, n_cols : int
        Subplot grid shape.
    motif_list : list of str
        Motifs (columns of ``counts_all``), one per panel.
    x_nt : bool
        Scale x to nucleotides (length * len(motif)) instead of units.
    group_list, select_list : list of str
        Parallel lists of taxonomy column names and the value to select.
    chrom : str
        Not used by this function.
    show : bool
        Display the figure before returning it.
    cutoff_n : int
        Count threshold used to truncate the per-genome lines.
    show_median : float
        Opacity of the median lines.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    palette = plotly.colors.DEFAULT_PLOTLY_COLORS

    def _panel(position):
        # Row-major placement, 1-based as plotly expects.
        row_zero, col_zero = divmod(position, n_cols)
        return row_zero + 1, col_zero + 1

    def _group_table(motif, group, select):
        # Genomes of the group plus their count table (lengths x genomes);
        # genomes without any count for this motif are dropped.
        members = genome_info_by_filename.loc[genome_info_by_filename[group] == select].copy()
        table = counts_all[motif].unstack().transpose().reindex(members.index, axis=1)
        table = table.replace(0, np.nan).dropna(how = 'all', axis=1).fillna(0)
        return members, table

    def _x_values(lengths, motif):
        return lengths * len(motif) if x_nt == True else lengths

    fig_counts = make_subplots(
        rows = n_rows,
        cols = n_cols,
        subplot_titles = motif_list,
        x_title = 'repeat length (nt)',
        y_title = 'frequency',
        vertical_spacing = 0.24 / n_rows,
        horizontal_spacing = 0.2 / n_cols,
        shared_xaxes = True,
        shared_yaxes = True,
    )

    # Pass 1: one thin line per genome in every panel.
    for position, motif in enumerate(motif_list):
        row_counter, col_counter = _panel(position)
        for counter, (group, select) in enumerate(zip(group_list, select_list)):
            current_set, current_counts = _group_table(motif, group, select)
            # Mask counts <= 1, then put a single 1 back per genome.
            current_counts = current_counts[current_counts >1].fillna(1,limit=1, axis=0)
            # Trailing NaN row guarantees idxmin() below finds a stopping point.
            current_counts.loc[len(current_counts)+1] = np.nan
            for col in current_counts:
                current_col = current_counts[col].copy()
                stop = (current_col > cutoff_n).idxmin()
                current_col = current_col.reindex(list(range(1, stop)))
                current_col = current_col.div(current_col.sum())
                hover = ('L=%{x}, n=%{y}' + '<br>' + '<b>'+ current_set['Organism Common Name'].loc[col] + '</b><br>'
                         + 'Order: ' + current_set['order'].loc[col] + '<br>'
                         + 'Family: ' + current_set['family'].loc[col] + '<br>'
                         + current_set['Organism Scientific Name'].loc[col] + '<br>' + "<extra></extra>")
                fig_counts.add_trace(
                    go.Scatter(
                        x = _x_values(current_col.index, motif),
                        y = current_col,
                        connectgaps = False,
                        name = select,
                        hovertemplate = hover,
                        opacity = min(0.25, 15/len(current_set.index)),
                        mode = 'lines',
                        line = dict(color = palette[counter], width = 0.5),
                        legendgroup = counter,
                        showlegend = False,
                    ),
                    row = row_counter,
                    col = col_counter,
                )

    # Pass 2: group medians, added last so they are drawn on top.
    for position, motif in enumerate(motif_list):
        row_counter, col_counter = _panel(position)
        for counter, (group, select) in enumerate(zip(group_list, select_list)):
            current_set, current_counts = _group_table(motif, group, select)
            current_counts = current_counts.reindex(list(range(1,1001)))
            current_med = (current_counts / current_counts.sum()).median(axis=1)
            current_med = current_med.loc[current_med > 1e-9]
            fig_counts.add_trace(
                go.Scatter(
                    x = _x_values(current_med.index, motif),
                    y = current_med,
                    connectgaps = True,
                    opacity = show_median,
                    name = select,
                    mode = 'lines',
                    line = dict(color = palette[counter], width = 1.5),
                    legendgroup = counter,
                    showlegend = False,
                ),
                row = row_counter,
                col = col_counter,
            )

    grid_style = dict(gridcolor = 'rgba(0,0,0,0.15)', gridwidth = 1)
    fig_counts.update_xaxes(type = 'log', range = [0,2.5], dtick = 1, **grid_style)
    fig_counts.update_yaxes(type = 'log', range = [-10,0], tickformat = '1.0e', dtick = 3, **grid_style)
    fig_counts.update_layout(font=dict(family = 'Helvetica', size = 12), height = 180 * n_rows, width = 220 * n_cols, margin={'t':40,'l':60,'b':55,'r':30})
    if show == True:
        fig_counts.show()
    return fig_counts

In [None]:
fig_grouped = plot_counts_grouped_multi(group_list = ['class', 'order', 'family'], select_list = ['Mammalia', 'Primates', 'Hominidae'], show = True)

In [None]:
# Fig S2
for n in range(len(repeats_1_4[::4])):
    print(n, end = ' ')
    current_list = repeats_1_4[n*4:n*4+4]
    fig_grouped = plot_counts_grouped_multi(motif_list = current_list, group_list = ['class', 'order', 'family'], select_list = ['Mammalia', 'Primates', 'Hominidae'], show = False)
    fig_grouped.update_layout(legend=dict(orientation = 'h', yanchor='bottom', y=-0.1, xanchor='right', x=1.04))
    fig_grouped.write_image('plots/fig_S2_'+str(n)+'.pdf')

#### Human genome assemblies

In [None]:
genome_info = pd.read_csv('repeat_distributions/mammalian_reference_genomes/data_summary.tsv', sep = '\t')
# Fill in some missing data
# NOTE(review): the names are assigned positionally to the rows whose common
# name is missing, so the list must match those rows in number and order.
genome_info.loc[genome_info['Organism Common Name'].isna(), 'Organism Common Name'] = ['oldfield mouse', 'Masai giraffe', 'Nubian giraffe', 'South-central black rhinoceros', 'Marco Polo sheep (hybrid)', 'Indian elephant', 'Tibetan macaque', 'Fringe-lipped bat', 'Brazilian porcupine', 'Marco Polo sheep', 'Ground cuscus', 'East African Hippopotamus', 'Intermediate roundleaf bat', 'Eastern spiny mouse']
# Literal replacement: '+' is a regex metacharacter and the default of the
# `regex` argument differs between pandas versions, so state it explicitly.
genome_info['Assembly Name'] = genome_info['Assembly Name'].str.replace(' ', '_', regex = False).str.replace('+', '_', regex = False)
# NOTE(review): hard-coded row label 129 — verify against the current data_summary.tsv.
genome_info.loc[129, 'Assembly Name'] = 'mEubGla1.1.hap2._XY'
genome_info_by_filename = genome_info.set_index(genome_info['Assembly Accession'] + '_' + genome_info['Assembly Name'])
genome_info_by_filename = pd.concat([genome_info_by_filename, taxonomy_present.drop_duplicates().reindex(genome_info_by_filename['Taxonomy id']).set_index(genome_info_by_filename.index)], axis=1)
# add in info for human genome assemblies
# The two extra assemblies are not in data_summary.tsv, so their organism and
# taxonomy fields are copied from the GRCh38 row.
for col in ['Organism Scientific Name', 'Taxonomy id', 'tax_name', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom']:
    genome_info_by_filename.loc['GCA_015074485.1_ASM1507448v1', col] = genome_info_by_filename.loc['GCA_000001405.29_GRCh38.p14', col]
    genome_info_by_filename.loc['GCA_003112815.1_ASM311281v1', col] = genome_info_by_filename.loc['GCA_000001405.29_GRCh38.p14', col]

# The common name is used as the display name of each assembly in the plots.
genome_info_by_filename.loc['GCA_000001405.29_GRCh38.p14', 'Organism Common Name'] = 'GRCH38'
genome_info_by_filename.loc['GCA_015074485.1_ASM1507448v1', 'Organism Common Name'] = 'short read (NovaSeq)'
genome_info_by_filename.loc['GCA_003112815.1_ASM311281v1', 'Organism Common Name'] = 'short read (HiSeq)'

genome_info_by_filename.loc['GCA_015074485.1_ASM1507448v1', 'Assembly Name'] = 'ASM1507448v1'
genome_info_by_filename.loc['GCA_003112815.1_ASM311281v1', 'Assembly Name'] = 'ASM311281v1'

genome_info_by_filename.loc['GCA_015074485.1_ASM1507448v1', 'Assembly Accession'] = 'GCA_015074485.1'
genome_info_by_filename.loc['GCA_003112815.1_ASM311281v1', 'Assembly Accession'] = 'GCA_003112815.1'

# Second word of the taxonomy name.
genome_info_by_filename['species'] = genome_info_by_filename['tax_name'].str.split(' ', expand = True)[1]
genome_info_by_filename = genome_info_by_filename.reindex(counts_all.index.levels[0])

In [None]:
# Fig. S1c
# Compare the normalised run-length distributions of the human assemblies
# (genus Homo in genome_info_by_filename) with T2T-CHM13, one panel per motif.
# NOTE(review): the export cell writes this figure as 'figS1b' — confirm the panel label.
current_set = genome_info_by_filename.loc[genome_info_by_filename['genus'] == 'Homo']
opacity = 0.8
panel_motifs = ['A', 'C', 'AC', 'AT', 'AG', 'CG', 'AAC', 'AAG', 'AAT', 'CCG', 'AGC', 'AGG']
fig_counts = make_subplots(rows = 4, cols = 3, subplot_titles = panel_motifs, x_title = 'repeat length (nt)', y_title = 'repeat counts (normalized)', vertical_spacing = 0.06, horizontal_spacing = 0.05, shared_xaxes = True, shared_yaxes = True)
row_counter = 1; col_counter = 0; legend_counter = 0
for seq in panel_motifs:
    col_counter +=1; legend_counter +=1
    # Normalise T2T-CHM13 by its total run count, exactly like the other
    # assemblies below. It used to be divided by sum(length * count), which
    # put curves with different normalisations on the same axis.
    current = CHM13_counts['A'][seq] / CHM13_counts['A'][seq].sum()
    current = current.loc[current > 1e-10]
    # T2T-CHM13 has no row of its own in current_set; its hover text reuses
    # the taxonomy fields of the GRCh38 row.
    t2t_hover = ('L=%{x}, n=%{y}' + '<br>' + '<b>'+ 'T2T-CHM13' + '</b><br>'
                 + 'Order: ' + current_set['order'].loc['GCA_000001405.29_GRCh38.p14'] + '<br>'
                 + 'Family: ' + current_set['family'].loc['GCA_000001405.29_GRCh38.p14'] + '<br>'
                 + current_set['Organism Scientific Name'].loc['GCA_000001405.29_GRCh38.p14'] + ' (T2T)' + '<br>' + "<extra></extra>")
    fig_counts.add_trace(go.Scatter(x = current.index * len(seq), y = current, name = 'T2T-CHM13', hovertemplate = t2t_hover, mode = 'lines', legendgroup = 'T2T', showlegend = True if legend_counter == 1 else False, opacity = opacity), row = row_counter, col = col_counter)
    for file in current_set.index:
        current = counts_all[seq].loc[file].replace(0, np.nan).dropna().fillna(0)
        current = current / current.sum()
        current = current.loc[current > 1e-10]
        assembly_hover = ('L=%{x}, n=%{y}' + '<br>' + '<b>' + current_set['Organism Common Name'].loc[file] + '</b><br>'
                          + 'Order: ' + current_set['order'].loc[file] + '<br>'
                          + 'Family: ' + current_set['family'].loc[file] + '<br>'
                          + current_set['Organism Scientific Name'].loc[file] + '<br>' + "<extra></extra>")
        fig_counts.add_trace(go.Scatter(x = current.index * len(seq), y = current, name = current_set['Organism Common Name'].loc[file], hovertemplate = assembly_hover, mode = 'lines', legendgroup = file,  showlegend = True if legend_counter == 1 else False, opacity = opacity), row = row_counter, col = col_counter)
    # Move to the next grid row after the third column.
    if col_counter == 3:
        col_counter -= 3; row_counter +=1
fig_counts.update_xaxes(type = 'log', range = [0,2], gridcolor = 'rgba(0,0,0,0.15)', gridwidth = 1)
fig_counts.update_yaxes(type = 'log', range = [-10,0], tickformat = '1.0e', dtick = 3, gridcolor = 'rgba(0,0,0,0.15)', gridwidth = 1)
fig_counts.update_layout(font=dict(family = 'Helvetica', size = 14), colorway = plotly.colors.DEFAULT_PLOTLY_COLORS[:4], margin={'t':40,'l':60,'b':60,'r':10}, width = 800, height = 600)
fig_counts.show()

In [None]:
fig_counts.write_image('plots/figS1b.svg')
fig_counts.write_image('plots/figS1b.pdf')

## Supplementary Data File SF1

In [None]:
# Assemble Supplementary File SF1: run-length counts of every mammalian genome
# plus T2T-CHM13, as integers for repeat lengths 1-500 (in units).
counts_SF1 = dict()
for file in completed:
    genome_key = file[:-15]
    if genome_key in counts_SF1:
        continue
    run_counts = pd.read_pickle('repeat_distributions/mammalian_reference_genomes/'+file)['A']
    run_counts = run_counts.replace(0, np.nan).dropna(how = 'all')
    # Pickles split into 'long'/'short' parts contribute only the 'long' part.
    if 'long' in run_counts.index.get_level_values(0):
        run_counts = run_counts.loc['long']
    counts_SF1[genome_key] = run_counts
counts_SF1['T2T-CHM13'] = CHM13_counts['A'][repeats_1_4].replace(0, np.nan).dropna(how = 'all')
counts_SF1 = pd.concat(counts_SF1, axis=1).sort_index()
counts_SF1 = counts_SF1.dropna(how = 'all', axis=1).fillna(0).astype(int)
counts_SF1 = counts_SF1.reindex(range(1,501))

In [None]:
counts_SF1

In [None]:
counts_SF1.to_csv('repeat_distributions/SF1_mammalian_repeat_length_counts.csv')