# Setup

In [7]:
# Imports

import matplotlib.pyplot as plt
import numpy as np
import shutil

import utils

from collections import Counter, OrderedDict
from pathlib import Path
from pprint import pprint
from PyPDF2 import PdfWriter

In [16]:
# Paths
clusters_dir = Path('../../../all_clusters')
data_dir = Path('data')
figures_dir = Path('figures')

# Create figures directory if none exists.
# exist_ok=True already makes mkdir a no-op for an existing directory, so a
# separate exists() check is unnecessary (and would be racy).
figures_dir.mkdir(parents=True, exist_ok=True)

# Load all data
# NOTE(review): these are .pickle files read through utils.load; if it
# unpickles them, only load files from a trusted source — TODO confirm.
tloops_raw = utils.load(data_dir/'tloops_raw.pickle')
tloops_filtered = utils.load(data_dir/'tloops_filtered.pickle')
fragments_8_filtered = utils.load(data_dir/'fragments_8_filtered.pickle')

In [10]:
# Other functions

def get_res_array(sequences:list[str], res_names:list[str]=['A','U','C','G']):
    """Compute per-position residue frequencies for a set of sequences.

    Args:
        sequences: Residue strings. The length of the first sequence fixes
            the number of positions (columns) of the result.
        res_names: Residue letters to count; row ``i`` of the result belongs
            to ``res_names[i]``. The default list is only read, never mutated.

    Returns:
        Array of shape ``(len(res_names), len(sequences[0]))`` whose columns
        are each normalised to sum to 1. An empty ``sequences`` yields an
        array with zero columns instead of failing on ``sequences[0]``.

    Raises:
        ValueError: If a sequence contains a residue missing from ``res_names``.
        IndexError: If a sequence is longer than the first one.
    """
    if len(sequences) == 0:
        return np.zeros(shape=(len(res_names), 0))

    # Build the residue -> row lookup once instead of calling list.index for
    # every residue. setdefault keeps the first occurrence, like list.index.
    row_of = {}
    for row, name in enumerate(res_names):
        row_of.setdefault(name, row)

    res_array = np.zeros(shape=(len(res_names), len(sequences[0])))
    for seq in sequences:
        for pos, res in enumerate(seq):
            if res not in row_of:
                raise ValueError(
                    f'Unknown residue {res!r} at position {pos} of sequence '
                    f'{seq!r}; expected one of {list(res_names)}'
                )
            res_array[row_of[res], pos] += 1

    # Turn counts into per-position frequencies (every column sums to 1)
    res_array = res_array/res_array.sum(axis=0, keepdims=True)
    return res_array


def merge_pdfs_in_dir(dir:Path, filename:str='merged_pdf'):
    """Merge every PDF in ``dir`` into a single file next to it, then delete ``dir``.

    Args:
        dir: Directory holding the PDFs to merge. It is removed, with all of
            its contents, once the merged file has been written.
        filename: Name of the merged file, written to ``dir.parent``. A
            trailing ``.pdf`` is optional: the output always ends in exactly
            one ``.pdf`` extension.
    """
    import re  # local import: only needed for the natural-sort key below

    def natural_key(path:Path):
        # re.split with a capturing group alternates text and digit runs, so
        # odd indices are always numbers: 'cluster_2' sorts before 'cluster_10'.
        parts = re.split(r'(\d+)', path.name)
        return [int(part) if idx % 2 else part for idx, part in enumerate(parts)]

    # iterdir() yields entries in arbitrary order, so sort for a reproducible
    # page order, and skip anything that is not a PDF file (e.g. .DS_Store).
    pdf_paths = sorted(
        (item for item in dir.iterdir() if item.is_file() and item.suffix.lower() == '.pdf'),
        key=natural_key,
    )

    # Avoid a doubled extension when the caller already supplied '.pdf'
    stem = filename[:-len('.pdf')] if filename.lower().endswith('.pdf') else filename
    output_path = dir.parent/(stem + '.pdf')

    merger = PdfWriter()
    try:
        for pdf_path in pdf_paths:
            merger.append(pdf_path)
        merger.write(output_path)
    finally:
        # Release the writer's resources even if appending or writing failed
        merger.close()

    # Only reached when the merged file was written successfully
    shutil.rmtree(dir)


def format_filename(text:str):
    """Return ``text`` lower-cased with every space replaced by an underscore."""
    lowered = text.lower()
    # Splitting on a single space keeps empty pieces, so runs of spaces become
    # runs of underscores.
    return '_'.join(lowered.split(' '))

In [11]:
# Plot functions

# Plot bar graph
def plot_bar(data:dict, title:str='', dir:Path=figures_dir) -> None:
    """Draw a labelled bar chart of ``data`` and save it as a PDF.

    Args:
        data: Mapping of bar label -> bar height, drawn in iteration order.
        title: Figure title; also used (via ``format_filename``) as the file name.
        dir: Directory the PDF is written to.
    """
    n_bars = len(data)
    positions = range(n_bars)

    # Figure width grows with the number of bars
    _, axes = plt.subplots(figsize=(n_bars*0.3, 3))
    bars = axes.bar(positions, data.values(), align='edge', width=0.5)
    axes.bar_label(bars, rotation='vertical', padding=5)
    axes.set_xticks(positions, data.keys(), rotation='vertical', horizontalalignment='left')
    axes.set_title(title, y=1.1)

    # Cosmetic adjustments
    axes.margins(x=0.5/n_bars, tight=True)
    axes.spines[['right', 'top']].set_visible(False)

    save_pdf(dir/format_filename(title))


# Plot stacked bar graph
def plot_stacked_bar(x, array, labels:list[str], title:str='', dir:Path=figures_dir) -> None:
    """Draw the rows of ``array`` as stacked bars over ``x`` and save as a PDF.

    Args:
        x: Bar positions along the x-axis.
        array: One row of bar heights per layer; row ``i`` is stacked on top
            of rows ``0..i-1``.
        labels: Legend label for each row of ``array``.
        title: Figure title; also used (via ``format_filename``) as the file name.
        dir: Directory the PDF is written to.
    """
    _, axes = plt.subplots(figsize=(len(x)*0.3, 3))

    # The first layer sits directly on the x-axis
    axes.bar(x, array[0], label=labels[0])

    # Every further layer starts where the layers below it end
    for layer in range(1, len(array)):
        baseline = np.sum(array[:layer], axis=0)
        axes.bar(x, array[layer], bottom=baseline, label=labels[layer])

    axes.set_title(title)
    axes.legend(loc='upper right', bbox_to_anchor=(1.5,1.03))

    save_pdf(dir/format_filename(title))


def save_pdf(path:Path):
    """Save the current pyplot figure to ``path`` as a PDF, then close it.

    The suffix of ``path`` is replaced by ``.pdf`` (or added if there is none).
    """
    # NOTE: with_suffix swaps out everything after the last dot in the file name
    pdf_path = path.with_suffix('.pdf')
    plt.savefig(pdf_path, bbox_inches='tight')
    plt.close()

In [14]:
def fragment_stats(data, label:str ='') -> None:
    """Plot and print summary statistics for a collection of fragments.

    Saves the following PDFs to ``figures_dir``: overall sequence frequencies,
    per-cluster sequence frequencies (one page per cluster, merged into a
    single file), cluster frequencies and per-position residue frequencies.
    Finally prints the number of fragments, unique sequences and PDB IDs.

    Args:
        data: Sized collection of objects exposing ``res_seq``, ``clust_id``
            and ``pdb_id`` attributes.
        label: Prefix used in figure titles, file names and the printed summary.
    """
    res_names = ['A','U','C','G']

    # Sequence histogram
    seqs = [i.res_seq for i in data]
    seqs_dict = OrderedDict(Counter(seqs).most_common())
    plot_bar(seqs_dict, f'{label} sequence frequencies')

    # Sequence by cluster
    tmp_dir = figures_dir/'tmp'
    if tmp_dir.is_dir():
        # Drop leftovers from a previous (possibly interrupted) run
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # Group sequences by cluster in a single pass instead of rescanning
    # `data` once per cluster
    seqs_by_cluster = {}
    for fragment in data:
        seqs_by_cluster.setdefault(fragment.clust_id, []).append(fragment.res_seq)

    # Sorted so the figures are generated in a reproducible order
    for c in sorted(seqs_by_cluster):
        clust_seqs_dict = OrderedDict(Counter(seqs_by_cluster[c]).most_common())
        plot_bar(clust_seqs_dict, f'{label} cluster {c} sequence frequencies', dir=tmp_dir)
    # merge_pdfs_in_dir appends '.pdf' itself, so pass the bare name to avoid
    # writing a '.pdf.pdf' file
    merge_pdfs_in_dir(tmp_dir, format_filename(f'{label} cluster sequence frequencies'))

    # Cluster histogram
    clust_ids = [t.clust_id for t in data]
    clust_ids_dict = OrderedDict(sorted(Counter(clust_ids).items()))
    plot_bar(clust_ids_dict, f'{label} cluster frequencies')

    # Residue position histogram (same alphabet for the counts and the legend)
    res_array = get_res_array(seqs, res_names)
    plot_stacked_bar(range(len(seqs[0])), res_array, res_names, f'{label} residue position frequencies')

    # PDB IDs
    pdb_ids = [i.pdb_id for i in data]

    print(
        f'{label}:\n'
        f'- Amount: {len(data)}\n'
        f'- Unique sequences: {len(seqs_dict)}\n'
        f'- PDB IDs: {len(set(pdb_ids))}\n'
        f'Figures saved to /{figures_dir}\n'
    )

# Tetraloops
> The final tetraloop database is composed by 21,993 RNA fragments: 17,709 from X-ray, 3057 from NMR and 1227 from cryo-EM structures. The distributions of resolutions are shown in Fig. S1. [Bottaro et al.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5529312/)

According to Table SI1, there should be a total of 19383 tetraloops. I don't know where/what the remaining 2610 RNA fragments (from the total of 21993) are. The total number of "effective" tetraloops (members whose distance is above 0.07 eRMSD) is 16979.

In [17]:
# Cluster IDs per structural family, named as in Bottaro et al. Table SI1
gnra = [1]
gnra_like = [1, 3, 6, 9, 25, 26, 36, 40]
uncg = [2]
uncg_like = [2, 5, 37, 44]
u_turn = [4]

# Statistics for the two reference tetraloop datasets
fragment_stats(data=tloops_raw, label='Raw tetraloops')
fragment_stats(data=tloops_filtered, label='Filtered tetraloops')

# Generated fragments assigned to any cluster other than 0
# (presumably 0 marks "not a tetraloop" — TODO confirm)
tloop_fragments = [frag for frag in fragments_8_filtered if frag.clust_id != 0]
fragment_stats(data=tloop_fragments, label='Generated tetraloops')

Raw tetraloops:
- Amount: 19383
- Unique sequences: 292
- PDB IDs: 864
Figures saved to /figures

Filtered tetraloops:
- Amount: 11952
- Unique sequences: 292
- PDB IDs: 864
Figures saved to /figures

Generated tetraloops:
- Amount: 151586
- Unique sequences: 240
- PDB IDs: 525
Figures saved to /figures



The high number of tetraloops found in the fragment dataset is probably due to chain redundancy, as all tetraloops belonging to a PDB structure were aligned to *all* chains in said PDB structure. However, 53 unique tetraloops originally found in both the raw and filtered tetraloop datasets are lost. Where are they disappearing to? My guess right now is that the chain annotation process somehow filters out some of the tetraloops because they don't fulfill the conditions of a) having identical residue names and b) having identical residue numbers.

**TODO:** I have tried every filtering option and the discrepancy is still there; the cause remains unknown.

Is it because some of the PDB IDs are somehow being filtered out? (The generated set covers only 525 PDB IDs, versus 864 in the raw and filtered sets.)

# Fragments

## Unique fragments
The number of possible unique 8-residue RNA fragments is $4^{8} = 65536$. The theoretical maximum number of fragments that can be generated from this dataset is 495279. The number of unique generated fragments is 34431, which represents approximately 53% of all possible unique fragments. 

Expected number of tries to obtain $x$ unique sequences, assuming every sequence is equally likely (coupon collector's problem):
$$\sum\limits_{i=0}^{x-1} \frac{n}{n-i}$$
where $n$ = total number of possible unique sequences

The expected number of tries needed to obtain *all* unique fragments is 764646. The expected number of tries needed to obtain 34431 unique fragments is 48839.

The dataset is most likely too small to account for *all* possible residue permutations, but should be large enough to account for more than 53% of all possible unique fragments. For reference, a dataset of size 451750 is expected to return 99.9% of all unique fragments.

- Are certain residues/motifs overrepresented in the dataset?
- Will increasing the dataset to include more PDB structures result in more unique fragments?
- Is the sequence database too redundant? Should sequence similarity *between* different PDB structures also be filtered out of the reference database?
- Are the sequences palindromic or otherwise symmetrical?

In [21]:
# Size of the sequence space: 4 residue types at each of 8 positions
pos_unique_frags = 4 ** 8
print(f'Possible unique fragments: {pos_unique_frags}')

# Coupon-collector expectation: while `remaining` sequences are still unseen,
# the next new one takes pos_unique_frags/remaining draws on average.
expected_tries_all = sum(
    pos_unique_frags / remaining
    for remaining in range(pos_unique_frags, 0, -1)
)
print(f'Expected # tries to obtain all unique fragments: {round(expected_tries_all)}')

Possible unique fragments: 65536
Expected # tries to obtain all unique fragments: 764646
