# adapter-checks

A notebook to debug adapter content.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import os

In [None]:
def barplot(items, name):
    """
    Plot the average adapter content % per adapter sequence.

    Every path in ``items`` is read as a tab-separated table. The first three
    lines of each file are skipped, so the fourth line is used as the header;
    it must provide the ``#Name`` and ``ReadsPct`` columns, with ``ReadsPct``
    stored as text ending in ``%``. The percentages are averaged per ``#Name``
    over all rows of all files, and the 20 highest averages are drawn as a
    horizontal bar chart with the largest value at the top.

    An adapter only contributes rows for the files that list it, so a file
    that omits an adapter is NOT counted as 0 % in that adapter's mean.

    Parameters
    ----------
    items : iterable of str
        Paths of the stats files to aggregate.
    name : str
        Label inserted into the plot title.

    Raises
    ------
    ValueError
        If ``items`` is empty.
    """
    items = list(items)
    if not items:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f'No stats files given for {name!r}')

    stats_list = []
    for stats in items:
        data = pd.read_csv(stats, sep='\t', skiprows=3)
        # basename() also copes with non-'/' path separators.
        data['sample'] = os.path.basename(stats)
        stats_list.append(data)

    # ignore_index avoids carrying the per-file row numbers along as a
    # meaningless extra 'index' column.
    long_data = pd.concat(stats_list, ignore_index=True)
    # Strip the percent sign (and only that) before converting to float.
    long_data['ReadsPct'] = long_data['ReadsPct'].str.rstrip('%').astype(float)

    freq_list = (
        long_data.groupby('#Name')['ReadsPct']
        .mean()
        .sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(3, 6))
    # barh draws the first row at the bottom, so reverse the top 20 to get
    # the most frequent adapter at the top of the chart.
    freq_list.head(20).iloc[::-1].plot.barh(ax=ax)
    ax.set_title(f'Average adapter detected via bbduk ({name})')
    # The reported sum covers ALL adapters, not only the 20 that are plotted.
    fig.suptitle(f'Sum of below: {freq_list.sum():.03f}%')
    ax.set_xlabel('% detected')
    ax.set_ylabel('Adapter sequence')

In [None]:
def heatmap(items, name):
    """
    Plot a per-sample and per-adapter % heatmap.

    Every path in ``items`` is read as a tab-separated table. The first three
    lines of each file are skipped, so the fourth line is used as the header;
    it must provide the ``#Name`` and ``ReadsPct`` columns, with ``ReadsPct``
    stored as text ending in ``%``. Each file becomes one sample, labelled
    with the part of its file name before the first ``_``.

    Side effect: the full sample-by-adapter table (before any filtering) is
    written to ``<name>_adapter_percent.csv`` relative to the current working
    directory.

    The plot shows adapters as rows and samples as columns. Only adapters
    exceeding 0.001 % in at least one sample are kept, and of those only the
    first 30 in their existing (first-seen) order; they are not ranked.
    Adapters missing from a file are NaN in both the CSV and the image.

    Parameters
    ----------
    items : iterable of str
        Paths of the stats files, one per sample.
    name : str
        Label used in the plot title and as prefix of the CSV file name.

    Raises
    ------
    ValueError
        If ``items`` is empty.
    """
    # Imported locally, as in the original cell, so the function does not
    # rely on a notebook-level numpy import.
    import numpy as np

    items = list(items)
    if not items:
        raise ValueError(f'No stats files given for {name!r}')

    sample_names = []
    pct_rows = []
    for stats in items:
        data = pd.read_csv(stats, sep='\t', skiprows=3)
        # Derive the label from the path itself rather than from the first
        # table row, so a stats file without any data rows does not crash.
        sample_names.append(os.path.basename(stats).split('_')[0])
        pct_rows.append(
            data.set_index('#Name')['ReadsPct'].str.rstrip('%').astype(float)
        )

    # One row per sample; columns are the union of all adapter names.
    df = pd.DataFrame(pct_rows)
    df.index = sample_names
    # Save data
    df.to_csv(name + "_adapter_percent.csv")

    # Adapters as rows from here on.
    df = df.T
    df = df[(df > 0.001).any(axis=1)]
    df = df.iloc[:30]

    fig, ax = plt.subplots(figsize=(8, 6))
    img = ax.imshow(df, aspect='auto')
    ax.set_yticks(np.arange(len(df.index)), labels=df.index, fontsize=8)
    ax.set_xticks(np.arange(len(df.columns)), labels=df.columns, fontsize=8)

    cbar = fig.colorbar(img, ax=ax)
    cbar.ax.set_ylabel("Percentage of reads detected")

    ax.set_title(f'Average adapter detected via bbduk ({name})')
    # Put the sample labels on top of the image and rotate them so that long
    # names do not overlap.
    ax.tick_params(top=True, labeltop=True, bottom=False, labelbottom=False)
    _ = plt.setp(ax.get_xticklabels(), rotation=270, ha="right", va="center",
                 rotation_mode="anchor")


In [None]:
# Collect the stats files of the working directory. glob() returns paths in
# arbitrary order, so sort them to keep the sample order (and with it the CSV
# rows and heatmap columns) reproducible between runs.
files = sorted(glob("*_stats.*.txt"))

# Group the files by run name, i.e. the token between the last two dots of
# the file name ("<sample>_stats.<run>.txt" -> "<run>").
sorted_files = {}
for path in files:
    run_name = path.split('.')[-2]
    sorted_files.setdefault(run_name, []).append(path)

# Run names in alphabetical order.
items = sorted(sorted_files)

## Average detected adapter percentages (real adapter sequences)

This adapter reference consists of 'bbmap_adapters.fa' combined with detected adapters from bbmerge.
Both parameter combinations are shown.

In [None]:
# Run names whose stats were produced with the real adapter reference.
adapters = [run for run in items if run.startswith('adapters')]
# Bar plot for the first of them (alphabetical order).
first_run = adapters[0]
barplot(sorted_files[first_run], first_run)

In [None]:
# Bar plot for the second run of the real adapter reference.
second_run = adapters[1]
barplot(sorted_files[second_run], second_run)

## Average detected adapter percentages (random adapter sequences)

This adapter reference is completely randomly generated, with the same number of sequences as the first.
Both parameter combinations are shown.

In [None]:
# Run names whose stats were produced with the random adapter reference.
random = [run for run in items if run.startswith('random')]
# Bar plot for the first of them (alphabetical order).
first_run = random[0]
barplot(sorted_files[first_run], first_run)

In [None]:
# Bar plot for the second run of the random adapter reference.
second_run = random[1]
barplot(sorted_files[second_run], second_run)

# Per-sample heatmaps

Here we plot a heatmap of the detected adapter percentages, showing each individual sample (columns) against the top adapters (rows).

## Real adapter sequences heatmaps

In [None]:
# Run names whose stats were produced with the real adapter reference.
adapters = [run for run in items if run.startswith('adapters')]
# Heatmap for the first of them (alphabetical order).
first_run = adapters[0]
heatmap(sorted_files[first_run], first_run)

In [None]:
# Heatmap for the second run of the real adapter reference.
second_run = adapters[1]
heatmap(sorted_files[second_run], second_run)

## Random adapter sequences heatmaps

This adapter reference is completely randomly generated, with the same number of sequences as the first.
Both parameter combinations are shown.

In [None]:
# Run names whose stats were produced with the random adapter reference.
random = [run for run in items if run.startswith('random')]
# Heatmap for the first of them (alphabetical order).
first_run = random[0]
heatmap(sorted_files[first_run], first_run)

In [None]:
# Heatmap for the second run of the random adapter reference.
second_run = random[1]
heatmap(sorted_files[second_run], second_run)