# Detecting adapters

For datasets where the adapter is tricky, we need to be able to figure it out ourselves. The strategy is simple: for each length from 12nt to 20nt, create a dictionary of the number
    of occurrences of each sequence of that length. Then greedily select an adapter, starting from 20nt long.
    

In [199]:
%pylab inline
%load_ext autoreload
%autoreload 2
    
import Levenshtein
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import pandas as pd
import gzip
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import hamming_loss
from tqdm import tqdm
from collections import Counter, defaultdict


Populating the interactive namespace from numpy and matplotlib


ValueError: 'autoreload' was not found in history, as a file, url, nor in the user namespace.

In [151]:
def collapse_kmer_counts(kmer_counts_series, allowed_mismatches=3):
    """Find pairs of k-mers that differ in at most `allowed_mismatches` positions.

    NOTE: This method should almost never be required, since there
    are better ways to deal with finding enriched sequences in case
    the adapter is not auto-detected by trim_galore.

    The idea is straightforward: do one pass of trimming with the standard
    Illumina adapters, then use these trimmed sequences to find enriched
    sequences.

    Parameters
    ----------
    kmer_counts_series : pandas.Series
        Series whose index holds the k-mer strings, all of the same length.
        Only the index is used; the values are ignored.
    allowed_mismatches : int
        Maximum Hamming distance for a pair of k-mers to be reported.

    Returns
    -------
    indices : numpy.ndarray, shape (n_pairs, 2)
        Positions (i, j) with ``distances[i, j] <= allowed_mismatches``.
        The diagonal (i == j) and both orientations of a pair are included.
    distances : numpy.ndarray of float, shape (n_kmers, n_kmers)
        Pairwise Hamming distances (number of mismatching positions).

    Raises
    ------
    ValueError
        If the k-mers do not all have the same length.
    """
    kmers = kmer_counts_series.index.tolist()
    if not kmers:
        # Nothing to compare: return empty results of the usual shapes.
        return np.empty((0, 2), dtype=int), np.empty((0, 0), dtype=float)

    k = len(kmers[0])
    if any(len(kmer) != k for kmer in kmers):
        raise ValueError('All k-mers must have the same length')

    # One row per k-mer, one column per position; each character is viewed
    # as its 32-bit code point so positions can be compared as integers.
    codes = np.array([list(kmer) for kmer in kmers], dtype='U1').view(np.uint32)

    # Accumulate mismatches one position at a time. This keeps the peak
    # memory at a single (n, n) matrix and keeps the counts as exact
    # integers, so the threshold below is not exposed to the rounding of a
    # float fraction rescaled by k.
    ## NOTE: Levenshtein distance would be more relevant anyway
    ## (since the adapters could be present in partial)
    mismatches = np.zeros((len(kmers), len(kmers)), dtype=np.int64)
    for position in range(k):
        column = codes[:, position]
        mismatches += column[:, None] != column[None, :]

    # Callers previously received a float matrix; keep that dtype.
    distances = mismatches.astype(float)
    return np.argwhere(mismatches <= allowed_mismatches), distances
  
def get_histogram(fastq_file, adapter_length=range(12, 31), prime5=False,
                  max_seq=1000000, prime5_offset=3):
    """Tabulate how often each read-end sequence occurs in a FASTQ file.

    For every length ``k`` in `adapter_length`, one slice of each read is
    counted: the last ``k`` bases (``seq[-k:]``) by default, or, when
    `prime5` is true, the ``k`` bases starting at `prime5_offset`
    (``seq[prime5_offset:prime5_offset + k]``). Reads shorter than the
    requested slice contribute whatever shorter slice Python returns.

    Parameters
    ----------
    fastq_file : str
        Path to a FASTQ file; opened with gzip when the name ends in '.gz'.
    adapter_length : iterable of int
        Slice lengths to tabulate.
    prime5 : bool
        If true, slice from the start of the read instead of the end.
    max_seq : int
        Maximum number of reads to process.
    prime5_offset : int
        Number of leading bases skipped before the slice when `prime5` is
        true.

    Returns
    -------
    dict
        Maps each ``k`` to a pandas.Series indexed by sequence, holding the
        percentage of processed reads with that sequence, sorted in
        descending order. The percentage is relative to the number of reads
        actually processed, so files with fewer than `max_seq` reads are
        normalised correctly. If no reads were processed, every series is
        empty.
    """
    # Materialise once: the lengths are iterated for every read, which would
    # exhaust a one-shot iterable after the first read.
    lengths = list(adapter_length)
    histogram = {k: Counter() for k in lengths}

    opener = gzip.open if fastq_file.endswith('.gz') else open
    n_reads = 0
    # The context managers guarantee the file is closed even if parsing fails.
    with opener(fastq_file, 'rt') as handle, tqdm(total=max_seq) as pbar:
        for title, seq, qual in FastqGeneralIterator(handle):
            if n_reads >= max_seq:
                break
            n_reads += 1
            for k in lengths:
                if prime5:
                    k_seq = seq[prime5_offset:prime5_offset + k]
                else:
                    k_seq = seq[-k:]
                histogram[k][k_seq] += 1
            pbar.update()

    if n_reads == 0:
        # Avoid dividing by zero on an empty file.
        return {k: pd.Series(dtype=float) for k in lengths}

    histogram_series = {}
    for k, counter in histogram.items():
        sorted_counts = pd.Series(counter).sort_values(ascending=False)
        histogram_series[k] = sorted_counts / n_reads * 100
    return histogram_series

def hamming_distance(x, y):
    """Calculate the Hamming distance between two equal-length strings.

    The distance is the number of positions at which `x` and `y` hold
    different characters. Any characters are accepted (e.g. nucleotide
    strings such as 'ACGT'), not only strings of binary digits.

    Parameters
    ----------
    x, y : str
        Strings of the same length.

    Returns
    -------
    int
        Number of mismatching positions.

    Raises
    ------
    AssertionError
        If the two strings differ in length.
    """
    assert len(x) == len(y), 'strings must have the same length'
    # Position-wise comparison; for binary-digit strings this equals the
    # number of set bits in the XOR of the two values.
    return sum(a != b for a, b in zip(x, y))



In [198]:
a = [1,2,3,4,5,6]
a[-3-0:0]

[]

In [157]:
a = {'a': Counter(), 'b': Counter()}
a['a']['a'] += 1
a['a']['b'] += 1


In [195]:
a['a'].most_common()/30

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [185]:
# Input FASTQ files; most of these are passed to get_histogram() in later cells.
# NOTE(review): these are absolute, machine-specific paths, so the notebook
# only runs where these files exist.
# NOTE(review): "17nt"/"13nt" in the names presumably refer to the adapter
# length of each dataset -- confirm.
fastq_17nt_adapt = '/staging/as/skchoudh/re-ribo-analysis/hg38/SRP098789/sratofastq/SRR5227288.fastq.gz'
# Same run (SRR5227288) as above, taken from the "preprocessed" directory.
fastq_17nt_adapt_trimmed = '/staging/as/skchoudh/re-ribo-analysis/hg38/SRP098789/preprocessed/SRR5227288_trimmed.fq.gz'
fastq_13nt_adapt = '/home/cmb-06//as/skchoudh/dna/Dec_12_2017_Penalva_RPS5_Riboseq/Penalva_L_12112017/RPS5_C2_S2_L001_R1_001.fastq.gz'
fastq_amb = '/staging/as/skchoudh/re-ribo-analysis/hg38/SRP031501_human_remap_v2/sratofastq/SRR1562541.fastq.gz'
# indices, distances = collapse_kmer_counts(histogram_series[22])
# Uncompressed FASTQ (no .gz suffix), so get_histogram() reads it with plain open().
fastq_erx = '/staging/as/wenzhenl/re-ribo-data/ERP005378/ERX432360/ERR466125.fastq'

# NOTE(review): not referenced by any cell visible in this part of the notebook.
fastq_5prime = '/staging/as/skchoudh/re-ribo-analysis/hg38/SRP017942/preprocessed/SRR648669_trimmed.fq.gz'

In [186]:
histogram_5prime = get_histogram(fastq_amb, range(2, 17), True)

100%|██████████| 1000000/1000000 [00:33<00:00, 29441.37it/s]


In [194]:
histogram_5prime[12]

AAACCATTCGTA    5.7625
TACACGGAGTCG    5.1065
AACCATTCGTAG    4.7467
CCCGGGGCTACG    4.5125
GCGGGGCGCGGG    1.5839
ACCATTCGTAGA    1.3973
GCCGCGACCGGC    1.0804
TCCGCCCGGAGG    1.0590
GGGCTACGCCTG    1.0378
ACTTCTTAGAGG    0.9887
CGGGGCTACGCC    0.9678
CCGCCCGGAGGA    0.8793
TCGCCGCGCTCT    0.8411
CCGGGGCTACGC    0.7693
CCGAGGGCGCAC    0.7187
GCGCCGCGACCG    0.6892
CCTGGATACCGC    0.5999
GGGGGCCCAAGT    0.5241
GAGTCGAGCTCA    0.4639
CGTAGACGACCT    0.4553
GGGGCTACGCCT    0.4183
CCGCGACCGGCT    0.4143
CCACGCAGTTTT    0.4115
CGCCGCGACCGG    0.4049
CGGCGTCCGGTG    0.3977
GGGGGGCCCAAG    0.3806
TAGGCACCATCA    0.3700
GGCTACGCCTGT    0.3573
CCATTCGTAGAC    0.3452
CGGGGCGCGGGA    0.3276
                 ...  
GCTCAGTACAGC    0.0001
GCTCAGGTTCTG    0.0001
GCTCAGGTGGAA    0.0001
GCTCAGGTGATT    0.0001
GCTCAGTTCGGG    0.0001
GCTCAGTTGAAA    0.0001
GCTCAGTTGTGA    0.0001
GCTCAGTTTAGA    0.0001
GCTCATCAAGAA    0.0001
GCTCATCAACCG    0.0001
GCTCATCAAAGC    0.0001
GCTCATCAAAAG    0.0001
GCTCATATTGG

In [179]:
histogram_amb = get_histogram(fastq_amb)

100%|██████████| 1000000/1000000 [00:35<00:00, 28169.49it/s]


In [202]:
x = histogram_amb[12].cumsum()
histogram_amb[12][x<=50]

AAAAAAAAAAAA    39.8512
AAAAAAAAAACA     5.1158
AAAAAAAAACAA     4.9384
dtype: float64

In [78]:
histogram_17nt = get_histogram(fastq_17nt_adapt)

100%|██████████| 1000000/1000000 [00:30<00:00, 32853.45it/s]


In [136]:
histogram_erx = get_histogram(fastq_erx, range(2, 8))

100%|██████████| 1000000/1000000 [00:11<00:00, 84375.09it/s]


In [152]:
histogram_erx_5prime = get_histogram(fastq_erx, range(2, 17), True)

100%|██████████| 1000000/1000000 [00:21<00:00, 45788.32it/s]


In [156]:
histogram_erx_5prime[10]

ATAAAATACA    0.4430
TTAAAATACA    0.3127
ATTAAATACA    0.2805
ATAATATACA    0.2574
ATGAAATACA    0.2564
ATATAATACA    0.2515
ATAAAGTACA    0.2510
CTAAAATACA    0.2409
GTAAAATACA    0.2398
ATAAATTACA    0.2322
ATACAATACA    0.2286
ATAACATACA    0.2188
TTTAAATACA    0.2143
ATCAAATACA    0.2119
ATAAGATACA    0.2110
TTAATATACA    0.2017
ATAGAATACA    0.1991
ATAAACTACA    0.1981
TTATAATACA    0.1882
TTAAAGTACA    0.1850
TTAAATTACA    0.1819
ATAATGTACA    0.1786
ATTATATACA    0.1749
ATTAAGTACA    0.1746
ATTTAATACA    0.1735
TTAACATACA    0.1727
TTGAAATACA    0.1706
TTACAATACA    0.1701
GTTAAATACA    0.1670
ATTAATTACA    0.1641
               ...  
GTACGCGACT    0.0001
GTACGCAACG    0.0001
GTACGCAACA    0.0001
GTACTCCACT    0.0001
GTACTCTAGG    0.0001
GTAGAATAGT    0.0001
GTACTTAACA    0.0001
GTAGAATAAC    0.0001
GTAGAAGACT    0.0001
GTAGAACACC    0.0001
GTAGAAAACC    0.0001
GTACTTTCCA    0.0001
GTACTTTATG    0.0001
GTACTTTAGG    0.0001
GTACTTTAGC    0.0001
GTACTTCACC    0.0001
GTACTTCACA   

In [137]:
histogram_erx[3]

CGC    8.0965
AGC    5.7866
ACC    4.9230
GCG    4.6937
AAC    3.7450
TGC    3.5676
CAC    3.3619
AGT    3.2964
AAT    3.1778
ACG    3.0247
AAG    2.5393
AGG    2.3214
CGG    2.2242
ATG    2.1548
ATC    1.8372
GCT    1.8144
CCC    1.7975
GAG    1.7856
TGT    1.7405
GGC    1.7192
GCC    1.7064
CCG    1.6525
CGT    1.6482
GAC    1.6314
ACT    1.5139
TGG    1.4918
CAG    1.4762
CAA    1.3454
TCG    1.3355
GGG    1.3122
        ...  
TTA    0.0621
NNG    0.0073
NNC    0.0061
NNT    0.0038
GNG    0.0030
NGC    0.0019
NTG    0.0018
NTC    0.0015
NAT    0.0014
NCG    0.0014
NAG    0.0013
NGG    0.0013
NAC    0.0011
ANG    0.0010
NCT    0.0009
NGT    0.0009
ANC    0.0008
CNC    0.0008
GNC    0.0007
NCC    0.0007
ANT    0.0006
NTT    0.0006
CNA    0.0003
CNG    0.0003
TNG    0.0003
CNT    0.0003
TNT    0.0002
GNT    0.0002
GNA    0.0002
TNC    0.0001
Length: 92, dtype: float64

In [109]:
histogram_17nt_trimmed = get_histogram(fastq_17nt_adapt_trimmed, range(1,20))

100%|██████████| 1000000/1000000 [00:26<00:00, 38008.10it/s]


In [79]:
histogram_13nt = get_histogram(fastq_13nt_adapt)

100%|██████████| 1000000/1000000 [00:27<00:00, 36683.91it/s]


In [None]:
 ATCAATAGATCGG
CATCAATAGATCG

In [126]:
histogram_17nt[24]

CTGTAGGCACCATCAATAGATCGG    12.8822
TGTAGGCACCATCAATAGATCGGA     5.9053
GGGCTGTAGGCACCATCAATAGAT     5.2111
TAGGCACCATCAATAGATCGGAAG     5.1466
GCTGTAGGCACCATCAATAGATCG     5.1173
GTAGGCACCATCAATAGATCGGAA     4.0117
AGGCACCATCAATAGATCGGAAGA     3.9247
GGCTGTAGGCACCATCAATAGATC     3.3850
TCTGTAGGCACCATCAATAGATCG     3.0782
CCTGTAGGCACCATCAATAGATCG     2.6515
ACTGTAGGCACCATCAATAGATCG     1.9075
TCGCTCTGTAGGCACCATCAATAG     1.8494
GTCGCTCTGTAGGCACCATCAATA     1.7734
GTCGGCACCATCAATAGATCGGAA     1.6818
TGGGCTGTAGGCACCATCAATAGA     1.6517
TGTCGGCACCATCAATAGATCGGA     1.6240
GGCACCATCAATAGATCGGAAGAG     1.5730
GACTGTAGGCACCATCAATAGATC     1.4337
CGCTCTGTAGGCACCATCAATAGA     1.1283
CACTGTAGGCACCATCAATAGATC     0.9891
AGCTGTAGGCACCATCAATAGATC     0.9823
CGCTGTAGGCACCATCAATAGATC     0.9644
CTGGGCTGTAGGCACCATCAATAG     0.9383
TGCTGTAGGCACCATCAATAGATC     0.9232
GCACCATCAATAGATCGGAAGAGC     0.8923
TCGCTGTAGGCACCATCAATAGAT     0.8361
CACCATCAATAGATCGGAAGAGCA     0.7846
CCCTGTAGGCACCATCAATAGATC    

In [120]:
histogram_17nt_trimmed[13]


AGGCACCATCAAT    56.3403
AGGCCCCATCAAT     9.2095
CGGTGAGGCGGGG     3.9172
CCCGGTGAGGCGG     2.0312
CCGGTGAGGCGGG     1.4687
AGGCACCCTCAAT     1.4406
GGCTGTAGGCACC     1.0358
GTAGGCACCATCA     0.8812
GCGGGGCTGTAGG     0.7997
GGTGAGGCGGGGC     0.7734
ACCCGGTGAGGCG     0.7553
GAGGCGGGGCTGT     0.5164
GACCGGCTCCGGG     0.4308
CGGGACGGCTGGG     0.4012
GACCCGGTGAGGC     0.2995
CACTGACCCGGTG     0.2917
CCGGGACGGCTGG     0.2624
TGTAGGCACCATC     0.2487
ACGGCTGGGCTGT     0.2446
GTGAGGCGGGGCT     0.2386
TGACCCGGTGAGG     0.2317
GTAGGCCCCATCA     0.2214
TCCGGGACGGCTG     0.2126
CTCCGGGACGGCT     0.2092
TAGCACCATCAAT     0.2077
TGAGGCGGGGCTG     0.1990
TAGGCACATCAAT     0.1938
GCTCCGGGACGGC     0.1879
GGGACGGCTGGGC     0.1872
CGGCTCCGGGACG     0.1868
                  ...   
GAGGCCCCATCAT     0.0001
TCTGAGCGTCCTG     0.0001
GAGGCCCTCGGCT     0.0001
GAGGATTCAACCC     0.0001
GAGGATGTGGTCT     0.0001
GAGGATGGGAAGA     0.0001
GAGGATACGCCTT     0.0001
GAGGAGGAAATGG     0.0001
GAGGAGGAGCTGT     0.0001


In [204]:
len('CTGTAGGCACCATCAAT')

17

In [107]:
histogram_17nt_trimmed[14]
   TAGGCACCATCAAT
    AGGCACCATCAAT  
CTGTAGGCACCATCAAT    
CTGTAGGCACCATCAAT
CTGTAGGCACCATCAATAGATCGG
                 AGATCGGAAGAGC

TAGGCACCATCAAT    56.1691
TAGGCCCCATCAAT     9.1797
CCGGTGAGGCGGGG     3.9039
ACCCGGTGAGGCGG     2.0184
CCCGGTGAGGCGGG     1.4643
TAGGCACCCTCAAT     1.4355
GGGCTGTAGGCACC     1.0169
TGTAGGCACCATCA     0.8768
GGCGGGGCTGTAGG     0.7988
CGGTGAGGCGGGGC     0.7708
GACCCGGTGAGGCG     0.7267
TGAGGCGGGGCTGT     0.5148
CGACCGGCTCCGGG     0.4255
CCGGGACGGCTGGG     0.4005
TGACCCGGTGAGGC     0.2988
TCACTGACCCGGTG     0.2917
TCCGGGACGGCTGG     0.2623
CTGTAGGCACCATC     0.2468
GACGGCTGGGCTGT     0.2416
GGTGAGGCGGGGCT     0.2382
CTGACCCGGTGAGG     0.2302
TGTAGGCCCCATCA     0.2206
CTCCGGGACGGCTG     0.2123
GCTCCGGGACGGCT     0.2085
GTAGCACCATCAAT     0.2073
GTGAGGCGGGGCTG     0.1984
GTAGGCACATCAAT     0.1936
GGCTCCGGGACGGC     0.1871
CGGGACGGCTGGGC     0.1865
CCGGCTCCGGGACG     0.1856
                   ...   
GAGTGGCAGCGGCA     0.0001
GAGTCGGGTTGCTC     0.0001
TCTGTANGCACCAT     0.0001
GAGTCGGGGTCTGT     0.0001
GAGTCCAGCCTCTT     0.0001
TCTGTCTGGCACTT     0.0001
GAGTATCCCNCTGT     0.0001
GAGTATGAGGAC

In [84]:
hamming_loss(np.array(list('AGAGCACACGTC')).view(np.uint32), np.array(list('GAGCACACGTCT')).view(np.uint32) )

1.0

In [100]:
Levenshtein.distance('AGATCGGAAGAGCACACGTCTG', 'GAGATCGGAAGAGCACACGTCT')

2

In [102]:
histogram_13nt[26]

GAGCGAGATCGGAAGAGCACACGTCT    5.8217
GAGCGAAGATCGGAAGAGCACACGTC    4.8265
AGGCTTAGATCGGAAGAGCACACGTC    3.0751
GAGATCGGAAGAGCACACGTCTGAAC    2.5127
AGATCGGAAGAGCACACGTCTGAACT    1.9054
CGGGGATTAGATCGGAAGAGCACACG    1.5329
GCTGGGAGATCGGAAGAGCACACGTC    1.3607
TTGAAGATCGGAAGAGCACACGTCTG    1.2841
TAGGCTTAGATCGGAAGAGCACACGT    1.0785
GATCGGAAGAGCACACGTCTGAACTC    1.0202
CCCGCTGAAAGATCGGAAGAGCACAC    0.8253
CAGATCGGAAGAGCACACGTCTGAAC    0.7178
AGCGAGATCGGAAGAGCACACGTCTG    0.7073
GGAGCGAAGATCGGAAGAGCACACGT    0.7013
CCCGCTGAAGATCGGAAGAGCACACG    0.6926
GGAGATCGGAAGAGCACACGTCTGAA    0.6853
TGAGATCGGAAGAGCACACGTCTGAA    0.6772
TTGAGATCGGAAGAGCACACGTCTGA    0.6435
GAGCAGATCGGAAGAGCACACGTCTG    0.6398
CGAGATCGGAAGAGCACACGTCTGAA    0.5951
ATCGGAAGAGCACACGTCTGAACTCC    0.5717
GGCTGGGAGATCGGAAGAGCACACGT    0.5580
GGAGCGAGATCGGAAGAGCACACGTC    0.5409
GGCTTAGATCGGAAGAGCACACGTCT    0.5215
GGGAGCGAGATCGGAAGAGCACACGT    0.5014
AAGATCGGAAGAGCACACGTCTGAAC    0.4958
GTCGCTAGATCGGAAGAGCACACGTC    0.4915
G