In [1]:
# Line magic: pulls numpy and matplotlib names into the interactive namespace
# (see the "Populating the interactive namespace" message in the output).
# NOTE(review): none of the function cells visible here rely on those
# star-imported names -- confirm against the rest of the notebook.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from collections import Counter
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from tqdm import tqdm
import gzip
import pandas as pd
from IPython.display import HTML

In [3]:
def fastq_kmer_histogram_prime3(fastq_file,
                                kmer_length=range(5, 31),
                                five_prime=False,
                                max_seq=1000000,
                                offset=0):
    """Tabulate the most frequent kmers found at the ends of fastq reads.

    For every read (up to ``max_seq`` reads) and every ``k`` in
    ``kmer_length``, one kmer is taken from the 3' end (default) or the
    5' end of the read and counted.

    Parameters
    ----------
    fastq_file: string
                Location of .fastq or .fastq.gz (a path ending in '.gz' is
                opened with gzip, anything else as plain text)
    kmer_length: iterable of int
                 kmer lengths to consider (consumed once, so a one-shot
                 iterator is fine)
    five_prime: bool
                Should consider sequences from 5' end?
                Default: False (uses sequence from 3' end)
    max_seq: int
             Maximum number of sequences to consider
    offset: int
            Offset to ignore at the 5' end or 3'end
            Example: If the offset is 3, the first 3 bases will be skipped
            and kmers will start only from the 4th position
            For 3'end if the offset is 3, the last 3 bases will be skipped

    Returns
    -------
    kmers: dict
           Maps each k to a Series indexed by kmer, sorted from most to
           least frequent. Values are percentages of the reads that were
           actually processed (which is fewer than ``max_seq`` when the
           file is shorter). Empty Series if no read was processed.

    Notes
    -----
    Reads shorter than ``k + offset`` contribute a truncated kmer (plain
    Python slicing), so an index entry can be shorter than ``k``.
    """
    # Materialise once: the lengths are iterated for every read, which would
    # silently yield nothing on later passes if a one-shot iterator was given.
    kmer_lengths = list(kmer_length)
    histogram = {k: Counter() for k in kmer_lengths}

    # Decide on the file *suffix*; a '.gz' elsewhere in the path (e.g. in a
    # directory name) must not trigger gzip decoding.
    fastq_path = str(fastq_file)
    if fastq_path.endswith('.gz'):
        handle = gzip.open(fastq_path, 'rt')
    else:
        handle = open(fastq_path, 'r')

    n_reads = 0
    # `with handle` guarantees the file is closed even if parsing fails or
    # the cell is interrupted part-way through.
    with handle, tqdm(total=max_seq) as pbar:
        for _title, seq, _qual in FastqGeneralIterator(handle):
            if n_reads >= max_seq:
                break
            n_reads += 1
            for k in kmer_lengths:
                if five_prime:
                    k_seq = seq[offset:k + offset]
                elif offset:
                    k_seq = seq[-k - offset:-offset]
                else:
                    # seq[-k:-0] would be empty, hence the separate case.
                    k_seq = seq[-k:]
                histogram[k][k_seq] += 1
            pbar.update()

    kmers = {}
    for k, counts in histogram.items():
        if n_reads:
            # Normalise by the reads actually seen, not by the requested cap.
            ranked = pd.Series(counts).sort_values(ascending=False)
            kmers[k] = ranked / n_reads * 100
        else:
            kmers[k] = pd.Series(dtype=float)
    return kmers

In [4]:
def fastq_kmer_histogram_prime5(fastq_file,
                                kmer_length=range(5, 31),
                                five_prime=False,
                                max_seq=1000000,
                                offset=0):
    """Tabulate the most frequent kmers found at the 5' end of fastq reads.

    For every read (up to ``max_seq`` reads) and every ``k`` in
    ``kmer_length``, the kmer ``seq[offset:offset + k]`` is counted.

    Parameters
    ----------
    fastq_file: string
                Location of .fastq or .fastq.gz (a path ending in '.gz' is
                opened with gzip, anything else as plain text)
    kmer_length: iterable of int
                 kmer lengths to consider (consumed once, so a one-shot
                 iterator is fine)
    five_prime: bool
                Has no effect: kmers are always taken from the 5' end,
                whatever this is set to. Kept only so existing calls keep
                working.
    max_seq: int
             Maximum number of sequences to consider
    offset: int
            Number of bases to skip at the 5' end
            Example: If the offset is 3, the first 3 bases will be skipped
            and kmers will start only from the 4th position

    Returns
    -------
    kmers: dict
           Maps each k to a Series indexed by kmer, sorted from most to
           least frequent. Values are percentages of the reads that were
           actually processed (which is fewer than ``max_seq`` when the
           file is shorter). Empty Series if no read was processed.

    Notes
    -----
    Reads shorter than ``k + offset`` contribute a truncated kmer (plain
    Python slicing), so an index entry can be shorter than ``k``.
    """
    # Materialise once: the lengths are iterated for every read, which would
    # silently yield nothing on later passes if a one-shot iterator was given.
    kmer_lengths = list(kmer_length)
    histogram = {k: Counter() for k in kmer_lengths}

    # Decide on the file *suffix*; a '.gz' elsewhere in the path (e.g. in a
    # directory name) must not trigger gzip decoding.
    fastq_path = str(fastq_file)
    if fastq_path.endswith('.gz'):
        handle = gzip.open(fastq_path, 'rt')
    else:
        handle = open(fastq_path, 'r')

    n_reads = 0
    # `with handle` guarantees the file is closed even if parsing fails or
    # the cell is interrupted part-way through.
    with handle, tqdm(total=max_seq) as pbar:
        for _title, seq, _qual in FastqGeneralIterator(handle):
            if n_reads >= max_seq:
                break
            n_reads += 1
            for k in kmer_lengths:
                # With offset == 0 this is simply seq[:k].
                histogram[k][seq[offset:offset + k]] += 1
            pbar.update()

    kmers = {}
    for k, counts in histogram.items():
        if n_reads:
            # Normalise by the reads actually seen, not by the requested cap.
            ranked = pd.Series(counts).sort_values(ascending=False)
            kmers[k] = ranked / n_reads * 100
        else:
            kmers[k] = pd.Series(dtype=float)
    return kmers

In [101]:
# Input reads, gzip-compressed fastq (per the .fastq.gz suffix).
# NOTE(review): absolute, machine-specific path -- point this at a local copy
# (or a configurable data directory) before re-running the notebook.
fastq = '/staging/as/wenzhenl/re-ribo-analysis/SRP098789_human/sratofastq/SRR5227313.fastq.gz'

In [102]:
# 3'-end kmer histograms for k = 5..29 (range(5, 30) excludes 30); every other
# argument keeps its default, so at most the first 1,000,000 reads are used.
histogram3 = fastq_kmer_histogram_prime3(fastq, range(5, 30))

100%|██████████| 1000000/1000000 [00:37<00:00, 26433.10it/s]


In [105]:
# 25-mers taken from the 3' end of the reads, most frequent first; values are
# the percentages computed by fastq_kmer_histogram_prime3.
histogram3[25]

CTGTAGGCACCATCAATAGATCGGA    3.3576
CCTGTAGGCACCATCAATAGATCGG    1.9011
ACTGTAGGCACCATCAATAGATCGG    1.8399
TCTGTAGGCACCATCAATAGATCGG    1.5492
GCTGTAGGCACCATCAATAGATCGG    1.0938
CCCTGTAGGCACCATCAATAGATCG    0.9382
CAACGCGACTGTAGGCACCATCAAT    0.9229
GCCTGTAGGCACCATCAATAGATCG    0.8400
CTCTGTAGGCACCATCAATAGATCG    0.8225
GACTGTAGGCACCATCAATAGATCG    0.8213
TCCTGTAGGCACCATCAATAGATCG    0.7803
TTCTGTAGGCACCATCAATAGATCG    0.6859
CACTGTAGGCACCATCAATAGATCG    0.6241
ACCTGTAGGCACCATCAATAGATCG    0.6052
AACTGTAGGCACCATCAATAGATCG    0.6030
TGTAGGCACCATCAATAGATCGGAA    0.5881
GTCTGTAGGCACCATCAATAGATCG    0.5138
ATCTGTAGGCACCATCAATAGATCG    0.5009
GGCTGTAGGCACCATCAATAGATCG    0.4746
TGCTGTAGGCACCATCAATAGATCG    0.4688
TACTGTAGGCACCATCAATAGATCG    0.3980
AGCTGTAGGCACCATCAATAGATCG    0.3646
CCCCTGTAGGCACCATCAATAGATC    0.3174
GGCCTGTAGGCACCATCAATAGATC    0.3136
GCCCTGTAGGCACCATCAATAGATC    0.2973
CTCCTGTAGGCACCATCAATAGATC    0.2956
GAGCACACGTCTGAACTCCAGTCAC    0.2951
CCTCTGTAGGCACCATCAATAGATC   

In [80]:
# 5'-end kmer histograms with all defaults (k = 5..30, up to 1,000,000 reads).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `histogram5` may be undefined in the saved kernel state -- re-run to completion.
histogram5 = fastq_kmer_histogram_prime5(fastq)

 13%|█▎        | 129022/1000000 [00:06<00:42, 20597.19it/s]


KeyboardInterrupt: 

In [22]:
# Length check of a candidate sequence (evaluates to 21).
# NOTE(review): presumably an adapter sequence being compared against the
# enriched kmers -- confirm and record its source.
len('TCGTATGCCGTCTTCTGCTTG')

21