In [20]:
# Render matplotlib figures inline; %pylab also populates the interactive
# namespace from numpy and matplotlib (see the message printed below).
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [21]:
from collections import Counter
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from tqdm import tqdm
import gzip
import pandas as pd
from IPython.display import HTML

In [22]:
def fastq_kmer_histogram_prime3(fastq_file,
                                kmer_length=range(5, 31),
                                five_prime=False,
                                max_seq=1000000,
                                offset=0):
    """Get a histogram of the kmers found at one end of the reads in a fastq file

    Every processed read contributes one subsequence per requested length;
    counts are reported as a percentage of the reads actually processed.
    Reads shorter than ``k + offset`` yield a truncated (possibly empty)
    subsequence, which is counted under ``k`` as-is.

    Parameters
    ----------
    fastq_file: string
                Location of .fastq or .fastq.gz
                (a name ending in ``.gz`` is opened with gzip)
    kmer_length: iterable of int
                 Kmer lengths to consider
    five_prime: bool
                Should consider sequences from 5' end?
                Default: False (uses sequence from 3' end)
    max_seq: int
             Maximum number of sequences to consider.
             No read is processed when it is zero or negative.
    offset: int
            Offset to ignore at the 5' end or 3'end
            Example: If the offset is 3, the first 3 bases will be skipped
            and kmers will start only from the 4th position
            For 3'end if the offset is 3, the last 3 bases will be skipped
    Returns
    -------
    kmers: dict
           Maps each kmer length to a Series indexed by kmer that holds the
           percentage of processed reads showing that kmer, sorted with the
           most frequent kmer first
    """
    # Materialise the lengths once: they are walked again for every read,
    # which a one-shot iterable such as a generator would not survive.
    lengths = list(kmer_length)
    histogram = {k: Counter() for k in lengths}

    # Decide on gzip from the extension only; a '.gz' elsewhere in the path
    # (for instance in a directory name) must not trigger gzip decoding.
    opener = gzip.open if str(fastq_file).endswith('.gz') else open

    n_reads = 0
    # The context managers release the file and the progress bar even when
    # parsing raises part-way through.
    with opener(fastq_file, 'rt') as handle, tqdm(total=max_seq) as pbar:
        if max_seq > 0:
            for _title, seq, _qual in FastqGeneralIterator(handle):
                for k in lengths:
                    if five_prime:
                        k_seq = seq[offset:k + offset]
                    elif offset:
                        k_seq = seq[-k - offset:-offset]
                    else:
                        # seq[-k:-0] would be empty, so a zero offset needs
                        # its own slice.
                        k_seq = seq[-k:]
                    histogram[k][k_seq] += 1
                n_reads += 1
                pbar.update()
                if n_reads >= max_seq:
                    # Stop before pulling another record from the file.
                    break

    # Normalise by the reads actually seen so that the percentages stay
    # correct when the file holds fewer than max_seq reads. With no reads
    # every counter is empty, so the fallback divisor is never applied to data.
    denominator = n_reads or 1
    kmers = {}
    for k, counts in histogram.items():
        ranked = pd.Series(counts, dtype='float64').sort_values(ascending=False)
        kmers[k] = ranked / denominator * 100
    return kmers

In [23]:
def fastq_kmer_histogram_prime5(fastq_file,
                                kmer_length=range(5, 31),
                                five_prime=False,
                                max_seq=1000000,
                                offset=0):
    """Get a histogram of the kmers found at the 5' end of the reads in a fastq file

    Every processed read contributes the subsequence ``seq[offset:k + offset]``
    for each requested length ``k``; counts are reported as a percentage of
    the reads actually processed. Reads shorter than ``k + offset`` yield a
    truncated (possibly empty) subsequence, which is counted under ``k`` as-is.
    The extraction matches what fastq_kmer_histogram_prime3 does when called
    with ``five_prime=True``.

    Parameters
    ----------
    fastq_file: string
                Location of .fastq or .fastq.gz
                (a name ending in ``.gz`` is opened with gzip)
    kmer_length: iterable of int
                 Kmer lengths to consider
    five_prime: bool
                Accepted for signature compatibility only; this function
                always reads from the 5' end whatever value is given
    max_seq: int
             Maximum number of sequences to consider.
             No read is processed when it is zero or negative.
    offset: int
            Number of bases to ignore at the 5' end
            Example: If the offset is 3, the first 3 bases will be skipped
            and kmers will start only from the 4th position
    Returns
    -------
    kmers: dict
           Maps each kmer length to a Series indexed by kmer that holds the
           percentage of processed reads showing that kmer, sorted with the
           most frequent kmer first
    """
    # Materialise the lengths once: they are walked again for every read,
    # which a one-shot iterable such as a generator would not survive.
    lengths = list(kmer_length)
    histogram = {k: Counter() for k in lengths}

    # Decide on gzip from the extension only; a '.gz' elsewhere in the path
    # (for instance in a directory name) must not trigger gzip decoding.
    opener = gzip.open if str(fastq_file).endswith('.gz') else open

    n_reads = 0
    # The context managers release the file and the progress bar even when
    # parsing raises part-way through.
    with opener(fastq_file, 'rt') as handle, tqdm(total=max_seq) as pbar:
        if max_seq > 0:
            for _title, seq, _qual in FastqGeneralIterator(handle):
                for k in lengths:
                    # With offset == 0 this is simply seq[:k], so a single
                    # slice covers every case the function supports.
                    histogram[k][seq[offset:k + offset]] += 1
                n_reads += 1
                pbar.update()
                if n_reads >= max_seq:
                    # Stop before pulling another record from the file.
                    break

    # Normalise by the reads actually seen so that the percentages stay
    # correct when the file holds fewer than max_seq reads. With no reads
    # every counter is empty, so the fallback divisor is never applied to data.
    denominator = n_reads or 1
    kmers = {}
    for k, counts in histogram.items():
        ranked = pd.Series(counts, dtype='float64').sort_values(ascending=False)
        kmers[k] = ranked / denominator * 100
    return kmers

In [90]:
# Gzipped FASTQ file fed to the k-mer histogram calls below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
fastq = '/staging/as/wenzhenl/re-ribo-analysis/SRP003554_mouse/sratofastq/SRR065779.fastq.gz'

In [91]:
# 3'-end k-mer table for k = 5..29 (range(5, 30)); the remaining arguments
# keep the function defaults (five_prime=False, max_seq=1000000, offset=0).
histogram3 = fastq_kmer_histogram_prime3(fastq, range(5, 30))

100%|██████████| 1000000/1000000 [00:44<00:00, 22495.59it/s]


In [162]:
# Same input through the 5'-end variant with every argument left at its
# default, i.e. k = 5..30 (range(5, 31)) and max_seq=1000000.
histogram5 = fastq_kmer_histogram_prime5(fastq)

100%|██████████| 1000000/1000000 [00:37<00:00, 26879.39it/s]


In [92]:
# Display the Series stored under key 7 (the 7-mers) of the 3'-end result.
histogram3[7]

TCGTATG    18.8092
GTCGTAT     6.5643
CTCGTAT     5.2163
GGAATTC     5.2101
CGTATGC     4.6804
TTCGTAT     4.3090
TATGCCG     4.2500
ATCGTAT     3.5390
GTATGCC     2.5324
CGCACTC     2.1907
GGAATCG     1.8909
GAATTCG     1.5268
ACATCGT     1.5255
CGCATCG     1.3466
TCGCACT     1.0576
GGTCGTA     1.0263
TGTCGTA     1.0243
TCGCATC     1.0068
AGTCGTA     0.9477
TCGTATT     0.8713
TCTTATG     0.7547
AGCTCTC     0.7468
ACATGTC     0.6886
CGCTCGT     0.6379
ATGCCGT     0.5927
CATCGTA     0.5782
CTTCGTA     0.5743
GGAATAT     0.5188
ACACATC     0.5172
TCGTCTG     0.5023
            ...   
GAGTCCT     0.0001
TTNTTCC     0.0001
GAGTCTA     0.0001
GAGTGAA     0.0001
TTNNNNT     0.0001
GAGCTTA     0.0001
GAGTTGG     0.0001
GAGTTTG     0.0001
GAGTTTT     0.0001
GANNNCG     0.0001
GANTGCA     0.0001
GATAACG     0.0001
GAGTATT     0.0001
GAGTATC     0.0001
GAGTAGA     0.0001
GAGNNNN     0.0001
GAGGTTT     0.0001
GAGGTAG     0.0001
GAGGTAC     0.0001
GAGGGGC     0.0001
GAGGGCT     0.0001
GAGGGCG     

In [166]:
class PDF(object):
    """Wrap the location of a PDF so that IPython can render it.

    IPython looks up ``_repr_html_`` / ``_repr_latex_`` on displayed objects;
    this class answers with an ``<iframe>`` pointing at the file for HTML
    front ends and an ``\\includegraphics`` command for LaTeX output.

    Parameters
    ----------
    pdf: string
         Location of the PDF, used verbatim as the iframe source / LaTeX path
    size: sequence of two items
          ``(width, height)`` written to the iframe attributes
          Default: (200, 200)
    """

    def __init__(self, pdf, size=(200, 200)):
        self.pdf = pdf
        self.size = size

    def _repr_html_(self):
        """Return an ``<iframe>`` tag showing the PDF at the configured size."""
        # Local import keeps the class self-contained within its notebook cell.
        from html import escape

        # Attribute values are quoted and escaped so that locations holding
        # spaces or HTML-special characters still give well-formed markup.
        return '<iframe src="{0}" width="{1}" height="{2}"></iframe>'.format(
            escape(str(self.pdf), quote=True),
            escape(str(self.size[0]), quote=True),
            escape(str(self.size[1]), quote=True))

    def _repr_latex_(self):
        """Return an ``\\includegraphics`` command spanning the text width."""
        # The tripled braces emit literal '{' and '}' around the path.
        return r'\includegraphics[width=1.0\textwidth]{{{0}}}'.format(self.pdf)


In [25]:
# Relative path to a PDF file.
# NOTE(review): presumably meant to be shown through the PDF helper class -
# its use is not visible in this part of the notebook; confirm.
f = 'results/SRR5227306SRR5227306.pdf'