In [1]:
# Enables inline plotting and star-imports numpy and matplotlib names into the
# interactive namespace (see the message printed below this cell).
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from collections import Counter
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from tqdm import tqdm
import gzip
import pandas as pd
from IPython.display import HTML

In [3]:
def fastq_kmer_histogram(fastq_file,
                         kmer_length=range(5, 31),
                         five_prime=False,
                         max_seq=1000000,
                         offset=0):
    """Tabulate the k-mers found at one end of the reads in a fastq file

    Parameters
    ----------
    fastq_file: string
                Location of .fastq or .fastq.gz
                (opened through gzip when the name ends in '.gz')
    kmer_length: iterable of int
                 k-mer lengths to consider (default: 5 to 30 inclusive)
    five_prime: bool
                Should consider sequences from 5' end?
                Default: False (uses sequence from 3' end)
    max_seq: int
             Maximum number of sequences to consider
    offset: int
            Offset to ignore at the 5' end or 3'end
            Example: If the offset is 3, the first 3 bases will be skipped
            and kmers will start only from the 4th position
            For 3'end if the offset is 3, the last 3 bases will be skipped

    Returns
    -------
    kmers: dict
           Maps each k to a Series indexed by k-mer, sorted with the most
           frequent k-mer first. Values are percentages of the reads that
           were actually processed (which is fewer than max_seq when the
           file is shorter than that).

    Notes
    -----
    Reads shorter than k + offset contribute whatever (shorter) string the
    slice yields; they are not filtered out.
    """
    # Materialise once: the lengths are walked several times, which would
    # silently exhaust a generator argument.
    kmer_lengths = list(kmer_length)

    # The slice bounds depend only on k, offset and the chosen end, so they
    # are computed once up front rather than once per read.
    windows = {}
    for k in kmer_lengths:
        if five_prime:
            windows[k] = slice(offset, k + offset)
        elif offset:
            windows[k] = slice(-k - offset, -offset)
        else:
            # seq[-k:-0] would be empty, so offset == 0 needs an open end.
            windows[k] = slice(-k, None)
    histogram = {k: Counter() for k in windows}

    is_gzipped = str(fastq_file).endswith('.gz')
    opener = gzip.open if is_gzipped else open

    n_reads = 0
    # The context manager closes the handle even if parsing raises midway.
    with opener(fastq_file, 'rt') as handle, tqdm(total=max_seq) as pbar:
        records = FastqGeneralIterator(handle)
        while n_reads < max_seq:
            record = next(records, None)
            if record is None:
                # File exhausted before max_seq reads were seen.
                break
            seq = record[1]
            for k, window in windows.items():
                histogram[k][seq[window]] += 1
            n_reads += 1
            pbar.update()

    # Normalise by the number of reads really seen, not by the requested cap;
    # max(..., 1) only guards the division when no read was processed.
    denominator = max(n_reads, 1)
    kmers = {}
    for k, counts in histogram.items():
        kmers[k] = pd.Series(counts).sort_values(ascending=False) / denominator * 100
    return kmers

In [4]:
def fastq_kmer_histogram2(fastq_file,
                         kmer_length=range(5, 31),
                         five_prime=False,
                         max_seq=1000000,
                         offset=0):
    """Tabulate the k-mers found at the 5' end of the reads in a fastq file

    Parameters
    ----------
    fastq_file: string
                Location of .fastq or .fastq.gz
                (opened through gzip when the name ends in '.gz')
    kmer_length: iterable of int
                 k-mer lengths to consider (default: 5 to 30 inclusive)
    five_prime: bool
                Kept for signature compatibility only. It has no effect:
                this function always takes the k-mer from the 5' end,
                i.e. seq[offset:k + offset], whatever the flag says.
    max_seq: int
             Maximum number of sequences to consider
    offset: int
            Number of bases to skip at the 5' end
            Example: If the offset is 3, the first 3 bases will be skipped
            and kmers will start only from the 4th position

    Returns
    -------
    kmers: dict
           Maps each k to a Series indexed by k-mer, sorted with the most
           frequent k-mer first. Values are percentages of the reads that
           were actually processed (which is fewer than max_seq when the
           file is shorter than that).

    Notes
    -----
    Reads shorter than k + offset contribute whatever (shorter) string the
    slice yields; they are not filtered out.
    """
    # Materialise once: the lengths are walked several times, which would
    # silently exhaust a generator argument.
    kmer_lengths = list(kmer_length)

    # Every branch of the earlier implementation reduced to the same 5' slice
    # (seq[:k] is seq[offset:k + offset] with offset == 0), so a single
    # precomputed window per k is enough.
    windows = {k: slice(offset, k + offset) for k in kmer_lengths}
    histogram = {k: Counter() for k in windows}

    is_gzipped = str(fastq_file).endswith('.gz')
    opener = gzip.open if is_gzipped else open

    n_reads = 0
    # The context manager closes the handle even if parsing raises midway.
    with opener(fastq_file, 'rt') as handle, tqdm(total=max_seq) as pbar:
        records = FastqGeneralIterator(handle)
        while n_reads < max_seq:
            record = next(records, None)
            if record is None:
                # File exhausted before max_seq reads were seen.
                break
            seq = record[1]
            for k, window in windows.items():
                histogram[k][seq[window]] += 1
            n_reads += 1
            pbar.update()

    # Normalise by the number of reads really seen, not by the requested cap;
    # max(..., 1) only guards the division when no read was processed.
    denominator = max(n_reads, 1)
    kmers = {}
    for k, counts in histogram.items():
        kmers[k] = pd.Series(counts).sort_values(ascending=False) / denominator * 100
    return kmers

In [10]:
# Gzipped FASTQ input passed to the k-mer histogram functions below.
# NOTE(review): absolute, machine-specific path - the notebook cannot be re-run
# elsewhere without editing it; consider a configurable data directory.
fastq = '/staging/as/wenzhenl/re-ribo-analysis/SRP010679_human/sratofastq/SRR403893.fastq.gz'

In [11]:
# All defaults: k = 5..30, k-mers taken from the 3' end, at most 1,000,000 reads.
histogram = fastq_kmer_histogram(fastq)

100%|██████████| 1000000/1000000 [00:40<00:00, 24747.34it/s]


In [162]:
# Same input and defaults, but fastq_kmer_histogram2 always slices
# seq[offset:k + offset], i.e. it reads the k-mers from the 5' end.
histogram2 = fastq_kmer_histogram2(fastq)

100%|██████████| 1000000/1000000 [00:37<00:00, 26879.39it/s]


In [17]:
# 10-mers at the 3' end, most frequent first; values are percentages.
histogram[10]

CTGTAGGCAC    16.9524
TGTAGGCACC     9.7078
TCTGTAGGCA     5.3376
CCTGTAGGCA     5.0915
ACTGTAGGCA     4.5734
GTAGGCACCA     4.5082
GCTGTAGGCA     4.3464
CTCTGTAGGC     1.7447
CCCTGTAGGC     1.7358
TAGGCACCAT     1.7018
TGCTGTAGGC     1.3124
AGCTGTAGGC     1.2404
GACTGTAGGC     1.1522
CTTCTGTAGG     1.0815
TTCTGTAGGC     1.0443
TCCTGTAGGC     1.0211
GGCTGTAGGC     1.0174
TCTGCTTGAA     0.9383
GCCTGTAGGC     0.9122
TACTGTAGGC     0.9053
AGGCACCATC     0.8680
CACTGTAGGC     0.8593
AACTGTAGGC     0.7816
GCCCTGTAGG     0.6607
ACCTGTAGGC     0.6029
CGCTGTAGGC     0.5944
ATCTGTAGGC     0.5868
GTCTGTAGGC     0.5458
GGCACCATCA     0.5255
ACACTGTAGG     0.5028
               ...   
CGTAGGGCCA     0.0001
GTCTTGGAGG     0.0001
CGTAGGGCGC     0.0001
GTCTTGAAGG     0.0001
CGTAGGTACA     0.0001
CGTAGGTACC     0.0001
CGTATCTGTA     0.0001
GTCTTCTGTA     0.0001
CGTATGCAGT     0.0001
GTCTTCTGGC     0.0001
CGTATGCCGG     0.0001
GTCTTAGACA     0.0001
CGTATGCGGT     0.0001
CGTATGCTCC     0.0001
CGTATGCTGT

In [166]:
class PDF(object):
    """Display wrapper that embeds a PDF file in notebook output.

    Parameters
    ----------
    pdf: string
         Path or URL of the PDF to embed
    size: tuple
          (width, height) of the iframe used for the HTML representation
    """

    def __init__(self, pdf, size=(200, 200)):
        self.pdf = pdf
        self.size = size

    def _repr_html_(self):
        """Return an iframe pointing at the PDF, sized according to `size`."""
        # Local import keeps this cell self-contained.
        from html import escape

        # Attribute values are quoted and escaped so that a path containing
        # spaces, quotes or '&' still yields well-formed markup.
        return '<iframe src="{0}" width="{1}" height="{2}"></iframe>'.format(
            escape(str(self.pdf), quote=True),
            escape(str(self.size[0]), quote=True),
            escape(str(self.size[1]), quote=True))

    def _repr_latex_(self):
        """Return an \\includegraphics command spanning the full text width."""
        return r'\includegraphics[width=1.0\textwidth]{{{0}}}'.format(self.pdf)


In [25]:
# Relative path of the PDF embedded by the cells below.
# NOTE(review): the accession appears twice in the file name - confirm that
# this is the intended file.
f = 'results/SRR5227306SRR5227306.pdf'

In [23]:
# Depends on the cell defining the PDF class having been run in this kernel;
# the NameError recorded below comes from a run where it had not been.
PDF(f, size=(300, 200))

NameError: name 'PDF' is not defined

In [198]:
# Embed a PDF served over HTTP in a 700x350 iframe using raw HTML output.
HTML('<iframe src=http://fperez.org/papers/ipython07_pe-gr_cise.pdf width=700 height=350></iframe>')

In [26]:
# Same iframe approach for the local file path held in `f`.
HTML('<iframe src={} width=700 height=350></iframe>'.format(f))