In [1]:
import pandas as pd
import numpy as np
import bisect

# External data

In [2]:
# Path to the .fai index that accompanies the rel2 FASTQ file (relative path,
# resolved against the kernel's working directory).
# NOTE(review): presumably produced by `samtools fqidx`/`faidx` — confirm.
fasta_index_fn = "../data/rel2/rel2.fastq.fai"

In [3]:
# Column names are supplied explicitly, so the first line of the file is read
# as data rather than as a header. Fields are tab-separated; only LENGTH is
# used by the statistics in the cells visible below.
fai_column_names = [
    "NAME",
    "LENGTH",
    "OFFSET",
    "LINEBASES",
    "LINEWIDTH",
    "QUALOFFSET",
]
fasta_index = pd.read_csv(fasta_index_fn, sep="\t", names=fai_column_names)

# Various numbers from the text

Total number of reads:

In [4]:
# .shape is (rows, columns); the index holds one row per read, so the first
# entry is the total number of reads.
fasta_index.shape

(11069717, 6)

Total length of all reads (in units of 10^9 bases) and the resulting coverage of the human genome:

In [5]:
# Sum of the LENGTH column over every read in the index.
total_length = fasta_index["LENGTH"].sum()
# Genome size used as the denominator for every coverage figure below.
genome_len = 3.1 * 10**9
total_coverage = total_length / genome_len
# Report the total in units of 10**9 bases alongside the fold coverage.
total_gbases = total_length / 10**9
print(total_gbases, total_coverage)

155.088240312 50.0284646168


N50:

In [6]:
# Read lengths ordered from longest to shortest.
sorted_lengths = np.sort(fasta_index["LENGTH"])[::-1]
# Running total of bases when reads are taken longest-first.
cumulative_lengths = np.cumsum(sorted_lengths)
half_total_length = total_length / 2
# First position where the running total reaches half of all bases;
# side="left" gives the same insertion point as bisect.bisect_left.
n50_index = int(np.searchsorted(cumulative_lengths, half_total_length, side="left"))
sorted_lengths[n50_index]

70416

In [7]:
# Length threshold for "short" reads: the cells below select reads whose
# LENGTH is strictly less than this value.
max_short_len = 5000

Number of short reads (reads shorter than `max_short_len`):

In [8]:
# Boolean mask: True for reads strictly shorter than the short-read threshold.
short_reads_index = fasta_index["LENGTH"] < max_short_len
# True counts as 1, so the sum of the mask is the number of selected reads.
short_reads_index.sum()

7651424

Coverage of human genome with short reads:

In [9]:
# Total length contributed by the reads selected by the short-read mask.
total_short_read_len = fasta_index.loc[short_reads_index, "LENGTH"].sum()
# Fold coverage of the genome provided by short reads alone.
total_short_read_len / genome_len

1.6971010006451612

Length threshold for long reads (reads strictly longer than this value are counted as long):

In [10]:
# Length threshold for "long" reads: the cells below select reads whose
# LENGTH is strictly greater than this value (a read of exactly this length
# is not counted as long).
min_long_len = 50000

Number of long reads (reads longer than `min_long_len`):

In [11]:
# Boolean mask: True for reads strictly longer than the long-read threshold.
long_reads_index = fasta_index["LENGTH"] > min_long_len
# True counts as 1, so the sum of the mask is the number of selected reads.
long_reads_index.sum()

999562

Total length of long reads and coverage of human genome with long reads:

In [12]:
# Total length contributed by the reads selected by the long-read mask.
total_long_read_len = fasta_index.loc[long_reads_index, "LENGTH"].sum()
# Displayed as (total in units of 10**9 bases, fold coverage of the genome).
(total_long_read_len / 10**9, total_long_read_len / genome_len)

(98.957643462999997, 31.921820471935483)