## Example of fasq pair sequence alignment and filtering

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from Bio import Seq, SeqIO, Align, pairwise2
import atriegc

In [2]:
def import_pair_reads(fastq1_name, fastq2_name):
    reads1, reads2 = [], []
    q1, q2 = [], []
    
    fastq1 = SeqIO.parse('fastq/hiv/'+fastq1_name, "fastq")
    for record in fastq1:
        reads1.append(record.seq)
        q1.append(record.letter_annotations["phred_quality"])

    fastq2 = SeqIO.parse('fastq/hiv/'+fastq2_name, "fastq")
    for record in fastq2:
        reads2.append(record.seq)
        q2.append(record.letter_annotations["phred_quality"])
    
    return reads1, reads2, q1, q2

def join_by_q(r1, r2, q1, q2):
    q1, q2 = np.array(q1), np.array(q2)
    res =  np.where(q1>q2, list(r1), list(r2))
    qs = np.where(q1>q2, q1, q2)
    return ''.join(res), qs

def find_first_nucl(seq):
    iA, iC, iT, iG = seq.find('A'), seq.find('C'), seq.find('T'), seq.find('G')
    return min(iA, iC, iT, iG)

In [3]:
def join_pairs(align, q1, q2):
    
    start_i = find_first_nucl(align.seqB)
    if start_i==0:
        return [], []
    
    # Add the first seqA
    start_i_qa, start_i_qb = start_i, 0
    seq, qs = align.seqA[:start_i], np.array(q1[:start_i])
    #print(0, start_i)

    while start_i < len(align.seqA)-1:

        # Joining the overlap chunk before the first gap
        gap_indA, gap_indB = align.seqA[start_i:].find('-'), align.seqB[start_i:].find('-')
        if gap_indA == -1: gap_indA = len(align.seqA)
        else: gap_indA += start_i
        if gap_indB == -1: gap_indB = len(align.seqB)
        else: gap_indB += start_i
        last_i = min(gap_indA, gap_indB)
        delta = last_i-start_i

        sA, sB = align.seqA[start_i:last_i], align.seqB[start_i:last_i]
        qA, qB = q1[start_i_qa:start_i_qa+delta], q2[start_i_qb:start_i_qb+delta]
        new_s, new_qs = join_by_q(sA, sB, qA, qB)
        seq += new_s
        qs = np.append(qs, new_qs)
        #print(start_i, last_i)
        start_i = last_i+1

        # Resolving the gap found
        if gap_indA < gap_indB:
            seq += align.seqB[start_i-1]
            start_i_qa += delta
            start_i_qb += delta+1
            qs = np.append(qs, q2[start_i_qb-1])
        elif gap_indA > gap_indB:
            seq += align.seqA[start_i-1]
            start_i_qa += delta+1
            start_i_qb += delta
            qs = np.append(qs, q1[start_i_qa-1])
        elif gap_indA == len(align.seqA):
            break
        else:
            seq += align.seqA[start_i-1]
            print('Double gap found')

        # Final gap of the sequence B
        if set(align.seqA[start_i:]) == {'-'}:
            seq += align.seqB[start_i:]
            qs = np.append(qs, q2[start_i_qb:])
            #print(start_i, len(align.seqB))
            break
            
    return seq, qs

In [4]:
def build_seq(reads1, reads2, quals1, quals2, gap_penalty=-20):
    seqs = []
    av_qs = []
    align_errors = []
    align_scores = []
    #index_map = []

    for i in range(len(reads1)):
        read1, read2 = reads1[i], Seq.reverse_complement(reads2[i])
        qual1, qual2 = quals1[i], quals2[i][::-1]
        if set(read1) != {'A','C','G','T'} or set(read2) != {'A','C','G','T'}:
            align_errors.append(i)
            continue
            
        alignment = pairwise2.align.localms(read1, read2, 1, -1, gap_penalty, gap_penalty)[0]
        new_seq, new_qs = join_pairs(alignment, qual1, qual2)
        if new_seq == []:
            align_errors.append(i)
        else:
            seqs.append(new_seq)
            av_qs.append(new_qs.mean())
            #index_map.append(i)
            align_scores.append(alignment.score/(alignment.end-alignment.start))
            
    return (*filter_seqs(seqs, av_qs, align_scores), len(align_errors))


def filter_seqs(seqs, av_qs, align_scores):
    # Discarding for quality overlap and length window
    to_keep_q, to_keep_s, to_keep_l = set(), set(), set()
    for i in range(len(av_qs)):
        if av_qs[i] > q_threshold:
            to_keep_q.add(i)
        if align_scores[i] > s_threshold:
            to_keep_s.add(i)
        if len(seqs[i]) > len_bounds[0] and len(seqs[i]) < len_bounds[1]:
            to_keep_l.add(i)
    filt_seqs = np.take(seqs, list(to_keep_s.intersection(to_keep_q).intersection(to_keep_l)))

    # Discarding for singletons
    uniq_seqs, index, counts = np.unique(filt_seqs, return_counts=True, return_inverse=True)
    to_keep_sing = set()
    for i in range(len(index)):
        if counts[index[i]] > 1:
            to_keep_sing.add(i)
    final_seqs = np.take(filt_seqs, list(to_keep_sing))
    
    return final_seqs, len(seqs)-len(to_keep_l), len(seqs)-len(to_keep_q), len(seqs)-len(to_keep_s), len(filt_seqs)-len(to_keep_sing)

In [5]:
def group_and_export_seqs(seqs, name, directory):
    uniq_seqs, counts = np.unique(seqs, return_counts=True)
    f = open(directory+name+'.fasta', 'w')
    for i in range(len(counts)):
        f.write('>'+str(i+1)+'_'+str(counts[i])+'\n')
        f.write(uniq_seqs[i]+'\n')
    f.close()

In [6]:
gap_penalty = -20
q_threshold = 32
s_threshold = 0.66
len_bounds = [400, 425]
#good_lengths = [402, 405, 408, 411, 414, 417]

In [7]:
info = pd.DataFrame(columns=['id', 'N_seqs', 'N_discarderd_singl', 'N_discarderd_len', 'N_discarded_quality', \
                             'N_discarded_align', 'N_discarded_overl', 'av_gene_len', \
                             'std_gene_len', 'overl_threshold'])

for fastq in os.listdir('fastq/hiv/'):
    
    if fastq.split('_')[-1] == '1.fastq':
        id_ = fastq.split('_')[0]+'_'+fastq.split('_')[1]
        print(id_)
        
        if id_+'_hiv.fasta' in os.listdir('hiv_seqs/'):
            print('already present, skip.')
            continue
        
        name1, name2 = id_+'_hiv_1.fastq', id_+'_hiv_2.fastq'
        reads1, reads2, quals1, quals2 = import_pair_reads(name1, name2)
        seqs, l_fail, q_fail, overl_fail, sing_fail, a_fail = build_seq(reads1, reads2, quals1, quals2)
        
        info_d = {'id' : id_}
        info_d['N_seqs']= len(seqs)
        info_d['N_discarderd_len'] = l_fail
        info_d['N_discarderd_singl'] = sing_fail
        info_d['N_discarded_overl'] = overl_fail
        info_d['N_discarded_align'] = a_fail
        info_d['N_discarded_quality'] = q_fail
        lens = [len(s) for s in seqs]
        info_d['av_gene_len'] = np.mean(lens)
        info_d['std_gene_len'] = np.std(lens)
        info = info.append(info_d, ignore_index=True)
        
        group_and_export_seqs(seqs, id_+'_hiv', 'hiv_seqs/')

pat3_day2092
pat9_day1365
pat8_day1199
pat9_day1149
pat3_day436
pat1_day4903
pat1_day1474
pat1_day4015
pat4_day1266
pat9_day1281
pat2_day2542
pat1_day3301
pat7_day584
pat2_day2170
pat8_day72
pat9_day687
pat2_day4173
pat2_day1646
pat5_day1051
pat1_day2468
pat9_day778
pat6_day2808


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


pat1_day4498
pat4_day2275
pat1_day2314
pat4_day1615
pat2_day1002
pat4_day911
pat2_day1148
pat9_day862
pat1_day1803
pat7_day1093
pat7_day402
pat1_day1313
pat6_day1107
pat3_day268
pat2_day2700
pat2_day3834
pat6_day192
pat9_day449
pat2_day2829
pat8_day440
pat2_day1827
pat2_day1469
pat1_day3830
pat3_day1777
pat8_day196
pat5_day275
pat1_day2838
pat7_day948
pat5_day521
pat6_day434
pat7_day150
pat1_day3669
pat2_day3395
pat2_day4656
pat3_day1441
pat6_day773
pat8_day771
pat6_day3122
pat3_day97
pat3_day2624
pat8_day1112
pat7_day50
pat7_day1884
pat4_day175
pat7_day772
pat7_day2438
pat4_day764
pat1_day4183
pat10_day1229
pat2_day4002
pat5_day696
pat1_day1961
pat6_day2457
pat3_day742
pat1_day1646
pat5_day1324
pat7_day1261
pat9_day1022
pat8_day961
pat6_day1785
pat4_day427
pat9_day79
pat1_day3207
pat4_day1954
pat2_day2997
pat2_day3169
pat4_day1777
pat3_day945
pat9_day189
pat1_day1144
pat10_day1397
pat5_day1205
pat6_day2114
pat1_day4660
pat8_day1644
pat8_day1359
pat1_day3516
pat5_day863
pat5_day191
pat

In [8]:
info.to_csv('hiv_info/hiv_seqs_info.tsv', sep='\t')

In [14]:
old_info = pd.read_csv('hiv_info/hiv_seqs_info.tsv', sep='\t')
old_info = old_info.set_index('id').drop('Unnamed: 0', axis=1)

In [None]:
info = info.loc[set(info.index).difference(set(old_info.index))]

In [45]:
info = old_info.loc[['pat6_day2808', 'pat2_day3169']]
info = info.rename(index={'pat6_day2808':'bubu'})

In [53]:
info = info.loc[set(info.index).difference(set(old_info.index))]

In [54]:
len(pd.concat([info, old_info])), len(old_info)

(107, 106)

In [3]:
import utils as ut

seqs = ut.import_fasta('hiv_seqs')

In [4]:
seqs.keys()

dict_keys(['4_175', '8_1359', '3_1777', '4_764', '5_191', '2_2700', '1_3830', '4_2275', '2_2829', '6_773', '7_772', '2_1827', '5_863', '10_1229', '5_1205', '9_687', '5_1324', '7_1261', '7_50', '1_1961', '3_97', '1_4498', '1_3301', '8_1644', '2_2542', '2_1469', '1_1646', '6_2114', '2_4173', '3_2092', '7_402', '7_1884', '5_1051', '1_2838', '1_1144', '9_1149', '1_3516', '8_603', '9_189', '2_2997', '3_742', '3_2624', '5_521', '8_72', '2_3169', '9_1022', '2_2170', '3_945', '8_196', '8_1199', '6_2808', '1_4183', '3_1267', '4_1266', '3_1441', '1_4015', '7_2438', '2_1002', '1_3669', '2_3834', '1_2468', '7_150', '9_449', '9_79', '5_696', '1_1474', '7_1093', '6_3122', '6_1785', '1_4903', '8_961', '1_1803', '8_771', '3_268', '9_1365', '2_3395', '6_1448', '6_2457', '10_1397', '2_1989', '1_3207', '8_1112', '4_911', '6_192', '2_1148', '1_1313', '9_1281', '4_1777', '7_584', '9_862', '1_4660', '9_778', '2_1646', '5_361', '8_440', '4_1954', '5_275', '3_436', '6_434', '7_948', '4_1615', '2_4002', '4_427

In [19]:
exe = seqs['1_1144']
stop_codons = {'TAG', 'TAA', 'TGA'}
stop_codons_ar = [np.array(list(c)) for c in stop_codons]

In [71]:
l = list(exe.values())[0]


In [88]:
resize_bounds = [
    [[0,0], [1,-2], [2,-1]], # Bounds of the 3 frames for length % 3 = 0
    [[0,-1], [1,0], [2,-2]], # Bounds of the 3 frames for length % 3 = 1
    [[0,-2], [1,-1], [2,0]], # Bounds of the 3 frames for length % 3 = 2
]

def get_n_codons_3frames(arr_seq, arr_codon):
    l = len(arr_seq)
    mod3 = l%3
    n_codons = []
    for fr in range(3): # Iter over the three frames
        bound = resize_bounds[mod3][fr]
        print(l, fr, bound)
        tri_seq = arr_seq[bound[0]:l+bound[1]]
        tri_seq = tri_seq.reshape((int(len(tri_seq)/3), 3))
        n_codons.append(np.sum(np.all(tri_seq == arr_codon, axis=1)))
    return n_codons

In [91]:
get_n_codons_3frames(np.array(list(l)), stop_codons_ar[2])

411 0 [0, 0]
411 1 [1, -2]
411 2 [2, -1]


[1, 1, 0]