In [109]:
import os
import sys
import vcf
import glob
import pandas as pd

## Custom parsers

### Fasta parser

In [110]:
import os
import re
import sys
import time
import glob
import numpy
import argparse
from operator import attrgetter
from collections import namedtuple
from collections import OrderedDict

class Fasta:

	def __init__(self, fasta_path):
		self.fasta_path = fasta_path
		return

	def peek(self, fasta_handle):
		curr_pos = fasta_handle.tell()
		curr_line = fasta_handle.readline()
		fasta_handle.seek(curr_pos)
		return(curr_line)

	def read(self):
		'''Given a fasta file, read will iterate through the file and yield each record, alonf with the header information, custom fasta id, length of sequence.'''
		fasta_handle = open(self.fasta_path)

		#Read the file one line at a time and process a chunk of text until the next header is found. At this point, process the chunk of text as one sequence and create iterator.
		next_line = ''
		line_number = 0
		header = ''
		sequence = ''
		header_found = False
		fid = 0
		while True:
			fasta = namedtuple('fastaRec', ['header', 'seq', 'fid', 'length'])
			next_line = self.peek(fasta_handle)
			try:
					if next_line[0] == '>':
						header = fasta_handle.readline().strip()[1:]
					sequence = ''
					fid += 1
					header_found = True
					line_number += 1
					while True:
						try:
								if self.peek(fasta_handle)[0] != '>' and self.peek(fasta_handle) != ' \n': 
									sequence += fasta_handle.readline().strip()
									header_found = False
									line_number += 1

								elif (self.peek(fasta_handle) == ' \n' or self.peek(fasta_handle)[0] == '>') and header_found:
									line_number += 1
									raise SyntaxError('Sequence missing for header : {0} at line {1}'.format(header, line_number))
									sys.exit()
								elif self.peek(fasta_handle)[0] == '>' and not header_found:
									break					
						except IndexError:
								break								
					length = len(sequence)
					record = fasta(header, sequence, fid, length)
					yield record
			except IndexError:
				break					

	def write(self, out_path, reader_obj, wrapping = 0):
		'''Given a fasta object and a output path, will write out a fasta file.'''

		fasta_handle = open(out_path, 'w')
		seq = ''
		for sequences in reader_obj:
			if wrapping == 0:
				wrapping = len(sequences.seq)
			seq = re.findall('.{{1,{0}}}'.format(wrapping), sequences.seq)
			fasta_handle.write('>{0}|{1}|{2}\n'.format(sequences.header, sequences.length, len(''.join(seq))))
			for record in seq:
				fasta_handle.write('{0}\n'.format(record))
		fasta_handle.close()
		return					


### Codon table

In [190]:
def getAA(codon):
    codontable = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'
        }
    return(codontable[codon])

def getCodon(aa):
    aatable= {
        'I' : 'ATA,ATC,ATT',
        'M' : 'ATG',
        'T' : 'ACA,ACC,ACG,ACT',
        'N' : 'AAC,AAT',
        'K' : 'AAA,AAG',
        'S' : 'AGC,AGT',
        'R' : 'AGA,AGG',
        'L' : 'CTA,CTC,CTG,CTT',
        'P' : 'CCA,CCC,CCG,CCT',
        'H' : 'CAC,CAT',
        'Q' : 'CAA,CAG',
        'R' : 'CGA,CGC,CGG,CGT',
        'V' : 'GTA,GTC,GTG,GTT',
        'A' : 'GCA,GCC,GCG,GCT',
        'D' : 'GAC,GAT',
        'E' : 'GAA,GAG',
        'G' : 'GGA,GGC,GGG,GGT',
        'S' : 'TCA,TCC,TCG,TCT',
        'F' : 'TTC,TTT',
        'L' : 'TTA,TTG',
        'Y' : 'TAC,TAT',
        'C' : 'TGC,TGT',
        '_' : 'TAA,TAG,TGA',
        'W' : 'TGG'
        
    }
    return(aatable[aa])

## Format variants of interest data

In [172]:
voi = pd.read_excel('./Reportable_SNPs_Report_v2.xlsx', sheetname=1, parse_cols="B:C")
voi.head()

Unnamed: 0,Gene,SNP
0,PfCRT,M74I
1,PfCRT,N75E
2,PfCRT,K76T
3,PfCRT,A220S
4,PfCRT,Q271E


In [173]:
voi['Ref'] = voi['SNP'].str.split('\d+', expand=True)[0]
voi['Alt'] = voi['SNP'].str.split('\d+', expand=True)[1]
voi['Pos'] = voi['SNP'].str.split('[a-zA-Z]',expand=True)[1]
voi = voi[['Gene','Pos','Ref','Alt']]
voi[['Pos']] = voi[['Pos']].apply(pd.to_numeric)
voi.set_index(['Gene','Pos'], inplace=True)
voi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Ref,Alt
Gene,Pos,Unnamed: 2_level_1,Unnamed: 3_level_1
PfCRT,74,M,I
PfCRT,75,N,E
PfCRT,76,K,T
PfCRT,220,A,S
PfCRT,271,Q,E


## Create codon table for genes

In [210]:
# Read fasta file
reader = Fasta('../../ref/mdr.fa')

#Read bed file
bed = pd.read_table('../../ref/mdr.bed', header=None, sep='\t', names=['Gene','Start','Stop','Exon','Info','Strand'])
bed.tail()

Unnamed: 0,Gene,Start,Stop,Exon,Info,Strand
17,DHPS,1,136,exon1,.,+
18,DHPS,313,2181,exon2,.,+
19,DHPS,2302,2417,exon3,.,+
20,K13,1,2181,exon1,.,+
21,PfMDR1,1,4260,exon1,.,+


In [211]:
#Spliting in codons
bed_iter = iter(bed.iterrows())
fasta = reader.read()


In [212]:
codon_dict = dict()
codon = list()
codon_table = list()
for records in fasta:
    seq = iter(records.seq)
    cdna = []
    for pos, base in enumerate(seq, 1):
        for exons in bed.iterrows():
            if exons[1]['Gene'] == records.header:
                if pos in range(exons[1]['Start'], exons[1]['Stop']+1):
                    cdna.append((base, pos, exons[1]['Strand']))
                else:
                    continue
            else:
                continue
    aa = []
    codon = 0
    sequence = cdna #iter(cdna)
    for bases in range(0, len(cdna),3):
        codon += 1
        codon_list = cdna[bases:bases+3] if cdna[bases][2] == '+' else cdna[bases:bases+3][::-1]
        codon_seq = ''.join([val[0] for val in codon_list])
        for nuc in codon_list:
            codon_table.append((records.header, (records.header, nuc[1], nuc[0], codon, codon_seq, getAA(codon_seq))))
        
codon_data = pd.DataFrame.from_items(codon_table, orient='index', columns=['Gene', 'NucPos', 'Nuc', 'Pos', 'Codon', 'AA'])                        
codon_data[['Pos']] = codon_data[['Pos']].apply(pd.to_numeric)
codon_data.set_index(['Gene', 'Pos'], inplace=True)
codon_data.head()
    #while True:
    #    try:
    #        codon += 1
    #        codon_seq = next(sequence[0])+next(sequence[0])+next(sequence[0])
    #        aa_seq = getAA(codon_seq)
    #        aa.append((codon_seq, codon, getAA(codon_seq)))
    #    except StopIteration:
    #       break
    #codon_dict[records.header] = aa




Unnamed: 0_level_0,Unnamed: 1_level_0,NucPos,Nuc,Codon,AA
Gene,Pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PfCRT,1,96,A,ATG,M
PfCRT,1,97,T,ATG,M
PfCRT,1,98,G,ATG,M
PfCRT,2,99,A,AAA,K
PfCRT,2,100,A,AAA,K


In [213]:
codons_of_int = voi.merge(codon_data, left_index=True, right_index=True, how='inner' )

In [214]:
codons_of_int

Unnamed: 0_level_0,Unnamed: 1_level_0,Ref,Alt,NucPos,Nuc,Codon,AA
Gene,Pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DHFR,51,N,I,151,A,AAT,N
DHFR,51,N,I,152,A,AAT,N
DHFR,51,N,I,153,T,AAT,N
DHFR,59,C,R,175,T,TGT,C
DHFR,59,C,R,176,G,TGT,C
DHFR,59,C,R,177,T,TGT,C
DHFR,108,S,N,322,A,AGC,S
DHFR,108,S,N,323,G,AGC,S
DHFR,108,S,N,324,C,AGC,S
DHPS,436,S,A,1482,T,TCT,S


In [215]:
codons_of_int['AltCodon'] = codons_of_int['Alt'].apply(getCodon)

In [216]:
codons_of_int.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Ref,Alt,NucPos,Nuc,Codon,AA,AltCodon
Gene,Pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DHFR,51,N,I,151,A,AAT,N,"ATA,ATC,ATT"
DHFR,51,N,I,152,A,AAT,N,"ATA,ATC,ATT"
DHFR,51,N,I,153,T,AAT,N,"ATA,ATC,ATT"
DHFR,59,C,R,175,T,TGT,C,"CGA,CGC,CGG,CGT"
DHFR,59,C,R,176,G,TGT,C,"CGA,CGC,CGG,CGT"


In [217]:
codons_of_int.to_excel('Codon_table.xlsx')


In [281]:
sample = vcf.Reader(filename='../../local/Sample_5685_F_KEL5430A24/Sample_5685_F_KEL5430A24_variants_merged_annotated.vcf')
codon_table = pd.read_excel('Codon_table.xlsx')
sample_name = sample.samples[0]

In [282]:
codons_of_int.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Ref,Alt,NucPos,Nuc,Codon,AA,AltCodon,Sample_5685_F_KEL5430A24
Gene,Pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DHFR,51,N,I,151,A,AAT,N,"ATA,ATC,ATT",
DHFR,51,N,I,152,A,AAT,N,"ATA,ATC,ATT",
DHFR,51,N,I,153,T,AAT,N,"ATA,ATC,ATT",
DHFR,59,C,R,175,T,TGT,C,"CGA,CGC,CGG,CGT",
DHFR,59,C,R,176,G,TGT,C,"CGA,CGC,CGG,CGT",


In [288]:
sample_result = []
for val in codons_of_int.iterrows():
    sample = vcf.Reader(filename='../../local/Sample_5685_F_KEL5430A24/Sample_5685_F_KEL5430A24_variants_merged_annotated.vcf')
    sample_name = sample.samples[0]
    sample_vars = ['{0}_{1}_{2}_{3}'.format(var.CHROM, var.POS, var.INFO['RefAA'][0], var.INFO['AltAA'][0]) for var in sample]
    if '{0}_{1}_{2}_{3}'.format(val[0][0], val[1]['NucPos'], val[1]['Ref'],val[1]['Alt']) in sample_vars:
        sample_result.append(1)
    else:
        sample_result.append(0)
sample_series = pd.Series(sample_result, index=codons_of_int.index)
codons_of_int[sample_name] = sample_series


In [289]:
codons_of_int

Unnamed: 0_level_0,Unnamed: 1_level_0,Ref,Alt,NucPos,Nuc,Codon,AA,AltCodon,Sample_5685_F_KEL5430A24
Gene,Pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DHFR,51,N,I,151,A,AAT,N,"ATA,ATC,ATT",0
DHFR,51,N,I,152,A,AAT,N,"ATA,ATC,ATT",1
DHFR,51,N,I,153,T,AAT,N,"ATA,ATC,ATT",0
DHFR,59,C,R,175,T,TGT,C,"CGA,CGC,CGG,CGT",1
DHFR,59,C,R,176,G,TGT,C,"CGA,CGC,CGG,CGT",0
DHFR,59,C,R,177,T,TGT,C,"CGA,CGC,CGG,CGT",0
DHFR,108,S,N,322,A,AGC,S,"AAC,AAT",0
DHFR,108,S,N,323,G,AGC,S,"AAC,AAT",1
DHFR,108,S,N,324,C,AGC,S,"AAC,AAT",0
DHPS,436,S,A,1482,T,TCT,S,"GCA,GCC,GCG,GCT",0
