In [1]:
from Bio import Entrez, SeqIO
from Bio.Seq import Seq

from Bio.SeqRecord import SeqRecord
from Bio import Entrez

import pysam
import vcf
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import functools
import re

import numpy as np
import pandas as pd
import math

Entrez.email = "simon.burgermeister@gmail.com"

# known variant:
#https://www.nature.com/articles/s41598-020-70812-6
#https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/959438/Technical_Briefing_VOC_SH_NJL2_SH2.pdf

In [2]:
## Annotations from reference sequence:

ref_id='NC_045512'
Entrez.email='simon.burgermeister@gmail.com'

handle = Entrez.efetch(db='nucleotide', id=ref_id, rettype='gb')
record = SeqIO.read(handle, "genbank")
handle.close()
ref_seq_str=str(record.seq)

dic_map={}
dic_seq={}
ref_df=[]
for feat in record.features:
    #print(feat.type)
    if feat.type=='CDS': #'gene'
        #print(feat.qualifiers['gene'][0])
        dic_map[feat.qualifiers['gene'][0]]=[int(feat.location.start), int(feat.location.end)]
        
        prot_seq=feat.qualifiers['translation'][0]
        rna_seq=str(record.seq)[int(feat.location.start): int(feat.location.end)]
        dic_seq[feat.qualifiers['gene'][0]]=rna_seq
        ref_df.append([feat.qualifiers['gene'][0], int(feat.location.start), int(feat.location.end)])
        #break
        
    #Mature peptides:    
    elif feat.type=='mat_peptide':
        
        prod=feat.qualifiers['product']
        locus_tag=feat.qualifiers['locus_tag']
        protein_id=feat.qualifiers['protein_id']
        end=int(feat.location.end)
        start=int(feat.location.start)
        
    
    # Stem loops:
    elif feat.type=='stem_loop':
        end=int(feat.location.end)
        start=int(feat.location.start)
        
        function=feat.qualifiers['function'][0]
        #note=feat.qualifiers['note'][0]
        
    #5'UTR 3'UTR      
print(dic_map)
df_refs=pd.DataFrame(ref_df, columns=['gene', 'start', 'stop'])
df_refs.head()

{'ORF1ab': [265, 13483], 'S': [21562, 25384], 'ORF3a': [25392, 26220], 'E': [26244, 26472], 'M': [26522, 27191], 'ORF6': [27201, 27387], 'ORF7a': [27393, 27759], 'ORF7b': [27755, 27887], 'ORF8': [27893, 28259], 'N': [28273, 29533], 'ORF10': [29557, 29674]}


Unnamed: 0,gene,start,stop
0,ORF1ab,265,21555
1,ORF1ab,265,13483
2,S,21562,25384
3,ORF3a,25392,26220
4,E,26244,26472


In [59]:
f = vcf.Reader(filename='sequences3/var.raw.vcf')

my_type = defaultdict(int)
num_alts = defaultdict(int)

for rec in f:
    my_type[rec.var_type, rec.var_subtype] += 1
    #print(rec.var_type)
    if rec.is_snp:
        num_alts[len(rec.ALT)] += 1
    else:
        print(rec)
print(my_type)
print(num_alts)

Record(CHROM=NC_045512.2, POS=11287, REF=GTCTGGTTTT, ALT=[G])
Record(CHROM=NC_045512.2, POS=21764, REF=ATACATGT, ALT=[AT])
Record(CHROM=NC_045512.2, POS=21990, REF=TTTATTA, ALT=[TTTA])
Record(CHROM=NC_045512.2, POS=28270, REF=TAAAA, ALT=[TAAA])
defaultdict(<class 'int'>, {('snp', 'ts'): 21, ('snp', 'tv'): 15, ('indel', 'del'): 4})
defaultdict(<class 'int'>, {1: 36})


In [60]:
samfile = pysam.AlignmentFile("sequences3/sorted.bam", "r")

jj=0
data=[]

for x in samfile:
    
    if jj>20:
        break
    else:
        
        
        #read=str(x)
        rr=read.split('\t0\t0')
        read_ID=rr[0]
        cig=x.cigarstring
        read_seq=str(x.seq) 
        match = re.findall(r'(\d+)(\w)',cig)
        s_start=x.reference_start
        data2=[read_ID]
        colx=['ID']
        
        #print(read_ID)
        #print(cig)
        #print(' ')
        if 'D' in cig:
            ind=s_start
            lst_del=[]
            for m in match:
                if m[1]=='M':
                    ind=ind+int(m[0])
                elif m[1]=='D':
                    #print([ind+1, ind+int(m[0])+1])
                    lst_del.append([ind+1, ind+int(m[0])+1, m[0]])
                    ind=ind+int(m[0])
                    
            del_df=pd.DataFrame(lst_del, columns=['start', 'stop', 'length'])
            del_df=del_df.astype(int)
        else:
            del_df=pd.DataFrame([], columns=['start', 'stop', 'length'])
            
            
        f = vcf.Reader(filename='sequences3/var.raw.vcf')
        for rec in f:
            if rec.is_snp:
                p=rec.POS
                
                ref=rec.REF
             
                df_loc=df_refs.loc[(df_refs['start']<p)&(df_refs['stop']>p)]
                
                #if SNP is in a known gene:
                if len(df_loc)>0: # SNP in a known gene
                    # Gene considered (ORF):
                    gene=df_loc['gene'].values[0]
                    
                    #shifts
                    shift_start=np.sum(del_df.loc[del_df['stop']<df_loc['start'].values[0]]['length'].values)+x.reference_start
                    shift_snp=np.sum(del_df.loc[del_df['stop']<p]['length'].values)+x.reference_start
                    shift_stop=np.sum(del_df.loc[del_df['stop']<df_loc['stop'].values[0]]['length'].values)+x.reference_start
        
                    #sequences:
                    
                    sseq=dic_seq[gene]
                    sseq2=read_seq[dic_map[gene][0]-shift_start: dic_map[gene][1]-shift_stop]
                    seq3=ref_seq_str[dic_map[gene][0]:dic_map[gene][1]]
                    
                    # SNP position in gene (0 indexing):
                    snp_pos_ref=p-df_loc['start'].values[0]-1
                    snp_pos_read=p-shift_snp-df_loc['start'].values[0]+shift_start-1
                    
                    
                    #deletions in gene but before SNP:
                    #del_gene=df_refs.loc[(df_refs['start']>df_loc['start'].values[0])&(df_refs['stop']<p)]
                    
                    
                    # Amino acid positions:
                    aa_p_ref=math.floor(snp_pos_ref/3)+1
                    aa_p_seq=math.floor(snp_pos_read/3)+1
                    
                    # Codons:
                    codon_ref=sseq[aa_p_ref*3-3:aa_p_ref*3]
                    codon_read=sseq2[aa_p_seq*3-3:aa_p_seq*3]

                    codon_ref2=Seq(codon_ref)
                    codon_read2=Seq(codon_read)

                    aa_ref=str(codon_ref2.translate())
                    aa_read=str(codon_read2.translate())
                    
                    cc=gene+'_'+str(aa_p_ref)
                    if cc not in colx:
                        colx.append(cc)
                        if aa_ref !=aa_read:

                            genotype=aa_ref+str(aa_p_ref)+aa_read
                            data2.extend([genotype])
                        else:
                            data2.extend(['WT'])
        
        
        #INDELs:
        for i in range(0, len(del_df)):
            start=del_df.iloc[i]['start']
            stop=del_df.iloc[i]['stop']
            print(start)
            print(stop)
            df_loc=df_refs.loc[(df_refs['start']<start)&(df_refs['stop']>stop)]
            if len(df_loc)>0:
                gene=df_loc['gene'].values[0]
                print(gene)
                gene_start=df_loc['start'].values[0]
                gene_stop=df_loc['start'].values[0]
                
                codon_indel=ref_seq_str[start-1: stop-1]
                print(codon_indel)
                codon_indel2=Seq(codon_indel)
                print(codon_indel2.translate())
                print('')
                
                
                
            
                            
    data.append(data2)
                    
                    

11288
11297
ORF1ab
TCTGGTTTT
SGF

21765
21771
S
TACATG
YM

21991
21994
S
TTA
L

28271
28272
11288
11297
ORF1ab
TCTGGTTTT
SGF

21765
21771
S
TACATG
YM

21991
21994
S
TTA
L

28271
28272
11288
11297
ORF1ab
TCTGGTTTT
SGF

21765
21771
S
TACATG
YM

21991
21994
S
TTA
L

28271
28272
11288
11297
ORF1ab
TCTGGTTTT
SGF

21765
21771
S
TACATG
YM

21991
21994
S
TTA
L

28271
28272
11288
11297
ORF1ab
TCTGGTTTT
SGF

21765
21771
S
TACATG
YM

21991
21994
S
TTA
L

28271
28272
11288
11297
ORF1ab
TCTGGTTTT
SGF

21765
21771
S
TACATG
YM

21991
21994
S
TTA
L

28271
28272
11288
11297
ORF1ab
TCTGGTTTT
SGF

21765
21771
S
TACATG
YM

21991
21994
S
TTA
L

28271
28272
11288
11297
ORF1ab
TCTGGTTTT
SGF

21765
21771
S
TACATG
YM

21991
21994
S
TTA
L

28271
28272


In [61]:
del_df.head()

Unnamed: 0,start,stop,length
0,11288,11297,9
1,21765,21771,6
2,21991,21994,3
3,28271,28272,1


In [None]:
#TCTGGTTTT
#ACATGT
#TTA
#A



In [None]:
#Record(CHROM=NC_045512.2, POS=11287, REF=GTCTGGTTTT, ALT=[G])
#Record(CHROM=NC_045512.2, POS=21764, REF=ATACATGT, ALT=[AT])
#Record(CHROM=NC_045512.2, POS=21990, REF=TTTATTA, ALT=[TTTA])
#Record(CHROM=NC_045512.2, POS=28270, REF=TAAAA, ALT=[TAAA])

In [58]:

a=Seq(ref_seq_str[11287:11296])
print(a)
a.translate()

TCTGGTTTT


Seq('SGF')

In [57]:
a=Seq(ref_seq_str[21763:21769])
a.translate()

Seq('IH')

In [55]:
a=Seq(ref_seq_str[21991:21994])
a.translate()

Seq('Y')

In [47]:
del_df.head(10)

Unnamed: 0,start,stop,length
0,11288,11297,9
1,21765,21771,6
2,21991,21994,3
3,28271,28272,1


In [34]:
df_loc['start'].values[0]

27893

In [62]:
df_genotype=pd.DataFrame(data, columns=colx)
df_genotype.head(20)

Unnamed: 0,ID,ORF1ab_216,ORF1ab_265,ORF1ab_615,ORF1ab_924,ORF1ab_1001,ORF1ab_1708,ORF1ab_1907,ORF1ab_2230,ORF1ab_2259,...,ORF3a_57,M_70,ORF8_27,ORF8_52,ORF8_68,ORF8_73,N_3,N_203,N_204,N_235
0,MW913363.1,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,MW913363.1,WT,WT,WT,WT,T1001I,A1708D,WT,I2230T,WT,...,WT,V70L,Q27*,R52I,WT,Y73C,D3L,R203K,G204R,S235F
2,MW913363.1,WT,WT,WT,WT,T1001I,A1708D,WT,I2230T,WT,...,WT,WT,Q27*,R52I,K68*,Y73C,D3L,R203K,G204R,S235F
3,MW913363.1,WT,WT,WT,WT,T1001I,A1708D,WT,I2230T,WT,...,WT,V70L,Q27*,R52I,WT,Y73C,D3L,R203K,G204R,S235F
4,MW913363.1,WT,WT,WT,WT,T1001I,A1708D,WT,I2230T,WT,...,WT,WT,Q27*,R52I,WT,Y73C,D3L,R203K,G204R,S235F
5,MW913363.1,WT,WT,WT,WT,T1001I,A1708D,WT,I2230T,M2259I,...,WT,WT,Q27*,R52I,K68*,Y73C,D3L,R203K,G204R,S235F
6,MW913363.1,WT,WT,WT,WT,T1001I,A1708D,WT,I2230T,M2259I,...,WT,WT,Q27*,R52I,K68*,Y73C,D3L,R203K,G204R,S235F
7,MW913363.1,WT,WT,WT,WT,T1001I,A1708D,WT,I2230T,WT,...,WT,WT,Q27*,R52I,K68*,Y73C,D3L,R203K,G204R,S235F
8,MW913363.1,WT,T265I,WT,WT,WT,WT,WT,WT,WT,...,Q57H,WT,WT,WT,WT,WT,WT,WT,WT,WT
9,MW913363.1,WT,T265I,WT,WT,WT,WT,WT,WT,WT,...,Q57H,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [29]:
for i in range(0, len(del_df)):
    print(del_df.iloc[i]['start'])

11288
21756
21976
28253


In [25]:
samfile = pysam.AlignmentFile("sequences/sorted.bam", "r")

xx=0
for x in samfile:
     xx=xx+1
print(xx)

169286
