In [1]:
import glob
import os
import numpy as np
import pandas as pd
import scipy.stats as st
import vcf
import vcf.filters
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import gzip
import shutil
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Extracting and analyzing data from the VCF files

## No QC filtering is needed here, because QC applies to raw reads in FASTQ files

## The inputs here are all post-QC sequences in FASTA files

In [2]:
def parse_vcf_snps(fName, strain_diff_df, strain_aln, other_strain):
    """Parse the records of a single-gene VCF file into a DataFrame.

    Every record is also flagged as a strain-level difference when it sits at
    a position listed in ``strain_diff_df`` for the same protein, its REF
    equals the ``strain_aln`` base listed there and its single ALT allele
    equals the ``other_strain`` base listed there.

    Parameters
    ----------
    fName : str or path-like
        Path of the VCF file. Its string form must contain "G_" or "P_"; this
        selects the gene tag ("gene=G" / "gene=P") that every record's CHROM
        is required to contain.
    strain_diff_df : pandas.DataFrame
        Table of differences between the reference strains. The columns
        "Protein", "Position" and the two strain columns are read.
    strain_aln : {"BGD", "MYS"}
        Column of ``strain_diff_df`` compared against each record's REF; also
        stored in the "aln" column of the result.
    other_strain : {"BGD", "MYS"}
        Column of ``strain_diff_df`` compared against each record's ALT.

    Returns
    -------
    pandas.DataFrame
        One row per VCF record with the columns "pos", "ref", "alt" (the ALT
        alleles joined into one string, without the "<*>" placeholder),
        "strain_diff" (1 or 0) and "aln".

    Raises
    ------
    ValueError
        If a strain name is invalid, the gene cannot be determined from
        ``fName``, or a record's CHROM does not contain the expected gene tag.
    """
    valid_strains = ("BGD", "MYS")

    # Validate the arguments up front so bad input fails before any file I/O.
    if strain_aln not in valid_strains:
        raise ValueError(f"{strain_aln} is not a valid strain name!")
    if other_strain not in valid_strains:
        raise ValueError(f"{other_strain} is not a valid strain name!")

    file_name = str(fName)
    if "G_" in file_name:
        protein = "G"
    elif "P_" in file_name:
        protein = "P"
    else:
        # Previously this case left `chrom`/`protein` unbound and failed later
        # with an unrelated NameError.
        raise ValueError(f"Cannot determine the gene (G or P) from file name {file_name}")
    chrom = f"gene={protein}"

    strain_diff_protein = strain_diff_df.query("Protein==@protein").reset_index(drop=True)

    # Position -> (strain_aln base, other_strain base). Built once instead of
    # filtering the DataFrame for every record; the first row wins when a
    # position is listed more than once.
    known_diffs = {}
    for position, aln_base, other_base in zip(
        strain_diff_protein["Position"],
        strain_diff_protein[strain_aln],
        strain_diff_protein[other_strain],
    ):
        known_diffs.setdefault(position, (aln_base, other_base))

    pos = []
    ref = []
    alt = []
    strain_diffs = []

    # The context manager closes the file handle even if a record raises.
    with open(fName, 'r') as vcf_handle:
        for record in vcf.Reader(vcf_handle):

            # check that all variants are coming from the correct gene. Might be overkill, but just another sanity check
            if chrom not in record.CHROM:
                raise ValueError(f"SNP found outside of {chrom}")

            # convert from VCF objects to strings to do string comparison
            alt_alleles = [str(allele) for allele in record.ALT]
            if "<*>" in alt_alleles:
                alt_alleles.remove("<*>")
            alt_alleles = "".join(alt_alleles)

            pos.append(record.POS)
            ref.append(record.REF)
            alt.append(alt_alleles)

            # Only a single ALT allele can be matched against the other strain's base.
            is_strain_diff = 0
            if record.POS in known_diffs and len(alt_alleles) == 1:
                aln_base, other_base = known_diffs[record.POS]
                if record.REF == aln_base and alt_alleles == other_base:
                    is_strain_diff = 1
            strain_diffs.append(is_strain_diff)

    return pd.DataFrame({"pos": pos, "ref": ref, "alt": alt, "strain_diff": strain_diffs, "aln": strain_aln})

In [3]:
def parse_vcf_and_metrics(fName, strain_diff_df, strain_aln, other_strain):
    """Parse a VCF file with ``parse_vcf_snps`` and print the share of non-strain-level SNPs.

    Parameters
    ----------
    fName, strain_diff_df, strain_aln, other_strain
        Passed unchanged to ``parse_vcf_snps``.

    Returns
    -------
    pandas.DataFrame
        The DataFrame returned by ``parse_vcf_snps``, unmodified.
    """
    res = parse_vcf_snps(fName, strain_diff_df, strain_aln, other_strain)

    # A VCF without any record would otherwise raise ZeroDivisionError below.
    if len(res) == 0:
        print(f"No SNPs found in {fName}")
        return res

    # Fraction of records whose "strain_diff" flag is 0.
    non_strain_diffs = len(res.query("strain_diff==0"))/len(res)

    print(f"{non_strain_diffs} of SNPs are not strain-level variations")

    return res

In [4]:
# Parse the four SNP call sets (gene G / gene P, each against the BGD and MYS reference).
# FIXME: `strain_diff` is not defined in any cell shown before this one (the recorded
# run of this cell ends in a NameError); it must be created before this cell can run.
g_bgd, g_mys, p_bgd, p_mys = [
    parse_vcf_and_metrics(vcf_path, strain_diff, aligned_strain, compared_strain)
    for vcf_path, aligned_strain, compared_strain in (
        ('alignments/G_BGD_SNPs.vcf', "BGD", "MYS"),
        ('alignments/G_MYS_SNPs.vcf', "MYS", "BGD"),
        ('alignments/P_BGD_SNPs.vcf', "BGD", "MYS"),
        ('alignments/P_MYS_SNPs.vcf', "MYS", "BGD"),
    )
]

NameError: name 'strain_diff' is not defined

### TODO: It is unclear what the difference in this proportion between BGD and MYS means.

### The proportion already ignores strain-level variations (found by comparing the 2 reference genomes)

In [6]:
def load_gene_cds(fasta_path, gene_tag):
    """Return the sequence of the first FASTA record whose id contains ``gene_tag``.

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA file to scan.
    gene_tag : str
        Substring looked for in each record's ``id`` (e.g. "gene=G").

    Raises
    ------
    ValueError
        If no record id contains ``gene_tag`` (previously a bare IndexError
        from indexing an empty list).
    """
    for seq_record in SeqIO.parse(fasta_path, "fasta"):
        if gene_tag in seq_record.id:
            return seq_record.seq
    raise ValueError(f"No record with {gene_tag} in its id was found in {fasta_path}")


BGD_CDS_FASTA = "sequences/genome/AY988601.1_BGD_CDS.fna"
MYS_CDS_FASTA = "sequences/genome/NC_00278.1_MYS_CDS.fna"

# Reference coding sequences of gene G and gene P for both strains.
glyco_bgd_ref = load_gene_cds(BGD_CDS_FASTA, "gene=G")
glyco_mys_ref = load_gene_cds(MYS_CDS_FASTA, "gene=G")

phospho_bgd_ref = load_gene_cds(BGD_CDS_FASTA, "gene=P")
phospho_mys_ref = load_gene_cds(MYS_CDS_FASTA, "gene=P")

In [7]:
# Print the BGD and MYS reference lengths, one gene per line (G first, then P).
print(*map(len, (glyco_bgd_ref, glyco_mys_ref)))
print(*map(len, (phospho_bgd_ref, phospho_mys_ref)))

1809 1809
2130 2130


In [8]:
# Pool the SNPs called against both references, keeping only those that are
# not strain-level differences; the flag column is then no longer needed.
glyco_snps = (
    pd.concat([g_bgd.query("strain_diff==0"), g_mys.query("strain_diff==0")])
    .reset_index(drop=True)
    .drop(columns="strain_diff")
)
phospho_snps = (
    pd.concat([p_bgd.query("strain_diff==0"), p_mys.query("strain_diff==0")])
    .reset_index(drop=True)
    .drop(columns="strain_diff")
)

# Sanity check again: every SNP position must lie inside the reference it was
# called against. The bounds come from the loaded references rather than
# hard-coded lengths, and an empty SNP table passes trivially.
assert all(
    1 <= pos <= len(glyco_bgd_ref if aln == "BGD" else glyco_mys_ref)
    for pos, aln in zip(glyco_snps["pos"], glyco_snps["aln"])
), "glycoprotein SNP position outside of its reference sequence"
assert all(
    1 <= pos <= len(phospho_bgd_ref if aln == "BGD" else phospho_mys_ref)
    for pos, aln in zip(phospho_snps["pos"], phospho_snps["aln"])
), "phosphoprotein SNP position outside of its reference sequence"

In [8]:
def get_codon(pos, ref_seq, alt):
    """Translate the codon containing ``pos`` before and after substituting ``alt``.

    Parameters
    ----------
    pos : int
        1-indexed nucleotide position in ``ref_seq``. Codons are counted from
        position 1, i.e. positions 1-3 form the first codon.
    ref_seq : sequence object
        Reference sequence; must support iteration, slicing and a
        ``translate()`` method on its slices (a Biopython ``Seq`` in this
        notebook).
    alt : str
        Replacement for the base at ``pos``.
        NOTE(review): an ``alt`` that is not exactly one character changes the
        length of the mutated sequence, so the slice taken for the mutated
        codon no longer lines up with the reference codon - confirm callers
        only pass single-base alleles.

    Returns
    -------
    tuple
        ``(ref_aa, alt_aa)``: the translated reference and mutated codon.

    Raises
    ------
    ValueError
        If ``pos`` is smaller than 1 (same check as ``get_aa_num``). Without
        it, ``pos - 1`` would silently index from the end of the sequence.
    """
    if pos < 1:
        raise ValueError("Position must be a natural number!")

    # create the mutated sequence, strings and Seqs are immutable
    alt_bases = list(ref_seq)
    alt_bases[pos-1] = alt
    alt_seq = Seq("".join(alt_bases))

    # position is 1-indexed; 0-indexed start of the codon that contains it
    codon_start = 3 * ((pos - 1) // 3)
    codon_end = codon_start + 3

    codon = ref_seq[codon_start:codon_end]
    alt_codon = alt_seq[codon_start:codon_end]

    return codon.translate(), alt_codon.translate()


def get_aa_num(pos):
    """Return the 1-indexed amino-acid number of the codon containing nucleotide ``pos``.

    Parameters
    ----------
    pos : int
        1-indexed nucleotide position; positions 1-3 map to amino acid 1,
        positions 4-6 to amino acid 2, and so on.

    Returns
    -------
    int
        The amino-acid number as a built-in ``int``.

    Raises
    ------
    ValueError
        If ``pos`` is smaller than 1.
    """
    if pos < 1:
        raise ValueError("Position must be a natural number!")

    # Integer ceiling division: exact for any integer, unlike int(pos / 3),
    # which goes through a float.
    return int((pos + 2) // 3)

In [10]:
# Translate the affected codon of every glycoprotein SNP with and without the
# substitution, using the reference the SNP was called against (anything that
# is not "BGD" uses the MYS reference).
# The results are collected in a list first so that "ref_aa"/"alt_aa" are also
# created when there are no SNPs; row-wise .loc assignment inside iterrows()
# never creates them for an empty frame, which made the next step fail.
glyco_codon_changes = [
    get_codon(pos, glyco_bgd_ref if aln == "BGD" else glyco_mys_ref, alt)
    for pos, aln, alt in zip(glyco_snps["pos"], glyco_snps["aln"], glyco_snps["alt"])
]
glyco_snps["ref_aa"] = [str(ref_aa) for ref_aa, _ in glyco_codon_changes]
glyco_snps["alt_aa"] = [str(alt_aa) for _, alt_aa in glyco_codon_changes]

# 1 when the substitution leaves the amino acid unchanged, 0 otherwise
glyco_snps["synonymous"] = (glyco_snps["ref_aa"] == glyco_snps["alt_aa"]).astype(int)

# add the amino acid position (1-indexed)
glyco_snps["aa_pos"] = [get_aa_num(pos) for pos in glyco_snps["pos"]]

# glyco_snps.to_csv("snp_calling/glyco.csv", index=False)

In [11]:
# Translate the affected codon of every phosphoprotein SNP with and without the
# substitution, using the reference the SNP was called against (anything that
# is not "BGD" uses the MYS reference).
# The results are collected in a list first so that "ref_aa"/"alt_aa" are also
# created when there are no SNPs; row-wise .loc assignment inside iterrows()
# never creates them for an empty frame, which made the next step fail.
phospho_codon_changes = [
    get_codon(pos, phospho_bgd_ref if aln == "BGD" else phospho_mys_ref, alt)
    for pos, aln, alt in zip(phospho_snps["pos"], phospho_snps["aln"], phospho_snps["alt"])
]
phospho_snps["ref_aa"] = [str(ref_aa) for ref_aa, _ in phospho_codon_changes]
phospho_snps["alt_aa"] = [str(alt_aa) for _, alt_aa in phospho_codon_changes]

# 1 when the substitution leaves the amino acid unchanged, 0 otherwise
phospho_snps["synonymous"] = (phospho_snps["ref_aa"] == phospho_snps["alt_aa"]).astype(int)

# add the amino acid position (1-indexed)
phospho_snps["aa_pos"] = [get_aa_num(pos) for pos in phospho_snps["pos"]]

#phospho_snps.to_csv("snp_calling/phospho.csv", index=False)

In [None]:
# vcf_reader = vcf.Reader(open("alignments/G_BGD_SNPs.vcf", "r"))

# for record in vcf_reader:
    
#     # convert from VCF objects to strings
#     alt_alleles = [str(char) for char in record.ALT]
    
#     if "<*>" in alt_alleles:
#         alt_alleles.remove("<*>")
#         print(alt_alleles)
#     # else:
#     #     print(alt_alleles)

# Further investigation of SNPs

In [31]:
# Reload the annotated SNP tables from disk, replacing the in-memory frames.
# The to_csv calls that would write these files are commented out in this
# notebook, so both CSVs have to exist already.
glyco_snps, phospho_snps = map(
    pd.read_csv,
    ("snp_calling/glyco.csv", "snp_calling/phospho.csv"),
)

In [34]:
# Display the glycoprotein SNPs whose "synonymous" flag is 0, i.e. those where
# the translated reference and mutated codons differ.
glyco_snps.query("synonymous==0")

Unnamed: 0,pos,ref,alt,aln,ref_aa,alt_aa,synonymous,aa_pos
0,8,C,T,BGD,T,I,0,3
9,199,A,G,BGD,I,V,0,67
11,246,G,A,BGD,M,I,0,82
29,625,G,A,BGD,V,I,0,209
31,682,T,C,BGD,Y,H,0,228
32,734,C,T,BGD,S,F,0,245
35,889,G,A,BGD,V,M,0,297
36,910,A,G,BGD,I,V,0,304
44,1127,A,C,BGD,K,T,0,376
46,1142,A,G,BGD,N,S,0,381


In [None]:
# Mapping to use for one-hot encoding. No nucleotide = missing because already
# checked that there are no indels in these sequences.
# 'N' has an index but no column of its own: it is encoded as an all-zero row.
BASE_TO_COLUMN = {'A': 0, 'C': 1, 'T': 2, 'G': 3, 'N': 4}
COLUMN_TO_BASE = {0: "A", 1: "C", 2: "T", 3: "G"}

# Get one hot vector
def get_one_hot(sequence):
    """One-hot encode a nucleotide sequence.

    Parameters
    ----------
    sequence : str or iterable of single characters, or a null scalar
        The bases to encode. Matching is case-insensitive.

    Returns
    -------
    numpy.ndarray or float
        Array of shape ``(len(sequence), 4)`` with columns ordered as in
        ``COLUMN_TO_BASE`` (A, C, T, G). 'N' and any character that is not in
        ``BASE_TO_COLUMN`` give an all-zero row. ``np.nan`` is returned when
        ``sequence`` is a null scalar (None, NaN, ...).
    """
    # Only scalars can be "missing"; pd.isnull on a list/array would return an
    # array whose truth value is ambiguous.
    if pd.api.types.is_scalar(sequence) and pd.isnull(sequence):
        return np.nan

    missing = BASE_TO_COLUMN['N']

    # Unknown characters are treated as missing. Falling back to the character
    # itself produced a string array that cannot be used as an index. The
    # explicit integer dtype also keeps an empty sequence usable as an index.
    seq_in_index = np.array(
        [BASE_TO_COLUMN.get(str(b).upper(), missing) for b in sequence],
        dtype=int,
    )

    one_hot = np.zeros((len(seq_in_index), len(COLUMN_TO_BASE)))

    # only get the non-missing sites because otherwise we get indexing errors
    ind_known = np.flatnonzero(seq_in_index != missing)

    # Assign the found positions to 1
    one_hot[ind_known, seq_in_index[ind_known]] = 1

    return one_hot

# Load the metadata table that was previously generated for all sequences
metadata = pd.read_csv("metadata_all.csv")