In [1]:
import glob
import os
import numpy as np
import pandas as pd
import scipy.stats as st
import vcf
import vcf.filters
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import gzip
import shutil
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Extracting and analyzing data from the VCF files

## No QC filtering is needed here, because QC filtering applies to raw reads from FASTQ files

## These are all post-QC sequences in FASTA files

```bash
bash snp_calling/call_snps.sh sequences/genome/AY988601.1_BGD_CDS.fasta seq_for_analysis/P_whitmer_BGD.fasta alignments P_whitmer_BGD
```

In [25]:
def parse_vcf_snps(fName):
    """Parse a SNP VCF file into a table of positions and alleles.

    The expected contig name is inferred from the file name: a name
    containing "G_" maps to "gene=G", otherwise a name containing "P_"
    maps to "gene=P".

    Parameters
    ----------
    fName : str
        Path to the VCF file to read.

    Returns
    -------
    pandas.DataFrame
        One row per VCF record with columns "pos", "ref" and "alt".
        "alt" is the concatenation of the record's alternate alleles with
        the "<*>" placeholder (and missing alleles) removed, so it may be
        an empty string.

    Raises
    ------
    ValueError
        If the gene cannot be inferred from the file name, or if a record's
        CHROM differs from the expected contig name.
    """
    if "G_" in fName:
        chrom = "gene=G"
    elif "P_" in fName:
        chrom = "gene=P"
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `chrom` further down.
        raise ValueError(
            f"Cannot infer gene from file name {fName!r}: "
            "expected it to contain 'G_' or 'P_'"
        )

    pos = []
    ref = []
    alt = []

    # The context manager guarantees the file is closed, even when the
    # sanity check below raises.
    with open(fName, 'r') as handle:
        for record in vcf.Reader(handle):

            # Sanity check: every variant must come from the expected gene.
            if record.CHROM != chrom:
                raise ValueError(f"SNP found outside of {chrom}")

            # Convert from VCF objects to strings, dropping the "<*>"
            # placeholder allele and any missing ALT (PyVCF yields None
            # for a "." ALT field).
            alt_alleles = "".join(
                str(allele)
                for allele in record.ALT
                if allele is not None and str(allele) != "<*>"
            )

            pos.append(record.POS)
            ref.append(record.REF)
            alt.append(alt_alleles)

    return pd.DataFrame({"pos": pos, "ref": ref, "alt": alt})

In [26]:
# Parse the SNP calls in the P_whitmer_BGD VCF into a DataFrame (columns: pos, ref, alt)
p_bgd = parse_vcf_snps("alignments/P_whitmer_BGD_SNPS.vcf")

In [27]:
# Show the parsed SNP table as the cell output
p_bgd

Unnamed: 0,pos,ref,alt
0,99,T,C
1,167,C,T
2,175,G,A
3,181,G,A
4,190,T,C
...,...,...,...
83,2094,A,T
84,2097,T,C
85,2101,G,A
86,2112,C,T


In [30]:
# Load every record of the FASTA file as an (id, sequence string) pair
P_whitmer = [
    (record.id, str(record.seq))
    for record in SeqIO.parse("seq_for_analysis/P_whitmer_BGD.fasta", "fasta")
]

# Next, encode every sequence in `P_whitmer` as a binary variable representing reference (0) or alternative (1) allele

In [None]:
# Mapping used for one-hot encoding. 'N' (no nucleotide call) is treated as
# missing and encoded as an all-zero row; it was already checked that there
# are no indels in these sequences.
BASE_TO_COLUMN = {'A': 0, 'C': 1, 'T': 2, 'G': 3, 'N': 4}
COLUMN_TO_BASE = {0: "A", 1: "C", 2: "T", 3: "G"}


def get_one_hot(sequence):
    """One-hot encode a nucleotide sequence.

    Parameters
    ----------
    sequence : str or None/NaN
        Nucleotide sequence. Bases are matched case-insensitively against
        ``BASE_TO_COLUMN``.

    Returns
    -------
    numpy.ndarray or float
        Array of shape ``(len(sequence), 4)`` with columns ordered as in
        ``COLUMN_TO_BASE`` (A, C, T, G). Rows for 'N' or any other
        unrecognized character are all zeros. Returns ``np.nan`` when
        ``sequence`` is null.
    """
    if pd.isnull(sequence):
        return np.nan

    missing = BASE_TO_COLUMN['N']

    # Unknown characters (gaps, ambiguity codes, ...) are mapped to the
    # missing column rather than kept as strings, which would turn the index
    # array into a string array. The explicit integer dtype keeps an empty
    # sequence usable as an index array.
    column_index = np.array(
        [BASE_TO_COLUMN.get(base.upper(), missing) for base in sequence],
        dtype=int,
    )

    one_hot = np.zeros((len(sequence), len(COLUMN_TO_BASE)))

    # Only the called (non-missing) sites are set; the missing column index
    # would be out of bounds for the 4-column array.
    called_sites = np.where(column_index != missing)[0]
    one_hot[called_sites, column_index[called_sites]] = 1

    return one_hot

# Load the metadata table previously generated for all sequences
# (read from "metadata_all.csv", resolved relative to the working directory)
metadata = pd.read_csv("metadata_all.csv")