In [1]:
import numpy as np
import pandas as pd
from Bio import Seq, SeqIO, AlignIO, pairwise2, Entrez
from Bio.pairwise2 import format_alignment
import re

In [2]:
# aln = AlignIO.read("seq_for_analysis/W_aa_aln.fasta", "fasta")
# AlignIO.write(aln, "seq_for_analysis/W_aa_aln.phy", "phylip")

# Aligned all V and W sequences using Clustal Omega and dropped sequences containing ambiguous AAs
# Look for AA differences between clades

In [161]:
def _read_aligned_fasta(path):
    """Read a FASTA file into a two-column frame of isolate IDs and sequence strings.

    The isolate ID is the record ID with everything after the first "." removed.
    """
    records = []
    for record in SeqIO.parse(path, "fasta"):
        records.append((record.id.split(".")[0], str(record.seq)))
    return pd.DataFrame(records).rename(columns={0: "Isolate", 1: "Seq"})


W_aa = _read_aligned_fasta("seq_for_analysis/W_aln.fasta")
V_aa = _read_aligned_fasta("seq_for_analysis/V_aln.fasta")
print(W_aa.shape, V_aa.shape)

# Optional filter (currently disabled): drop every sequence containing a gap character,
# on the reasoning that gaps in W come from incomplete sequencing rather than true gaps.
#W_aa = W_aa.loc[~W_aa["Seq"].str.contains("-")]
#V_aa = V_aa.loc[~V_aa["Seq"].str.contains("-")]
print(W_aa.shape, V_aa.shape)

(29, 2) (28, 2)
(29, 2) (28, 2)


In [150]:
def find_unique_sites(aa_df):
    """Keep only the alignment columns whose residues never overlap between clades.

    Parameters
    ----------
    aa_df : pandas.DataFrame
        Needs an "Isolate" column formatted as "<isolate>/<country>" and a "Seq"
        column holding the sequence strings. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by isolate, with one column per retained position (labelled by its
        1-based position in the sequence) plus "Country" and "Clade". Rows whose
        country is "Malaysia" form clade "M"; all other rows form clade "B".
        A column is removed when it contains a "-", when it holds a single value,
        or when any value occurs in both clades. "Country" goes through the same
        filter as the position columns; "Clade" is always kept.
        The shape of the result is printed before it is returned.
    """
    # Splitting on the empty pattern yields empty strings at both ends, hence the 1:-1 slice.
    residues = aa_df["Seq"].str.split("", expand=True).iloc[:, 1:-1]
    sites = pd.concat([aa_df["Isolate"], residues], axis=1)
    sites[["Isolate", "Country"]] = sites["Isolate"].str.split("/", expand=True)
    sites = sites.set_index("Isolate")
    sites["Clade"] = ["M" if country == "Malaysia" else "B" for country in sites["Country"]]

    in_clade_b = sites["Clade"] == "B"
    in_clade_m = sites["Clade"] == "M"

    rejected = []
    for label in sites.columns:
        if label == "Clade":
            continue

        column = sites[label]

        # Gapped or invariant columns can never discriminate the clades.
        if column.eq("-").any() or column.nunique(dropna=False) == 1:
            rejected.append(label)
            continue

        # Any value seen in both clades disqualifies the column.
        values_b = set(column[in_clade_b].dropna())
        values_m = set(column[in_clade_m].dropna())
        if values_b & values_m:
            rejected.append(label)

    sites = sites.drop(columns=rejected)

    print(sites.shape)
    return sites

In [172]:
# Reduce each alignment to the columns that find_unique_sites retains;
# each call prints the shape of the frame it returns.
W_aa_sites = find_unique_sites(W_aa)
V_aa_sites = find_unique_sites(V_aa)

(29, 45)
(28, 48)


In [62]:
# alignments = pairwise2.align.globalxx(V_aa.Seq.values[0], W_aa.Seq.values[0])
# print(format_alignment(*alignments[0]))

In [63]:
def get_ncbi_accessions(id_list, database):
    """Search an NCBI database for the given identifiers and fetch their GenBank records.

    Parameters
    ----------
    id_list : sequence of str
        Identifiers to look up; they are joined with commas into one search term.
    database : str
        Entrez database name, used for both the search and the fetch.

    Returns
    -------
    list of str
        One string per fetched record: the GenBank text that follows each "LOCUS"
        keyword, cut off at the first "/gene" qualifier. Empty when the search
        returns no accessions.

    Notes
    -----
    The search is capped at 1000 hits (retmax). The number of records found is
    printed alongside the number of identifiers requested.
    """
    Entrez.email='skulkarni@g.harvard.edu'

    # search Genbank, returns accession numbers
    search_handle = Entrez.esearch(db=database, retmax=1000, term=",".join(id_list), idtype="acc")
    try:
        record = Entrez.read(search_handle)
    finally:
        # close the handle even when parsing the search result fails
        search_handle.close()

    accessions = record['IdList']

    if len(accessions) == 0:
        # nothing to fetch; avoid sending efetch an empty id list
        found_seq = []
    else:
        fetch = Entrez.efetch(db=database, id=",".join(accessions), rettype='gb', retmode='text')
        try:
            gb = fetch.read()
        finally:
            fetch.close()

        # the first one is an empty string because it's what comes before the first locus
        found_seq = list(gb.split("LOCUS"))[1:]

    print(f"Found {len(found_seq)} out of {len(id_list)} NCBI accessions!")

    # remove the sequences because they make the strings unnecessarily long
    found_seq = [isolate.split("/gene")[0] for isolate in found_seq]

    return found_seq

In [77]:
# Maps a place name that may appear in a GenBank record to the country label used downstream.
# Order matters: the first key found in a record wins.
countries_dict = dict(zip(['India', 'Kerala', 'Malaysia', 'Perak', 'Bangladesh', 'Singapore', 'Thailand', 'Cambodia', 'Kuala Lumpur'], 
                          ['India', 'India', 'Malaysia', 'Malaysia', 'Bangladesh', 'Singapore', 'Thailand', 'Cambodia', 'Malaysia']))

def extract_metadata(metadata_df):
    """Add a "Country" column to metadata_df using the NCBI protein record of each isolate.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must contain an "Isolate" column of accession identifiers (without version
        suffix). The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with "Country" filled in wherever a record for the
        isolate was found and mentions one of the names in countries_dict. Four
        accessions are then overridden by hand (two to "Malaysia", two to "HeV").
        Rows with no matching record or no recognised place name are left unset.
    """
    ncbi_info = get_ncbi_accessions(metadata_df["Isolate"].values, "protein")

    # Index the records by accession instead of relying on list position: the fetched
    # records follow the search-result order, which need not match the order of the
    # isolates in metadata_df, and the frame's index labels need not be 0..n-1.
    records_by_accession = {}
    for record in ncbi_info:
        keys = []
        accession_line = re.search(r"^ACCESSION\s+(\S+)", record, flags=re.MULTILINE)
        if accession_line:
            keys.append(accession_line.group(1))
        # Each record starts right after the "LOCUS" keyword, so its first token is the
        # locus name -- assumed to equal the accession for these records (TODO confirm).
        tokens = record.split()
        if tokens:
            keys.append(tokens[0])
        for key in keys:
            records_by_accession.setdefault(key, record)

    for label, isolate in metadata_df["Isolate"].items():
        record = records_by_accession.get(isolate)
        if record is None:
            continue

        # get the country by iterating through the list of countries
        for country, canonical_name in countries_dict.items():
            if country in record:
                metadata_df.loc[label, "Country"] = canonical_name
                break

    # manual assignments for specific accessions
    metadata_df.loc[metadata_df["Isolate"] == "YP_007188592", "Country"] = "Malaysia"
    metadata_df.loc[metadata_df["Isolate"] == "NP_112023", "Country"] = "Malaysia"

    metadata_df.loc[metadata_df["Isolate"] == "AEQ38076", "Country"] = "HeV"
    metadata_df.loc[metadata_df["Isolate"] == "AEQ38030", "Country"] = "HeV"

    # check that there are no more NaNs in the columns we just imputed
    #assert sum(pd.isnull(metadata_df['Country'])) == 0

    return metadata_df

In [78]:
# Annotate both frames with a "Country" column. extract_metadata fills the column on the
# frame it is given and returns that same frame, so W_aa / V_aa are updated as well.
W_with_metadata = extract_metadata(W_aa)
V_with_metadata = extract_metadata(V_aa)

Found 31 out of 31 NCBI accessions!
Found 30 out of 30 NCBI accessions!


In [83]:
# Number of V sequences assigned to each country label
V_with_metadata.Country.value_counts()

Bangladesh    18
India          8
Malaysia       2
Thailand       1
HeV            1
Name: Country, dtype: int64

In [84]:
# Number of W sequences assigned to each country label
W_with_metadata.Country.value_counts()

Bangladesh    20
India          7
Malaysia       2
Thailand       1
HeV            1
Name: Country, dtype: int64

In [88]:
# Residue codes treated as ambiguous; any sequence containing one of them is excluded.
ambiguous_pattern = "|".join(["X", "B", "Z", "J"])

# The filtered frames have to be assigned: a bare `.loc[...]` expression is evaluated and
# thrown away, which would leave the ambiguous sequences in the exported files.
V_unambiguous = V_with_metadata.loc[~V_with_metadata["Seq"].str.contains(ambiguous_pattern)]
W_unambiguous = W_with_metadata.loc[~W_with_metadata["Seq"].str.contains(ambiguous_pattern)]

# Write one FASTA file per protein, with headers of the form ">Isolate/Country".
# NOTE(review): a row whose Country is still missing will fail on the string concatenation.
for fasta_path, frame in [("sequences/genes/W_aa.fasta", W_unambiguous),
                          ("sequences/genes/V_aa.fasta", V_unambiguous)]:
    with open(fasta_path, "w+") as file:
        for _, row in frame.iterrows():
            file.write(">" + row["Isolate"] + "/" + row["Country"] + "\n")
            file.write(row["Seq"] + "\n")

In [2]:
# Load the P gene sequences as (record id, sequence string) pairs.
P_unique_HeV = [
    (record.id, str(record.seq))
    for record in SeqIO.parse("seq_for_analysis/P_unique_HeV.fasta", "fasta")
]
P_df = pd.DataFrame(P_unique_HeV)
P_df.columns = ["ID", "Seq"]

# remove HeV
P_df = P_df.query("ID != 'JN255817'").reset_index(drop=True)

# Show the distinct sequence lengths that remain.
lengths = list(map(len, P_df.Seq.values))
print(np.unique(lengths))

# Attach the per-sequence metadata; sequences without a metadata row are dropped (inner join).
metadata = pd.read_csv("metadata.csv")
P_df = P_df.merge(metadata, on="ID", how="inner")

# Translate every sequence with Bio.Seq.translate.
P_df["AA"] = [Seq.translate(nucleotides) for nucleotides in P_df.Seq.values]

[2130]


In [10]:
# One column per residue position. Splitting on the empty pattern leaves an empty string
# at each end, so the first and last columns are discarded.
P_df_AA = P_df["AA"].str.split("", expand=True).iloc[:, 1:-1]
P_df_AA = P_df_AA.set_index(P_df.ID.values)

P_df_AA = P_df_AA.merge(P_df[["ID", "Clade_grouped"]], left_index=True, right_on="ID")

# get all AA differences: discard every column that holds a single value
invariant_columns = [
    label for label in P_df_AA.columns
    if P_df_AA[label].nunique(dropna=False) == 1
]
P_df_AA = P_df_AA.drop(columns=invariant_columns)

print(f"{P_df_AA.shape[1]-2} amino acids have variation")

# next, get all AA sites that perfectly discriminate between clades:
# a position is discarded as soon as one of its residues occurs in more than one clade
shared_columns = [
    label for label in P_df_AA.columns
    if label not in ["ID", "Clade_grouped"]
    and (P_df_AA.groupby(label)["Clade_grouped"].nunique() > 1).any()
]
P_df_AA = P_df_AA.drop(columns=shared_columns)

print(f"{P_df_AA.shape[1]-2} amino acids discriminate clades")

128 amino acids have variation
44 amino acids discriminate clades
