In [1]:
import re
import pandas as pd

In [2]:
## Generates an array for input to the NMC clustering algorithm

In [3]:
# Highest residue number in the array header built further down
# (the header columns run from 49 to proteinlength inclusive).
proteinlength = 510

In [4]:
# Lookup table: upper-case three-letter residue code -> one-letter amino-acid code.
# Besides the 20 standard codes it also maps 'H1S' and 'H2S' to 'H'.
aaconv = {
    'CYS': 'C',
    'ASP': 'D',
    'SER': 'S',
    'GLN': 'Q',
    'LYS': 'K',
    'ILE': 'I',
    'PRO': 'P',
    'THR': 'T',
    'PHE': 'F',
    'ASN': 'N',
    'GLY': 'G',
    'HIS': 'H',
    'LEU': 'L',
    'ARG': 'R',
    'TRP': 'W',
    'ALA': 'A',
    'VAL': 'V',
    'GLU': 'E',
    'TYR': 'Y',
    'MET': 'M',
    'H1S': 'H',
    'H2S': 'H',
}

In [5]:
# Read in mutation database file (path is relative to the notebook's working directory).
# Later cells use its "Clinical Class" and "Protein Change" columns.
mutationfile = pd.read_excel("../Mutations_database/FH_mutations_DB.xlsx")

In [6]:
# Remove known benign mutations: keep every row whose "Clinical Class"
# is anything other than the exact string "benign".
not_benign = mutationfile["Clinical Class"].ne("benign")
mutationfile = mutationfile.loc[not_benign]

In [7]:
# Keep only the "Protein Change" column of the mutation database
mutationlist_only = pd.DataFrame(mutationfile["Protein Change"])
# Split the protein change at the first literal "p.": column 'A' gets the text
# before it and 'Mutation' the text after it,
# e.g. "p.(Arg51Gln)" -> A = "", Mutation = "(Arg51Gln)".
# str.partition is used because tuple-unpacking the `.str` accessor and passing
# the split limit positionally are both rejected by current pandas releases.
protein_change_parts = mutationlist_only["Protein Change"].str.partition("p.")
mutationlist_only["A"] = protein_change_parts[0]
mutationlist_only["Mutation"] = protein_change_parts[2]
# Get only known mutations (drop the "?" and "(=)" entries).
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
is_known = ~mutationlist_only["Mutation"].isin(["?", "(=)"])
mutationlist_only = mutationlist_only[is_known].copy()
# Strip the first and last character, i.e. the parentheses around entries
# such as "(Arg51Gln)"
mutationlist_only["Mutation"] = mutationlist_only["Mutation"].str[1:-1]
# Drop all non SNPs: any entry containing "*", "dup" or "del"
is_non_snp = mutationlist_only["Mutation"].str.contains(r"\*|dup|del")
mutationlist_only = mutationlist_only[~is_non_snp].copy()
# Split into AA from (first three characters), AA to (last three characters)
# and the residue mutated (the integer in between)
mutationlist_only["AAfrom"] = mutationlist_only["Mutation"].str[:3]
mutationlist_only["AAto"] = mutationlist_only["Mutation"].str[-3:]
mutationlist_only["Residue"] = mutationlist_only["Mutation"].str[3:-3].map(int)
# Get only the mutations we have the structure for (residue 49 onwards)
mutationlist_only = mutationlist_only[mutationlist_only["Residue"] >= 49].copy()
# Upper-case the three-letter codes and convert them to one-letter codes.
# NOTE(review): a code that is missing from aaconv becomes None here and is
# rendered as the text "None" in the Mutation string below - confirm whether
# such rows should be dropped instead.
mutationlist_only["AAfrom"] = mutationlist_only["AAfrom"].apply(lambda code: aaconv.get(code.upper()))
mutationlist_only["AAto"] = mutationlist_only["AAto"].apply(lambda code: aaconv.get(code.upper()))
# Generate mutation in more standard format, e.g. "R51Q"
mutationlist_only["Mutation"] = [
    "%s%s%s" % (aa_from, residue, aa_to)
    for aa_from, residue, aa_to in zip(
        mutationlist_only["AAfrom"],
        mutationlist_only["Residue"],
        mutationlist_only["AAto"],
    )
]

# Generate a list of all mutations; both names refer to the same list object
mutationlist_original = mutationlist_only["Mutation"].tolist()
mutationlist = mutationlist_original

In [8]:
# Display the first rows of the parsed mutation table
mutationlist_only.head()

Unnamed: 0,Protein Change,A,Mutation,AAfrom,AAto,Residue
17,p.(Arg51Gln),,R51Q,R,Q,51
18,p.(Glu53Lys),,E53K,E,K,53
22,p.(Gly69Val),,G69V,G,V,69
43,p.(Arg101Pro),,R101P,R,P,101
47,p.(Asn107Thr),,N107T,N,T,107


In [9]:
# Extract residues from the mutation list: the first run of digits in each
# entry (e.g. 51 in "R51Q") is taken as the residue number.
# The pattern is a raw string so that "\d" is not an invalid string escape.
mutation_residue_list = [
    int(re.findall(r'\d+', item)[0]) for item in mutationlist
]

In [10]:
# Construct a list of lists containing a format for NMC clustering
# NMC needs:     12345678910
#                0000010000
#                0000100000
#                0000010000
#     One mutation   ^^  Two mutations etc...
# The first list is the header of residue numbers (49..proteinlength); every
# following list is one mutation with a single 1 at the position of its residue.
residue_header = list(range(49, proteinlength + 1))
listsforarray = [residue_header]
for item in mutation_residue_list:
    # A residue outside the header has no column to mark; fail with a clear
    # message instead of an IndexError or a silently wrapped negative index.
    if not 49 <= item <= proteinlength:
        raise ValueError(
            "Residue %d is outside the array range 49-%d" % (item, proteinlength)
        )
    # Rows are exactly as wide as the header, and the index is offset by the
    # first header residue (49), so the 1 lands under the column labelled
    # with this residue number.
    zerolist = [0] * len(residue_header)
    zerolist[item - 49] = 1
    listsforarray.append(zerolist)

In [11]:
# Build a frame from the list of lists; its row 0 holds the column labels
array_with_header = pd.DataFrame(listsforarray)
header_row = array_with_header.iloc[0]
# Remove the label row (the remaining rows keep their index labels 1..N) ...
mutationdataframe = array_with_header.drop(0)
# ... and use it as the column names
mutationdataframe.columns = header_row

In [12]:
# Save the dataframe as CSV (path is relative to the notebook's working directory).
# index=False leaves the row index out, so the file holds the header row of
# column labels followed by one 0/1 row per mutation.
mutationdataframe.to_csv("FH_mutations_NMC_array.csv", index=False)