In [72]:
# importing the requests library 
import requests 
import pandas as pd
from random import sample 
import matplotlib
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns                                                             
import sys
import scipy
import nltk
from nltk.stem import WordNetLemmatizer 
from spellchecker import SpellChecker
from numpy import linalg as LA

After manually classifying 150 function I noticed that most common mistake are: word ordering and word mispelling. Another thing I noticed where that each miss written function ussually was in a pham with the correct approved function name. With these two realizations I want to make a spell checker system that takes into account common error and genetic makeup.

#### Steps to Correct to Building Bag of Words Vectorized Database
1. Make a corpus of words found in approved functions
2. Token and lemminize that corpus
3. Make a vector representation of each approved function

#### Steps to Correct Non-Approved Function Names
1. Pre-proccessing: Delete words that are non-functional, ie "punative", "protien", "domain"
2. Pre-proccessing: Tokenize each word
3. Pre-proccessing: Convert acroymns to full words based on known acronyms
4. Pre-proccessing: Identify known unapproved functions (from SEA-PHAGES list) and covert to known functions
5. Pre-proccessing: Spell correct to attain words found in function corpus
6. Use bag of words technique to create a input vector
7. Use nearest neighbors of approved functions in thier PHAM to determine approved function


# Building Bag of Words Vectorized Database


### 1. Make a corpus of words found in approved functions

In [91]:
df_approved_functions = pd.read_csv("Approved_Functions.csv")
df_approved_functions = df_approved_functions.dropna(subset=["Approved Function"])
df_approved_functions.head()

Unnamed: 0,Approved Function,Unapproved Functions
0,"terminase, small subunit",TerS
1,terminase,
2,"terminase, large subunit",TerL
3,"terminase, large subunit (ATPase domain)",
4,"terminase, large subunit (nuclease domain)",


In [143]:
approved_functions = list(df_approved_functions["Approved Function"])
approved_functions = [i.lower() for i in approved_functions]
approved_functions

['terminase, small subunit',
 'terminase',
 'terminase, large subunit',
 'terminase, large subunit (atpase domain)',
 'terminase, large subunit (nuclease domain)',
 'portal protein',
 'portal and muf-like fusion protein',
 'scaffolding protein',
 'capsid maturation protease',
 'major capsid protein',
 'minor capsid protein',
 'muf-like minor capsid protein',
 'capsid decoration protein, lamd-like',
 'capsid maturation protease and muf-like fusion protein',
 'major capsid and protease fusion protein',
 'head fiber protein',
 'head-to-tail adaptor',
 'head-to-tail stopper',
 'tail terminator',
 'major tail protein',
 'tail assembly chaperone',
 'tape measure protein',
 'minor tail protein',
 'minor tail protein, d-ala-d-ala carboxypeptidase',
 'tail sheath protein',
 'tail fiber',
 'tailspike protein',
 'tail needle protein',
 'baseplate j protein',
 'tail tube protein',
 'baseplate wedge protein',
 'capsid decoration protein',
 'lysin a',
 'lysin a, protease m15 domain',
 'lysin a, prot

### 2. Token and lemminize that corpus

In [93]:
non_unique_corpus = []
for function in approved_functions:
    non_unique_corpus += nltk.word_tokenize(function) 

unique_corpus = np.sort(np.unique(non_unique_corpus)) 
unique_corpus = [i.lower() for i in unique_corpus]
unique_corpus[1:10]

corpus = unique_corpus

In [94]:
len(corpus)

318

### 3. Make a vector representation of each approved function

In [95]:
function_vectors = {}
for func in approved_functions:
    function_vectors[func] = [0]*(len(corpus)+1)
    for token in nltk.word_tokenize(func):
        if token.lower() in corpus:
#             print(token, corpus[corpus.index(token.lower())])
            function_vectors[func][corpus.index(token.lower())] = 1

In [115]:
# FILTER BASED ON COUNT
COUNT_FILTER = [0]*(len(corpus)+1)
for c in corpus:
    COUNT_FILTER[corpus.index(c)] = 1/non_unique_corpus.count(c)

In [117]:
COUNT_FILTER[corpus.index("domain")]

0.058823529411764705

In [122]:
np.multiply([1,0,1],[1,1,3])

array([1, 0, 3])

### Create known conversion list

In [96]:
unapproved_functions = []
conversion_list = {}
for _, row in df_approved_functions.dropna(subset=["Approved Function", "Unapproved Functions"]).iterrows():
    for i in row["Unapproved Functions"].split(", "):
        conversion_list[i.lower()] = row["Approved Function"]
        unapproved_functions += [i.lower()]
        
conversion_list

{'ters': 'terminase, small subunit',
 'terl': 'terminase, large subunit',
 'head to tail connector': 'portal protein',
 'scaffold': 'scaffolding protein',
 'capsid': 'major capsid protein',
 'capsid morphogenesis protein': 'capsid maturation protease and MuF-like fusion protein',
 'head-to-tail connector': 'head-to-tail adaptor',
 'head-to-tail connector complex protein': 'head-to-tail adaptor',
 'major tail subunit': 'major tail protein',
 'tail scaffolding protein': 'tail assembly chaperone',
 'tape measure': 'tape measure protein',
 'tmp': 'tape measure protein',
 'tapemeasure': 'tape measure protein',
 'tail fiber-like protein': 'minor tail protein',
 'collagen-like': 'minor tail protein',
 'glycine rich': 'minor tail protein',
 'head decoration protein': 'capsid decoration protein',
 'lysa': 'lysin A',
 'endolysin a': 'lysin A',
 'lysb': 'lysin B',
 'endolysin b': 'lysin B',
 'repressor': 'immunity repressor',
 'excisionase': 'excise',
 'xis': 'excise',
 'rdf': 'recombination dire

In [97]:
synomym_list = {"head": "capsid",
                "connector": "adaptor", 
                "single-stranded dna": "ssdna",
                "hth": "helix-turn-helix",
                "cro": "cro (control of repressor’s operator)"
               }
for key, value in conversion_list.items():
    if len(nltk.word_tokenize(key)) == 1:
        synomym_list[key] = value
    
synomym_list

{'head': 'capsid',
 'connector': 'adaptor',
 'single-stranded dna': 'ssdna',
 'hth': 'helix-turn-helix',
 'ters': 'terminase, small subunit',
 'terl': 'terminase, large subunit',
 'scaffold': 'scaffolding protein',
 'capsid': 'major capsid protein',
 'tmp': 'tape measure protein',
 'tapemeasure': 'tape measure protein',
 'collagen-like': 'minor tail protein',
 'lysa': 'lysin A',
 'lysb': 'lysin B',
 'repressor': 'immunity repressor',
 'excisionase': 'excise',
 'xis': 'excise',
 'rdf': 'recombination directionality factor',
 'pnk': 'polynucleotide kinase',
 'lsr2': 'Lsr2-like DNA bridging protein',
 'ku': 'Ku-like dsDNA break-binding protein',
 'whib': 'WhiB family transcription factor',
 'thyx': 'ThyX-like thymidylate synthase',
 'dr': 'dihydrofolate reductase',
 'mazg': 'MazG-like nucleotide pyrophosphohydrolase',
 'erf': 'ssDNA binding protein, ERF family',
 'clp': 'ClpP-like protease',
 'nrdh': 'NrdH-like glutaredoxin',
 'ro': 'Ro-like RNA binding protein',
 'pura': 'adenylosuccinat

#### There are some components that are more identifiable than others

In [98]:
# fig, ax1 = plt.subplots(figsize = (20,6))
# plt.hist(func_units, bins = np.unique(func_units))
# plt.xticks([i if func_units.count(i) > 1 else "" for i in np.unique(func_units)], rotation='vertical')
# plt.show()

In [99]:
# ident_vocab = []
# common_vocab = []
# for i in np.unique(func_units):
#     if func_units.count(i) == 1:
#         ident_vocab += [i]
#     elif func_units.count(i) > 1:
#         common_vocab += [i]

In [100]:
import json

spell_dict = {}
for i in unique_corpus:
    spell_dict[i] = non_unique_corpus.count(i)
    
for i in synomym_list.keys():
    spell_dict[i] = 1
    
    
with open('functions.json', 'w') as fp:
    json.dump(spell_dict, fp)
    
spell_dict

{"'": 1,
 '(': 10,
 ')': 10,
 ',': 35,
 '1': 2,
 '2': 2,
 '5': 1,
 'a': 10,
 'aaa-atpase': 1,
 'abortive': 2,
 'acetyltransferase': 1,
 'acid': 1,
 'adaptor': 1,
 'adda-like': 1,
 'adenylate': 2,
 'adenylosuccinate': 2,
 'adenylyltransferase': 1,
 'adp-ribosyl': 1,
 'adp-ribosyltransferase': 3,
 'alpha': 1,
 'amidase': 1,
 'amidotransferase': 1,
 'amine': 1,
 'aminotransferase': 1,
 'and': 4,
 'antirepressor': 1,
 'antirestriction': 2,
 'antitoxin': 4,
 'arda-like': 1,
 'arsenate': 1,
 'asc-1': 1,
 'assembly': 1,
 'atp': 1,
 'atpase': 1,
 'b': 1,
 'band-7': 1,
 'baseplate': 2,
 'beta': 1,
 'binding': 13,
 'biosynthesis': 1,
 'break': 1,
 'break-binding': 2,
 'bridging': 1,
 'brnt-like': 1,
 'c39': 1,
 'capsid': 1,
 'carboxylate': 1,
 'carboxylesterase': 1,
 'carboxypeptidase': 1,
 'cas4': 1,
 'cassete-like': 1,
 'chain': 2,
 'chaperone': 1,
 'chaperonin': 1,
 'clamp': 1,
 'clpp-like': 1,
 'cluster': 1,
 'coactivator': 1,
 'cobalamin': 1,
 'cobt-like': 1,
 'control': 1,
 'cro': 1,
 'cyc

In [101]:
# https://readthedocs.org/projects/pyspellchecker/downloads/pdf/latest/

Examples of errors:

1. "helix-turn-helix dna binding domain protein" this happens when unimportant words are added
"helix-turn-helix dna binding domain protein"

2. "terminase large subunit" happens when punctuation is forgotten

In [161]:
spell = SpellChecker(local_dictionary="functions.json", distance=3)
def find_approved_function(function, pham_members):
    print("input:",function)
    if function in conversion_list:
        function = conversion_list[function]
    # clean function
    #     function = "baseplate J domain faseto protein"
    cleaned_function = []
    # tokens = []
    for i in nltk.word_tokenize(function): #reca
        if spell.word_probability(i.lower()+"-like", 1) > spell.word_probability(i.lower(), 1):
            corrected_token = spell.correction(i.lower()+"-like")
        else:
            corrected_token = spell.correction(i.lower())
            
        if corrected_token in synomym_list:  
            corrected_token = synomym_list[corrected_token]

        corrected_token = nltk.word_tokenize(corrected_token)
            
        cleaned_function += corrected_token

    print("cleaned function:", cleaned_function)

    # create bag of words vector
    vec = [0]*(len(corpus)+1)
    for i in cleaned_function:
        if i in corpus:
            vec[corpus.index(i)] = 1
        else:
            vec[-1] = 1

    # find nearest neighbor
    min_value = float('inf')
    min_key = ""
    for key in pham_members:
        func_vec = function_vectors[key]
        dist = LA.norm(np.multiply(COUNT_FILTER,func_vec)-np.multiply(COUNT_FILTER,vec))
        if dist < min_value:
            min_value = dist
            min_key = key
            
    print("conversion:", min_key, min_value) 
    return min_key

In [140]:
df_genes = pd.read_csv("all_phage_genes.csv")
df_genes = df_genes.applymap(str)
df_genes["GeneNumber"] = df_genes["GeneNumber"].apply(pd.to_numeric)
df_genes["Function"] = df_genes["Function"].str.lower()
df_genes = df_genes.applymap(str)
df_genes.describe(include = 'all')
df_genes.head()

Unnamed: 0,GeneID,HostStrain,Cluster,Pham,Function,Translation,Orientation,PhageName,GeneNumber
0,20ES_CDS_1,Mycobacterium,A2,36676,,MYGTRSSAFWASQPGKFDVLNLRMTFPSTSAHEIPDLTATDFVPEN...,F,20ES,1
1,20ES_CDS_10,Mycobacterium,A2,34452,lysin b,MSLQVGSSGELVNRWIRVMKARFASYAGKLKEDGYFGLDDKAVQQE...,F,20ES,10
2,20ES_CDS_11,Mycobacterium,A2,34196,terminase,MSLENHHPELAPSPPHIIGPSWQRTVDGSWHLPDPKMTLGWGVLKW...,F,20ES,11
3,20ES_CDS_12,Mycobacterium,A2,37970,portal protein,MTAPLPGQEEIPDPAIARDEMISAFDDAVKNLKINTSYYEAERRPE...,F,20ES,12
4,20ES_CDS_13,Mycobacterium,A2,21454,capsid maturation protease,MITAAVAAYVQRFASMFTGPALSLGEWARFLQTLFPEVQRRYAQAA...,F,20ES,13


In [134]:
non_corresponding = []
for index, row in df_genes.iterrows():
    function = row["Function"]
    if function not in approved_functions and function != "nan":
        print(row["Function"])
        non_corresponding.append(row["Function"])

scaffold protein
major capsid subunit
major tail subunit
minor tail subunit
minor tail subunit
minor tail subunit
para
parb
terminase small subunit
thyx-like protein
endovii
esterase/lipase
hth binding protein
dnab-like helicase
recb-like protein
small terminase subunit
capsid protein
virion protein
major tail subunit
tapemeasure protein
virion protein
minor tail subunit
virion protein
large terminase subunit
virion protein
integrase
repressor
hth binding protein
dnaq
rece like protein
rect
putative hth binding protein
ruvc
whib
parb-like protein
capsid protein
major tail subunit
queuine-trna ribosyltransferase
tapemeasure protein
minor tail subunit
minor tail subunit
minor tail subunit
terminase small subunit
hth binding protein
terminase large subunit
putative holin
type iii restriction helicase
primase/helicase
ruvc
pe/ppe-like protein
terminase large subunit
scaffold protein
capsid protein
tail protein
minor tail subunit
minor tail subunit
para
hth dna binding protein
terminase sma

helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
terminase large subunit
terminase small subunit
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
dna recombinase
terminase small subunit
helix-turn-helix dna binding domain protein
terminase large subunit
integrase
rele-like toxin
relb-like antitoxin
cro protein
helix-turn-helix dna binding domain protein
mpme 1 protein
merr-like hth dna binding protein
helix-turn-helix dna binding protein
rect-like ssdna binding protein
helix-turn-helix dna binding protein
dna polymerase iii subunit
putative head decoration, structural
putative tail sheath protein
tapemeasure protein
putative minor tail protein
putative minor tail protein
putative baseplate j-like protein
putative minor tail protein
peptidyl-trna hydrolase
d-ala-d-ala carboxypeptidase
a

integrase
hth dna binding protein
dcmp deaminase
thyx
hth dna binding protein
esterase/lipase
dnab-like helicase
parb-like nuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
mazg-like nucleotide pyrophosphate
nucleotidyltransferase
exonuclease vii
mre11 double-strand break repair endo/exonuclease
rect-like ssdna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
cro protein
hth dna binding protein
hth dna binding domain protein
whib
dna polymerase iii subunit
terminase small subunit
terminase large subunit
resolvase
dna binding domain protein
hicb-like antitoxin
hica-like toxin
endonuclease
dsdna helicase
parb-like nuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna-binding domain protein
pura-like adenylosu

terminase small subunit
lysm-like endolysin
terminase large subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
head-to-tail connector complex protein
dna recombinase
parb-like nuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
parb-like nuclease domain protein
ribbon-helix-helix dna binding protein
tapemeasure protein
peptidyl-trna hydrolase
dsdna helicase
dnaj-like chaperonin
dna polymerase iii alpha subunit
dna recombinase
resolvase
nicotinamide ribosyltransferase
homing endonuclease domain protein
pura-like adenylosuccinate synthetase
nucleotidyltransferase
band-7-like membrane protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lysm-like endolysin
head-to-tail connector complex protein
integrase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna

recb-like exonuclease
helix-turn-helix dna binding protein
dnaq-like dna polymerase iii subunit
paps reductase-like protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
capsid morphogenesis protein
helix-turn-helix dna binding domain protein
dnaj-like chaperonin
asc-1 transcription coactivator
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
parb-like nuclease domain protein
ribbon-helix-helix dna binding protein
hica-like toxin
dna recombinase
parb-like nuclease domain protein
queuine-trna ribosyltransferase
quec-like queosine biosynthesis protein
qued-like queosine biosynthesis protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
quee-like queosine biosynthesis protein
ribbon-helix-helix dna binding domain protein
hica-like toxin
dna recombinase
d-ala-d-ala carboxypeptidase
dnaj-like chaperonin
rusa-l

terminase small subunit
terminase large subunit
integrase
integrase
helix-turn-helix dna binding protein
dna binding domain protein
helix-turn-helix dna binding protein
ssdna binding domain protein
dna binding domain protein
helix-turn-helix dna binding protein
terminase large subunit
head-to-tail connector
terminase small subunit
integrase
helix-turn-helix dna-binding domain protein
dsdna helicase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
dnaj-like chaperonin
helix-turn-helix dna binding domain protein
integrase
helix-turn-helix dna binding domain protein
merr-like hth dna binding protein
dna polymerase iii subunit
terminase small subunit
terminase large subunit
rusa-like resolvase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
hicb-like antitoxin
hica-like toxin
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-h

helix-turn-helix dna binding protein
rect-like ssdna binding protein
helix-turn-helix dna binding protein
terminase small subunit
head-to-tail connector protein
head-to-tail connector protein
major tail subunit
terminase large subunit
integrase
hth dna binding protein
hth dna binding domain protein
dna polymerase iii beta subunit
dna polymerase iii beta subunit
hth dna binding domain protein
resolvase
hth dna binding domain protein
head-to-tail connector protein
head-to-tail connector protein
helix-turn-helix dna binding protein
terminase large subunit
helix-turn-helix dna binding protein
n-acetyltransferase
rna binding domain protein
dnaj-like chaperonin
dna polymerase iii
rusa-like resolvase
galactosaminyltransferase
adenylosuccinate synthetase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
lysm-domain protein
terminase large subunit
terminase small subunit
helix-turn-helix dna-bindin

dnaj-like chaperonin
rusa-like resolvase
pura-like adenylosuccinate synthetase
band-7-like membrane protein
helix-turn-helix dna-binding domain protein
ribbon-helix-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
parb-like nuclease domain protein
lysm-like endolysin
endonuclease
tapemeasure
baseplate j
d-ala-d-ala carboxypeptidase
aaa-atpase (dnac-like)
dnab-like helicase
dnaj-like domain
dna polymerase iii, alpha
rf1 protein
histidine triad domain protein
reca-like protein
rusa-like protein
ro protein
glycosyl transferase
galactosyl transferase
galactosyl transferase
galactosyl transferase
glycosyl transferase
ser/thr kinase
pura-like protein
band 7 protein
nucleotidyltransferase
hth dna binding protein
parb-like domain protein
thyx-like domain protein
lysm domain protein
major capsid
rusa-like resolvase
dna polymerase iii sliding clamp
dna polymerase iii subunit
helix-

rusa-like resolvase
dna polymerase iii sliding clamp
dna polymerase iii subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
putative tail assembly/metal protease
putative head protein
putative major tail subunit
tapemeasure protein
asp-rich protein
nucleotidase
fabg/l-xylulose reductase
dna polymerase iii alpha subunit
thyx
vip2 protein
primase/polymerase
terminase large subunit
rect-like ssdna binding protein
terminase large subunit
terminase small subunit
merr-like helix-turn-helix dna binding domain protein
terminase small subunit
merr-like helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
terminase large subunit
head-to-tail connector protein
integrase
excisionase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
dsdna helicase
putative transcriptional regulator
major tail subunit
tapemeasure
terminase small subunit
d-ala-d-ala carboxypeptidase
terminase large subunit

head-to-tail connector protein
head-to-tail connector protein
head-to-tail connector protein
integrase
helix-turn-helix dna-binding domain protein
integrase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
dsdna helicase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
parb-like nuclease domain protein
polypeptide-n-acetylgalactosaminyltransferase
major tail subunit
tapemeasure
terminase small subunit
terminase large subunit
dna-q like exonuclease
integrase
cro protein
dna damage inducible protein d-like protein
hth domain dna binding protein
xis
whib
hth dna binding protein
hth dna binding protein
whib
reca
dna helicase/methylase
giy-yig endonuclease
terminase large subunit
helix-turn-helix dna-binding protein
endolysin, l-ala-d-glu peptidase domain
endolysin, n-acetylmuramoyl-l-alanine amidase domain
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna bin

exonuclease vii
mre11 double-strand break repair endo/exonuclease
rect-like ssdna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
mazg-like nucleotide pyrophosphate
tail fiber protein
terminase large subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
putative large subunit terminase
putative sgnh hydrolase
hnhc-like endonuclease
putative dna single-strand annealing protein
putative exonuclease
endonuclease
putative helicase
dna polymerase iii epsilon subunit
single stranded dna-binding protein
pset polynucleotide 5'-kinase and 3'-phosphatase
giy-yig endonuclease
metallophosphatase
holin/anti-holin
holin/anti-holin
tape measure
capsid protein
tail fiber protein
glycerophosphoryl diester phosphodiesterase
tail fiber protein
terminase large subunit
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
cro (control of repressor's operator)
dna helicase/dna methylase
terminase large 

helix-turn-helix dna binding domain protein
terminase large subunit
helix-turn-helix dna binding domain protein
major tail subunit
tapemeasure protein
minor tail subunit
minor tail subunit
minor tail subunit
minor tail subunit
integrase
whib
dnaq
nrdh
rusa
rtcb
integrase
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
tapemeasure protein
helix-turn-helix dna binding domain protein
dna binding, hu-like domain protein
helix-turn-helix dna binding domain protein
hica-like toxin
capsid morphogenesis protein
nucleotidyltransferase
terminase large subunit
rusa-like resolvase
dnaq-like dna polymerase iii subunit
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
ribbon-helix-helix dna binding domain protein
head-to-tail connector complex protein
integrase
helix-turn-helix dna binding protein
thymidylate synthase-like protein
nrdh glutaredoxin
helix

helix-turn-helix dna-binding protein
helix-turn-helix dna-binding protein
nucleotidyltransferase
nucleotidyltransferase
lysm-like endolysin
dnae-like dna polymerase iii
head-to-tail connector protein
lysin
dsdna helicase
endonuclease
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
terminase large subunit
dnmp kinase
dna recombinase
terminase large subunit
tapemeasure protein
minor tail subunit
hth dna binding protein
putative sprt-like protein
terminase small subunit
helix-turn-helix dna binding domain protein
terminase large subunit
dna polymerase iii subunit
merr-like hth dna binding protein
integrase
cro protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
merr-like hth dna binding protein
major capsid subunit
tapemeasure protein
integrase
deoxycytidinylate deaminase
parb-like nuclease domain protein
terminase large subunit
whib/hnh endonuclease
tail scaffold protein
tail scaffold protein
ta

recb-like exonuclease
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
dnaq-like dna polymerase iii subunit
paps reductase
paps reductase
helix-turn-helix dna binding protein
terminase large subunit
helix-turn-helix dna binding protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
pre-tmp frameshift protein
truncated pre-tmp frameshift protein
putative holin
lysa
terl
whib
lysb
mu gp29-like protein
dnan
oligoribonuclease
tyrosine recombinase/integrase
vwfa
aaa protein
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
head-to-tail connector complex protein
helix-turn-helix dna binding domain protein
terminase small subunit
integrase
erf family ssdna binding protein
termi

terminase large subunit
integrase
helix-turn-helix dna-binding domain protein
clp protease
dnaq-like exonuclease
reca
hnh endonuclease domain protein
major tail subunit
tapemeasure protein
nrdh-like protein
terminase small subunit
integrase
pentapepetide repeat family protein
erf protein
hnh endonuclease domain protein
terminase large subunit
dnab-like helicase
pnk
whib
terminase small subunit
terminase large subunit
ribbon-helix-helix dna binding domain protein
brnt-like toxin
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rect-like ssdna binding protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
hth dna binding protein
hnh homing endonuclease
dnaq-like exonuclease
reca
hth dna binding protein
head-tail connector
major tail subunit
tapemeasure
d-ala-d-ala carboxypeptidase
hth dna binding protein
nrdh-redox

dnaq-like dna polymerase iii subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
erf family ssdna binding protein
terminase large subunit
dnab-like dna helicase
helix-turn-helix dna binding domain protein
endolysin, l-ala-d-glu peptidase domain
endolysin, n-acetylmuramoyl-l-alanine amidase domain
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
queuine trna-ribosyltransferase
cro protein
helix-turn-helix dna binding protein
dna polymerase iii exonuclease
rusa-like resolvase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
clp protease
dnaq-like exonuclease
reca
parbc domain protein
major tail subunit
tapemeasure protein
minor tail subunit
minor tail subunit
minor tail subunit
minor tail subunit
d-ala-d-ala carboxypeptidase
nrdh
ls

lysm domain protein
terminase small subunit
major tail subunit
tapemeasure
d-ala-d-ala carboxypeptidase
terminase large subunit
dnaq-like exonuclease
capsid maturation protein
integrase
hth dna binding domain protein
whib
hth dna binding protein
aaa atpase
dna methyltransferase
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
terminase small subunit
terminase large subunit
erf family ssdna binding protein
dna polymerase iii subunit
membrane-like protein
pe-pgrs family protein
nlp/p60-family domain protein
baseplate j
peptidyl-trna hydrolase
argd
dnac
dnab-like helicase
primase
j domain protein
dna polymerase iii alpha
rf1 domain protein
reca
rusa
erpb-like protein
ro protein
pnk
ser/thr kinase
pura
nrdc
hth dna binding domain protein
l13e family protein
tat
parbc domain protein
thyx
lysm domain
lysm domain
head decoration protein-like protein
scaffold protein
tail chaperone
tapemeasure
minor tail subunit
para


helix-turn-helix dna-binding domain protein
terminase large subunit
helix-turn-helix dna binding domain protein
polypeptide n-acetylgalactosaminyltransferase
phosphoribosyltransferase
helix-turn-helix dna binding protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna methylase domain protein
helix-turn-helix dna binding protein
dna methyltransferase
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
queuine trna-ribosyltransferase
quec-like queosine biosynthesis protein
qued-like queosine biosynthesis protein
quee-like queosine biosynthesis protein
helix-turn-helix dna binding domain protein
tapemeasure protein
resolvase
helicase loader
dsdna helicase
dna polymerase iii alpha
dna recombinase
rna polymease sigma factor
glycosyl hydrolase
terminase small subunit
hth d

helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
parb-like nuclease domain protein
lysm-like endolysin
endonuclease
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
ribbon-helix-helix dna binding domain protein
terminase large subunit
relb-like antitoxin
rele-like toxin
hicb-like antitoxin
rect-like ssdna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
terminase large subunit
terminase small subunit
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding protein
hica-like toxin
tail protein
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
terminase small subunit
d-ala-d-ala carboxypeptidase
phosphoestertase
endonuclease viii
dsdna helicase
major tail subunit
integrase
terminase large subunit
dna binding domain protein
rece
rect
whi

helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
dnaq-like exonuclease
integrase
cro protein
helix-turn-helix dna binding domain protein
dna helicase/methylase
mpme1 protein
helix-turn-helix dna binding domain protein
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dnae-like dna polymerase iii
vip2-like toxin
helix-turn-helix dna binding domain protein
zinc-finger binding domain protein
hnh endonuclease domain protein
tapemeasure
tail lysozyme
baseplate j
hnh endonuclease domain protein
d-alanyl-d-alanine carbooxypeptidase
argd
dnac-like protein
dnab-like helicase
dnaj domain protein
dna polymerase iii alpha subunit
rf1 protein
reca-like protein
hnh endonuclease domain protein
rusa-like protein
histidine triad domain protein
ro
glycosy

dsdna helicase
parb-like nuclease domain protein
hnh endonuclease domain protein
merr-like hth dna binding protein
dnae-like dna polymerase iii
terminase small subunit
tapemeasure protein
terminase large subunit
integrase
rect-like ssdna binding protein
rusa-like resolvase
band-7 like protein
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding domain protein
merr-like hth dna binding protein
ribbon-helix-helix dna binding domain protein
terminase large subunit
tapemeasure protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
terminase large subunit
integrase
terminase small subunit
helix-turn-helix dna binding domain protein
dsdna helicase
helix-turn-helix dna binding domain protein


atp binding domain protein
dna binding domain protein
trigger factor
parb-like nuclease domain protein
helix-turn-helix dna binding protein
head-to-tail connector complex protein
head-to-tail connector complex protein
head-to-tail connector complex protein
hth dna binding protein
helix-turn-helix dna binding domain protein
integrase
terminase small subunit
deoxynucleotide monophosphate kinase
terminase large subunit
dnaq-like dna polymerase iii subunit
helix-turn-helix dna binding domain protein
d-ala-d-ala carboxypeptidase
helix-turn-helix dna binding domain protein
terminase small subunit
integrase
erf family ssdna binding protein
terminase large subunit
helix-turn-helix dna binding domain protein
recb-like exonuclease
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
dnaq-like dna polymerase iii subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
terminase small subunit
helix-turn-helix dna binding protein
terminase small subunit

helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
recb-like exonuclease
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
dnaq-like dna polymerase iii subunit
terminase small subunit
terminase large subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
queuine-trna ribosyltransferase
quec-like queosine biosynthesis protein
qued-like queosine biosynthesis protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
quee-like queosine biosynthesis protein
ribbon-helix-helix dna binding domain protein
hica-like toxin
scaffold protein
head-to-tail connector protein
tail assembly chaperone protein
dnab-like helicase
d-ala-d-ala carboxypeptidase
helix-turn-helix dna binding protein
imm-like protein
helix-turn-helix dna binding domain protein
integrase
cro protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna bindi

helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
head-to-tail connector protein
tail protein
integrase
dna binding domain protein
cytidine deaminase
hth dna binding domain protein
esterase/lipase
phosphoribosyltransferase
dnab-like helicase
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
hica-like toxin
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
erf family ssdna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
helix-turn-helix dna binding domain protein
d-ala-d-ala carboxypeptidase
helix-turn-helix dna binding domain protein
integrase
terminase small subunit
erf family ssdna binding protein
terminase large subunit
helix-turn-helix dna binding protein
terminase 

cmp deaminase
riia-like protector protein
riib-like protein
lsr2-like protein
nucleotidyltransferase
pas34-like protein
putative restriction endonuclease
putative endolysin
tapemeasure
dna polymerase iii
mazg
reca-like recombinase
lsr2-like protein
exonuclease/helicase
terminase large subunit
dna polymerase iii sliding clamp
helix-turn-helix dna binding domain protein
rect-like ssdna binding protein
rusa-like resolvase
terminase small subunit
terminase large subunit
rusa-like resolvase
erf family ssdna binding protein
parb-like nuclease domain protein
terminase small subunit
integrase
hth dna binding protein
hth dna binding protein
purine phosphoribosyltransferase
dsdna helicase
helix-turn-helix dna binding domain protein
tapemeasure protein
integrase
nrdh
dnab-like helicase
integrase
terminase large subunit
dna polymerase iii subunit
abc transporter atp-binding protein
capsid and capsid maturation protease
small terminase subunit
terminase large subunit
integrase
rece
rect
ftsk
rusa
h

erf family ssdna binding protein
rusa-like resolvase
dnaq-like dna polymerase iii subunit
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
erf family ssdna binding protein
parb-like nuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding domain protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
recb-like exonuclease
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
dnaq-like dna polymerase iii subunit
paps reductase-like protein
helix-turn-helix dna binding protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna bindi

integrase
helix-turn-helix dna binding domain protein
dna-directed rna polymerase subunit alpha
ribonuclease reductase
helix-turn-helix dna binding protein
sprt-like protein
terminase small subunit
helix-turn-helix dna binding domain protein
terminase large subunit
dna polymerase iii subunit
integrase
cro protein
hth dna binding protein
hth dna binding protein
dna recombinase
dna helicase/methylase
hth dna binding protein
dna cytosine methylase
dna cytosine methylase
nucleotidyltransferase
terminase large subunit
erf family ssdna binding protein
parb-like nuclease domain protein
rusa-like resolvase
dnaq-like dna polymerase iii subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
tapemeasure protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
queuine trna-ribosyltran

endolysin, l-ala-d-glu peptidase domain
endolysin, n-acetylmuramoyl-l-alanine amidase domain
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
helix-turn-helix dna binding domain protein
portal
scaffolding
major capsid
tapemeasure
minor tail subunit
integrase (y-int)
hth dna binding
thyx-like
nrdh-like
endo vii
minor tail subunit
dnab-like helicase
recb-like
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
ribbon-helix-helix dna binding domain protein
terminase large subunit
head-to-tail connector protein
head-to-tail connector protein
head-to-tail connector protein
dna bridging protein
para dsdna partitioning protein
parb dsdna partitioning protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dsdna helicase
rusa-like resolvase
dna poly

tapemeasure protein
helix-turn-helix dna binding protein
terminase large subunit
arda-like antirestriction protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
lysm-like endolysin
helix-turn-helix dna binding domain protein
terminase small subunit
dnae-like dna polymerase iii
terminase large subunit
terminase small subunit
nucleotidyltransferase
terminase large subunit
cro protein
band-7-like membrane protein
erf family ssdna binding protein
rusa-like resolvase
hnh domain protein
terminase large subunit
scaffold protein
terminase small subunit
tapemeasure protein
integrase
cytidine deaminiase
dna polymerase iii
hth domain protein
rdf protein
dnab-like helicase
repressor
dna methyltransferase
sprt
terminase large subunit
helix-turn-helix dna binding domain protein
acetyltransferase domain protein
terminase small subunit
terminase large subunit
terminase small subunit
major t

baseplate i protein
peptidyl-trna hydrolase
d-ala d-ala carboxypeptidase
helicase loader
dnaj-like chaperonin
dna polymerase iii alpha subunit
histidine triad protein
reca ldna recombinase
homing endonuclease
endonuclease
rna polymerase
ro rna binding protein
head decoration protein
pura-like adenylosuccinate synthetase
hth dna binding domain protein
lysm-like endolysin
zinc-finger dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
queuine-trna ribosyltransferase
quec-like queosine biosynthesis protein
qued-like queosine biosynthesis protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
quee-like queosine biosynthesis protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
dnaq-like exonuclease
helix-turn-helix dna binding domain protein
integrase
helix-turn-helix dna binding domain protein
h

terminase small subunit
nucleotidyltransferase
cro protein
band-7-like membrane protein
terminase large subunit
erf family ssdna binding protein
parb-like nuclease domain protein
rusa-like resolvase
dnaq-like dna polymerase iii subunit
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
rect-like ssdna binding protein
helix-turn-helix dna-binding domain protein
rusa-like resolvase
helix-turn-helix dna-binding domain protein
endonuclease
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
cro protein
helix-turn-helix dna binding protein
rusa-like resolvase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain 

helix-turn-helix dna binding domain protein
tapemeasure protein
d-ala-d-ala carboxypeptidase
integrase
endovii
dnab-like helicase
terminase small subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
queuine trna-ribosyltransferase
integrase
cro protein
helix-turn-helix dna binding domain protein
whib family transcription regulator
dna polymerase iii subunit
hica-like toxin in toxin/antitoxin system
resolvase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
erf family ssdna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
endonuclease
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
terminase small subunit
h

tapemeasure
minor tail subunit
minor tail subunit
integrase
hth dna binding domain
endovii
phosphoribosyltransferase
dnab-like helicase
methylase domain protein
sprt
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
peptidyl-trna hydrolase
d-ala d-ala carboxypeptidase
dnaj-like chaperonin
dnae-like dna polymerase iii alpha
resolvase
pura-like adenylosuccinate synthetase
band-7-like membrane protein
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding protein
lysm-like endolysin
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
terminase large subunit
queuine trna-ribosyltransferase
queuine trna-ribosyltransferase
quec-like queosine biosynthesis protein
rusa-like resolvase
erf family ssdna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
qued-like queosine biosynthesis protein
quee-like queosine biosynthesis protein
queuine-tr

dna polymerase iii subunit
helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dnaq-like dna polymerase iii subunit
terminase small subunit
nucleotidyltransferase
terminase large subunit
cro protein
erf family ssdna binding protein
rusa-like resolvase
band-7-like membrane protein
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
dnae-like dna polymerase iii
terminase small subunit
dna polymerase iii subunit
helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
cro protein
helix-turn-helix dna binding domain protein
mpme 1 protein
trna synthetase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna-binding domain protein
pura-like adenylosucc

lysa
hth domain protein
recb
p-loop motif protein
zinc binding cmp, dcmp deaminase
dutpase
terminase large subunit
reca
hnh endonuclease domain protein
hth domain protein
capsid
helix-turn-helix dna binding domain protein
tapemeasure protein
helix-turn-helix dna binding domain protein
dna repair protein
rect-like ssdna binding protein
rusa-like resolvase
helix-turn-helix dna binding protein
hica-like antitoxin
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
queuine trna-ribosyltransferase
parb-like nuclease domain protein
quec-like queosine biosynthesis protein
qued-like queosine biosynthesis protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
queuine trna-ribosyltransferase
integrase
cro protein
dna polymerase iii subunit
resolvase
helix-turn-helix dna binding domain protein
sprt protease
atp binding cassette-like protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-h

hnh endonuclease domain protein
peptidyl-trna hydrolase
d-ala-d-ala carboxypeptidase
argd
dnac-like protein
dnab-like helicase
dnaj domain protein
dna polymerase iii alpha chain-like protein
peptide release factor 1
reca-like protein
rusa-like protein
hnh endonuclease domain protein
histidine triad domain protein
galactosyl transferase
pura-like protein
nrdc-like protein
band 7 protein
hth dna binding domain protein
parb-like nuclease domain protein
hnh endonuclease domain protein
thyx-like protein
hnh endonuclease domain protein
lysm domain protein
zinc-finger dna binding domain protein
dnae-like dna polymerase iii
dnae-like dna polymerase iii
terminase large subunit
capsid maturation protein
head-to-tail connector protein
head-to-tail connector protein
head-to-tail connector protein
integrase
dna binding domain protein
esterase/lipase
helix-turn-helix dna binding protein
dnab-like helicase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
lamd-li

head-to-tail connector protein
integrase
dsdna helicase
helix-turn-helix dna binding domain protein
superinfection immunity protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
tapemeasure protein
integrase
helix-turn-helix dna-binding domain protein
putative immunity repressor
terminase large subunit
hica-like toxin
dna-directed rna polymerase subunit alpha
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
n-acetyltransferase
d-ala-d-ala carboxypeptidase
rna binding domain protein
dnaj-like chaperonin
ssdna binding domain protein
dna polymerase iii (alpha)
rusa-like resolvase
adenylosuccinate synthetase
nucleotidyltransferase
helix-turn-helix dna binding domain protein
ribbon-helix-helix dna binding 

queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
head-to-tail connector protein
dsdna helicase
helix-turn-helix dna binding domain protein
dna bridging protein
integrase
dna polymerase iii alpha
terminase large subunit
terminase large subunit
head-to-tail connector protein
terminase small subunit
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dsdna helicase
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
helix-turn-helix dna binding domain protein
integrase
cro protein
helix-turn-helix dna binding domain protein
chitosanase
dna polymerase/primase
terminase large subunit
atp-dependent helicase
hth dna binding domain protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
head-to-tail connector complex protein
head-to-tai

repressor
terminase large subunit
scaffold protein
terminase small subunit
repressor
small terminase
tape measure
minor tail subunit
tail protein
large terminase
amidase
sigma factor like
endonuclease
scaffold protein
major head protein
head-to-tail connector protein
dsdna helicase
endonuclease
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
dna polymerase/primase
drpa-like dna processing chain a
endolysin, protease m23 domain
terminase large subunit
helix-turn-helix dna binding domain protein
terminase small subunit
parb-like nuclease domain protein
integrase
rect-like ssdna binding protein
dna-directed rna polymerase subunit
relb-like antitoxin
rele-like toxin
dna recombinase
terminase large subunit
head-to-tail connector protein
terminase small subunit
integrase
hth dna binding protein
hth dna binding protein
thyx
hth dna binding protein
endonuclease
phosphoribosyltransferase
dnab-like helicase
dcmp deanimase
dnab-like helicase
termi

helix-turn-helix dna binding domain protein
dsdna helicase
head-to-tail connector protein
head-to-tail connector protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dsdna helicase
helicase loader
dsdna helicase
dnaj-like chaperonin
dna recombinase
helix-turn-helix dna binding domain protein
aaa atpase
nucleotidyltransferase
helix-turn-helix dna binding domain protein
lysm-like endolysin
terminase large subunit
terminase small subunit
integrase
hth dna binding protein
hth dna binding protein
ribonucleoside-diphosphate reductase
hth dna binding protein
homing endonuclease
phosphoribosyltransferase
dsdna helicase
transmembrane protein
helix-turn-helix dna binding domain protein
cas4 family nuclease
terminase large subunit
scaffold protein
terminase small subunit
parb-like nuclease domain protein
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
helix-turn-helix dna binding domain protein
d-ala-d-ala carboxypeptidas

putative portal protein
putative dna primase
putative dna primase
putative dna helicase
putative exonuclease
putative endonuclease
putative scaffold protein
putative major head protein
putative excinuclease
putative major tail protein
putative tape measure protein
putative minor tail subunit
putative protease
putative collagen-like protein
putative terminase
putative amidase
putative holin
putative sigma factor
putative portal protein
putative dna primase
putative dna primase
putative dna helicase
putative exonuclease
putative endonuclease
putative scaffold protein
putative major head protein
putative excinuclease
putative major tail protein
putative tape measure protein
putative minor tail subunit
putative protease
putative collagen-like protein
putative terminase
putative amidase
putative holin
putative sigma factor
putative portal protein
putative dna primase
putative dna primase
putative dna helicase
putative exonuclease
putative endonuclease
putative scaffold protein
putative majo

queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna binding domain protein
terminase large subunit
terminase small subunit
d-ala-d-ala carboxypeptidase
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
imm-like protein
head-to-tail connector complex protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rect-like ssdna binding protein
tapemeasure
hnh domain protein
integrase
sigma-k factor
endovii
nrdh
dnab-like protein
hth dna binding domain protein
dna binding domain protein
helix-turn-helix dna binding protein
terminase large subunit
scaffold protein
terminase small subunit
tapemeasure protein
baseplate j
peptidyl-trna hydrolase
d-ala-d-ala carboxypeptidase
argd
dnac
dnab-like helicase
j domain protein
dna polymerase iii alpha
if1 protein
reca
rusa
ro protein
pnk
ser/thr 

helix-turn-helix dna binding protein
dnaq-like dna polymerase iii subunit
paps reductase-like protein
helix-turn-helix dna binding protein
clp protease
capsid protein
polymerase iii epsilon
reca
major tail subunit
endovii
tapemeasure protein
lsr2
terminase small subunit
integrase
terminase large subunit
hnhc protein
dnab
pnk
whib/hnh endonuclease
helix-turn-helix dna binding domain protein
adp-ribosyltransferase domain and muf-like fusion protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
tapemeasure protein
dna polymerase iii alpha subunit
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
integrase
hth dna binding protein
hth dna binding protein
dsdna helicase
queuine trna-ribosyltransfera

lysm domain protein
terminase large subunit
exonuclease/helicase
helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
d-ala-d-ala carboxypeptidase
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
imm-like protein
terminase large subunit
single-stranded dna binding protein
dnaq-like protein
reca
major tail subunit
tapemeasure protein
nrdh
lsr2
integrase
erf
dnab
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
phosphodiesterase
helix-turn-helix dna binding domain protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
rusa-like resolvase
helix-turn-helix dna binding domain protein
major tail subunit
tapemeasure protein
putative exo
putative excisionase
integrase
head 

terminase small subunit
terminase large subunit, atpase domain
terminase large subunit, nuclease domain
lipase
integrase
helix-turn-helix dna-binding domain protein
integrase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
ssdna-binding protein
dna helicase/methylase
helix-turn-helix dna-binding domain protein
dna polymerase iii beta subunit
dna binding domain protein
rusa-like resolvase
queuine-trna ribosyltransferase
deacetylase
hth dna binding protein
rnase e
terminase large subunit
capsid & capsid maturation protease
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
hnh endonuclease domain protein
tapemeasure protein
minor tail subunit
minor tail subunit
carboxypeptidase
integrase
recombination endonuclease vii
nrdh-like protein
dnab-like helicase
putative trans-sialidase
dnae-like dna polymerase iii
dnae-like dna polymerase iii
queuine trna-ribosyltransferase
helix-tu

adenylosuccinate synthetase
band-7-like membrane protein
helix-turn-helix dna binding protein
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
lysm-like endolysin
capsid morphogenesis protein
lysin
hth dna binding domain protein
resolvase
dna translocase
terminase large subunit
ssdna binding domain protein
parb-like nuclease domain protein
rnase
capsid protein
parb-like nuclease domain protein
hth dna binding domain protein
paps reductase
tapemeasure protein
parb-like domain protein
hth dna binding domain protein
host nuclease inhibitior
atpase
hth dna binding domain protein
hth dna binding domain protein
hth dna binding domain protein
hth dna binding domain protein
hnh endonuclease domain protein
hth dna binding domain protein
amidoligase
glutamine amidotransferase domain protein
homing endonuclease
terminase small subunit
integrase
helix-turn-helix dna binding protei

terminase large subunit
tapemeasure
integrase
dna polymerase 1
rdf protein
endovii
nrdh
dnab-like helicase
repressor
dnaj-like protein
ser/thr kinase
terminase small subunit
d-ala-d-ala-decarboxylase
terminase large subunit
dnaq-like exonuclease
hth dna binding protein
integrase
cro protein
nkf protein
terminase small subunit
dna replication protein
nkf protein
nkf protein
nkf protein
nkf protein
hth dna binding protein
hth dna binding protein
whib family transcription regulator
hnh endonuclease domain protein
dna endonuclease
dna methylase domain protein
terminase small subunit
terminase large subunit, atpase domain
terminase large subunit, nuclease domain
helix-turn-helix dna-binding domain protein
integrase
helix-turn-helix dna-binding domain protein
integrase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
ssdna-binding protein
dna helicase/methylase
helix-turn-helix dna-binding domain protein
dna p

n-acetyltransferase
d-ala-d-ala carboxypeptidase
rna binding domain protein
dnaj-like chaperonin
ssdna binding domain protein
dna polymerase iii (alpha)
rusa-like resolvase
galactosaminyltransferase
adenylosuccinate synthetase
nucleotidyltransferase
helix-turn-helix dna binding protein
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lysm domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
erf family ssdna binding protein
ku-like dsdna break binding protein
rusa-like resolvase
helix-turn-helix dna binding domain protein
terminase large subunit
hicb-like antitoxin
hica-like toxin
rect-like ssdna binding protein
tail lysozyme
baseplate j
peptidyl-trna hydrolase
d-alanyl-d-alanine carboxyp

terminase large subunit
terminase small subunit
helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
lysin
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
head-to-tail connector protein
head-to-tail connector protein
scaffold protein
capsid protein
tail scaffold protein
tail scaffold protein
tapemeasure protein
nrdc
dna topoisomerase primase
dnab-like helicase
putative repressor
queuine trna-ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
dna polymerase iii sliding clamp
dna polymerase iii subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
integrase
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
helix

homing endonuclease
nucleotidyltransferase
band-7-like membrane protein
lysm-like endolysin
dna bridging protein
dnae-like dna polymerase iii alpha
head-to-tail connector
head-to-tail connector
head-to-tail connector
integrase
cytidine deaminase
hnh dna binding protein
hth dna binding domain protein
glutaredoxin-like protein
minor tail subunit
esterase/lipase
dnab-like helicase
helix-turn-helix dna binding domain protein
integrase
cytidine deaminase
hth dna binding protein
hth dna binding domain protein
endovii
phosphoribosyltransferase
dnab-like helicase
sprt-like protein
homing endonuclease
helix-turn-helix dna-binding domain protein
terminase large subunit
terminase small subunit
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding protein
terminase large subunit
terminase large subunit
helix-turn-helix dna-binding domain protein
terminase small subunit
helix-turn-helix dna-binding domain protein
parb-like nuclease domain protein
helix-turn-helix dna-binding d

band7-like membrane protein
cmp deaminase
riia-like protector protein
riib-like protein
lsr2-like protein
nucleotidyltransferase
lysm-like endolysin
tapemeasure protein
dsdna helicase
dna polymerase iii alpha
dna bridging protein
dna recombinase
terminase large subunit
terminase associated hnh endonuclease
nucleotidyltransferase
head-to-tail connector complex protein
head-to-tail connector complex protein
dna polymerase iii alpha
dna polymerase iii alpha
resolvase
putative tail chaperone protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
tapemeasure protein
helix-turn-helix dna-binding domain protein
dnaq-like exonuclease
integrase
rele-like toxin
relb-like antitoxin
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
terminase small subunit
merr-like he

terminase small subunit
terminase large subunit
integrase
rect-like ssdna binding protein
rusa-like resolvase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna recombinase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding domain protein
head-to-tail connector protein
lysin
dsdna helicase
endonuclease
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
terminase large subunit
dnmp kinase
queuine trna-ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
ribbon-helix-helix dna binding protein
ribbon-helix-helix 

helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
rect-like ssdna binding protein
helix-turn-helix dna binding protein
head-to-tail connector protein
head-to-tail connector protein
integrase
deoxcytidylate deaminase
hth dna binding domain protein
helix-turn-helix dna binding domain protein
dsdna helicase
helix-turn-helix dna binding domain protein
dnaq-like exonuclease
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
cas4-like exonuclease
erf family ssdna binding protein
dnab-like helicase
terminase large subunit
helix-turn-helix dna binding domain protein
integrase
merr-like hth dna binding protein
head-to-tail connector complex protein
head-to-tail connector complex protein
lysm-like endolysin
dsdna helicase
endonuclease
helix-turn-hel

terminase large subunit
hypothetical function
cas4-like exonuclease
head-to-tail connector complex protein
head-to-tail connector complex protein
head-to-tail connector complex protein
terminase small subunit
hth dna binding protein
terminase large subunit
hth dna binding protein
integrase
queuine trna-ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
phospholipase
helix-turn-helix dna binding domain protein
lysm-like endolysin
helix-turn-helix dna binding domain protein
terminase small subunit
dnae-like dna polymerase iii
terminase large subunit
terminase large subunit
helix-turn-helix dna-binding domain protein
terminase small subunit
hth dna binding protein
dna polymerase iii subunit
dna recombinase
helix-turn-helix dna binding domain protein
head-to-tail connector protein
hth dna binding protein
membrane domain protein
membrane domain protein
dna bridging protein
membrane domain protein
integrase
membrane domain protein
membr

dna methyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
nicotinamide riboside transporter
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
glycerophosphodiester phosphodiesterase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rect-like ssdna binding protein
putative zinc peptidase
major tail subunit
tapemeasure
queuine trna-ribosyltransferase
integrase y-int
hth dna binding protein
hth dna binding protein
whib-like protein
terminase small subunit
dnaq-like protein
terminase large subunit
nrdh-like protein
rusa-like protein
putative pe family protein
sprt-like protein
rtcb-like protein
ycfa-like protein
queuine trna-ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna-binding domain protein
pura-like adenylosucc

helix-turn-helix dna binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
terminase large subunit
tapemeasure protein
terminase small subunit
dnab-like dna helicase
helix-turn-helix dna binding domain protein
glycosyl hydrolase
dnaq-like dna polymerase iii subunit
parb-like nuclease domain protein
head-to-tail connector protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dsdna helicase
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
single-stranded dna binding protein
dna polymerase iii alpha
dna polymerase iii alpha
dna recombinase
dna binding domain protein
terminase large subunit
terminase associated hnh endonuclease
phosphohydro

head-to-tail connector protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dsdna helicase
terminase large subunit
head-to-tail connector protein
head-to-tail connector protein
tapemeasure protein
helix-turn-helix dna binding domain protein
terminase small subunit
terminase small subunit
nucleotidyltransferase
cro protein
terminase large subunit
erf family ssdna binding protein
rusa-like resolvase
dnaq-like dna polymerase iii subunit
terminase small subunit
terminase large subunit
integrase
helix-turn-helix dna binding domain protein
cro protein
helix-turn-helix dna binding domain protein
resolvase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
queuine trna-ribosyltransferase
helix-turn-helix dna binding domain protein
hth dna binding protein
helix-turn-helix dna binding domain protein
terminase small subunit
integrase
helix-turn-helix dna binding do

rusa-like resolvase
helix-turn-helix dna binding domain protein
dnaq-like dna polymerase iii subunit
lysa
terminase large subunit
tapemeasure protein
dcmp deaminase
hth domain protein
terminase small subunit
endovii
sprt-like protein
whib transcriptional regulator
terminase large subunit
minor tail subunit
minor tail subunit
minor tail subunit
d-ala-d-ala-carboxypeptidase
minor tail subunit
minor tail subunit
hth dna binding protein
dna polymerase iii
ku protein
parb-like protein
d-ala-d-ala carboxypeptidase
dnaj-like chaperonin
rusa-like resolvase
pura-like adenylosuccinate synthetase
band-7-like membrane protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
parb-like nuclease domain protein
lysm-like endolysin
terminase small subunit
terminase large subunit
lysin a, peptidase domain
helix-turn-helix dna binding domain protein
integrase
dna binding domain protein
helix-turn-helix dna binding domain p

In [135]:
unique, counts = np.unique(non_corresponding, return_counts=True)

In [127]:
zipped = zip(unique, counts) 
  
# Converting to list 
zipped = list(zipped) 
  
unapproved_functions = sorted(zipped, key = lambda x: x[1], reverse=True)

In [162]:
for target_function, count in unapproved_functions[:100]:
    phams = list(df_genes[df_genes["Function"] == target_function]["Pham"])
    official_functions = {}
    for pham in phams:
        functions_pham = list(df_genes[df_genes["Pham"] == pham]["Function"])
        for func in functions_pham:
            if func in approved_functions:
                if func not in official_functions:
                    official_functions[func] = 1
                else:
                    official_functions[func] += 1
    
    out_function = find_approved_function(target_function, official_functions.keys())
    
    if out_function in official_functions:
        print("APPROVED")
    else:
        print("NOT FOUND")
    print()


input: helix-turn-helix dna binding domain protein
cleaned function: ['helix-turn-helix', 'dna', 'binding', 'domain', 'protein']
conversion: helix-turn-helix dna binding domain 0.02127659574468085
APPROVED

input: terminase large subunit
cleaned function: ['terminase', 'large', 'subunit']
conversion: terminase, large subunit 0.02857142857142857
APPROVED

input: helix-turn-helix dna binding protein
cleaned function: ['helix-turn-helix', 'dna', 'binding', 'protein']
conversion: helix-turn-helix dna binding domain 0.06255318648110038
APPROVED

input: terminase small subunit
cleaned function: ['terminase', 'small', 'subunit']
conversion: terminase, small subunit 0.02857142857142857
APPROVED

input: integrase
cleaned function: ['integrase']
conversion: tyrosine integrase 0.3333333333333333
APPROVED

input: helix-turn-helix dna-binding domain protein
cleaned function: ['helix-turn-helix', 'dna-binding', 'domain', 'protein']
conversion: helix-turn-helix dna binding domain 0.09088571096733604


input: n-acetyltransferase
cleaned function: ['acetyltransferase']
conversion: acetyltransferase 0.0
APPROVED

input: nrdh
cleaned function: ['nrdh-like', 'glutaredoxin']
conversion: nrdh-like glutaredoxin 0.0
APPROVED

input: ssdna binding domain protein
cleaned function: ['ssdna', 'binding', 'domain', 'protein']
conversion: ssdna binding protein 0.058823529411764705
APPROVED

input: dna polymerase iii (alpha)
cleaned function: ['dna', 'polymerase', 'iii', '(', 'alpha', ')']
conversion: dnae-like dna polymerase iii (alpha) 1.0
APPROVED

input: recb-like exonuclease
cleaned function: ['recb-like', 'exonuclease']
conversion: exonuclease 1.0
APPROVED

input: thyx
cleaned function: ['thyx-like', 'thymidylate', 'synthase']
conversion: thyx-like thymidylate synthase 0.0
APPROVED

input: galactosaminyltransferase
cleaned function: ['galactosaminyltransferase']
conversion: glycosyltransferase 1.0
APPROVED

input: lysm domain protein
cleaned function: ['lysm-like', 'domain', 'protein']
convers