Date Created: 09/27/20
## Goal of Notebook: Manual Cleaning Pipeline
Create a manual system for cleaning up ambiguous gene function annotations.

In [1]:
# importing the requests library 
import requests 
import pandas as pd
from random import sample 
import matplotlib
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns                                                             
import sys
import scipy
import nltk

In [2]:
# Cleaning step: lowercase the approved function names.
# Rows with no value in "Approved Function" are dropped first so that only
# real names reach .lower() (assumes the remaining values are strings).
df_approved_functions = pd.read_csv("Approved_Functions.csv").dropna(
    subset=["Approved Function"]
)

approved_functions = [
    name.lower() for name in df_approved_functions["Approved Function"]
]
approved_functions

['terminase, small subunit',
 'terminase',
 'terminase, large subunit',
 'terminase, large subunit (atpase domain)',
 'terminase, large subunit (nuclease domain)',
 'portal protein',
 'portal and muf-like fusion protein',
 'scaffolding protein',
 'capsid maturation protease',
 'major capsid protein',
 'minor capsid protein',
 'muf-like minor capsid protein',
 'capsid decoration protein, lamd-like',
 'capsid maturation protease and muf-like fusion protein',
 'major capsid and protease fusion protein',
 'head fiber protein',
 'head-to-tail adaptor',
 'head-to-tail stopper',
 'tail terminator',
 'major tail protein',
 'tail assembly chaperone',
 'tape measure protein',
 'minor tail protein',
 'minor tail protein, d-ala-d-ala carboxypeptidase',
 'tail sheath protein',
 'tail fiber',
 'tailspike protein',
 'tail needle protein',
 'baseplate j protein',
 'tail tube protein',
 'baseplate wedge protein',
 'capsid decoration protein',
 'lysin a',
 'lysin a, protease m15 domain',
 'lysin a, prot

In [3]:
# Disabled exploration, kept for reference: splits every approved function
# name into tokens with nltk.word_tokenize and plots a histogram of those
# tokens. Nothing in this cell runs while it stays commented out.
# func_units = []
# for i in approved_functions:
#     func_units += nltk.word_tokenize(i)

# fig, ax1 = plt.subplots(figsize = (20,6))
# plt.hist(func_units)
# plt.xticks(rotation='vertical')
# plt.show()

In [4]:
# Words treated as filler in a function name (presumably removed during
# cleaning; the code that consumes this list is not in this part of the notebook).
# Entries are lowercase and correctly spelled so they can match the
# lowercased function strings used everywhere else in the notebook.
filler_words = ["domain", "protein", "dna-binding"]

# Abbreviations mapped to their spelled-out form. The expansion of "tmp" must
# read "tape measure protein" to line up with the approved function list.
acronyms = {
    "hth": "helix-turn-helix",
    "tmp": "tape measure protein",
    "rdf": "recombination directionality factor",
}

Create a conversion list:

In [5]:
# Load every phage gene and hold all columns as strings.
# Casting with str turns missing values into the literal string "nan", which
# is the sentinel that later comparisons against Function rely on.
# NOTE(review): DataFrame.applymap is deprecated from pandas 2.1 in favour of
# DataFrame.map; kept here so the notebook still runs on older pandas.
df_genes = pd.read_csv("all_phage_genes.csv").applymap(str)

# Parse GeneNumber as a number (raises if a value is not numeric) and store it
# back as text, so the frame stays all-string exactly as before. One vectorised
# to_numeric call replaces the per-element apply, and only this column needs
# re-stringifying because every other column is already str.
df_genes["GeneNumber"] = pd.to_numeric(df_genes["GeneNumber"]).map(str)

# Lowercase the annotations so they compare equal to the lowercased approved list.
df_genes["Function"] = df_genes["Function"].str.lower()

df_genes.head()

Unnamed: 0,GeneID,HostStrain,Cluster,Pham,Function,Translation,Orientation,PhageName,GeneNumber
0,20ES_CDS_1,Mycobacterium,A2,36676,,MYGTRSSAFWASQPGKFDVLNLRMTFPSTSAHEIPDLTATDFVPEN...,F,20ES,1
1,20ES_CDS_10,Mycobacterium,A2,34452,lysin b,MSLQVGSSGELVNRWIRVMKARFASYAGKLKEDGYFGLDDKAVQQE...,F,20ES,10
2,20ES_CDS_11,Mycobacterium,A2,34196,terminase,MSLENHHPELAPSPPHIIGPSWQRTVDGSWHLPDPKMTLGWGVLKW...,F,20ES,11
3,20ES_CDS_12,Mycobacterium,A2,37970,portal protein,MTAPLPGQEEIPDPAIARDEMISAFDDAVKNLKINTSYYEAERRPE...,F,20ES,12
4,20ES_CDS_13,Mycobacterium,A2,21454,capsid maturation protease,MITAAVAAYVQRFASMFTGPALSLGEWARFLQTLFPEVQRRYAQAA...,F,20ES,13


In [6]:
# Collect, in row order and with duplicates, every gene function that is
# neither on the approved list nor the "nan" placeholder left by the str cast
# of missing values when df_genes was loaded.
approved_lookup = set(approved_functions)  # constant-time membership instead of scanning the list per row
gene_functions = df_genes["Function"]

is_unapproved = ~gene_functions.isin(approved_lookup) & (gene_functions != "nan")
non_corresponding = gene_functions[is_unapproved].tolist()

# Same output as before: one line per offending row.
for function in non_corresponding:
    print(function)

scaffold protein
major capsid subunit
major tail subunit
minor tail subunit
minor tail subunit
minor tail subunit
para
parb
terminase small subunit
thyx-like protein
endovii
esterase/lipase
hth binding protein
dnab-like helicase
recb-like protein
small terminase subunit
capsid protein
virion protein
major tail subunit
tapemeasure protein
virion protein
minor tail subunit
virion protein
large terminase subunit
virion protein
integrase
repressor
hth binding protein
dnaq
rece like protein
rect
putative hth binding protein
ruvc
whib
parb-like protein
capsid protein
major tail subunit
queuine-trna ribosyltransferase
tapemeasure protein
minor tail subunit
minor tail subunit
minor tail subunit
terminase small subunit
hth binding protein
terminase large subunit
putative holin
type iii restriction helicase
primase/helicase
ruvc
pe/ppe-like protein
terminase large subunit
scaffold protein
capsid protein
tail protein
minor tail subunit
minor tail subunit
para
hth dna binding protein
terminase sma

helix-turn-helix dna binding protein
scaffold protein
terminase small subunit
minor tail subunit
cytidine deaminase
hth dna binding domain protein
terminase large subunit
arda-like antirestriction protein
helix-turn-helix dna-binding domain protein
parb-like nuclease domain protein
parb-like nuclease domain protein
adp-ribosyltransferase domain and muf-like fusion protein
helix-turn-helix dna binding protein
helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
terminase large subunit
terminase small subunit
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
dna recombinase
terminase small subunit
helix-turn-helix dna binding domain protein
terminase large subunit
integrase
rele-like toxin
relb-like antitoxin
cro protein
helix-turn-helix dna binding domain protein
mpme 1 protein
merr-like hth

helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
hu-like domain dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
hica-like toxin
head-to-tail connector protein
helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
ribbon-helix-helix dna binding domain protein
transcriptional regulator
queuine-trna ribosyltransferase
quec-like queosine biosynthesis protein
qued-like queosine biosynthesis protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
quee-like queosine biosynthesis protein
endolysin, l-ala-d-glu peptidase domain
endolysin, n-acetylmuramoyl-l-alanine amidase domain
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding doma

helix-turn-helix dna binding protein
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lysm domain protein
terminase small subunit
terminase large subunit
lysin
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
head-to-tail connector protein
head-to-tail connector protein
terminase small subunit
major tail subunit
terminase large subunit
toxin
integrase
maze antitoxin
hth dna-binding domain protein
hth dna-binding protein
resolvase
hth dna-binding domain protein
head-to-tail connector protein
head-to-tail connector protein
terminase small subunit
lysm-like endolysin
terminase large subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
head-to-tail connector complex protein
dna recombinase
parb-like nuclease domain protein
helix-turn-he

membrane domain protein
dna recombinase
recb-like exonuclease
helix-turn-helix dna binding protein
dnaq-like dna polymerase iii subunit
paps reductase-like protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
capsid morphogenesis protein
helix-turn-helix dna binding domain protein
dnaj-like chaperonin
asc-1 transcription coactivator
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
parb-like nuclease domain protein
ribbon-helix-helix dna binding protein
hica-like toxin
dna recombinase
parb-like nuclease domain protein
queuine-trna ribosyltransferase
quec-like queosine biosynthesis protein
qued-like queosine biosynthesis protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
quee-like queosine biosynthesis protein
ribbon-helix-helix dna binding domain protein
hica-like toxin
dna recombinase
d-ala-d-ala carb

terminase small subunit
terminase large subunit
integrase
integrase
helix-turn-helix dna binding protein
dna binding domain protein
helix-turn-helix dna binding protein
ssdna binding domain protein
dna binding domain protein
helix-turn-helix dna binding protein
terminase large subunit
head-to-tail connector
terminase small subunit
integrase
helix-turn-helix dna-binding domain protein
dsdna helicase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
dnaj-like chaperonin
helix-turn-helix dna binding domain protein
integrase
helix-turn-helix dna binding domain protein
merr-like hth dna binding protein
dna polymerase iii subunit
terminase small subunit
terminase large subunit
rusa-like resolvase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
hicb-like antitoxin
hica-like toxin
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-h

lysm-like endolysin
dnae-like dna polymerase iii
head-to-tail connector protein
integrase
helix-turn-helix dna binding domain protein
dsdna helicase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna polymerase iii sliding clamp
dna polymerase iii subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
rect-like ssdna binding protein
helix-turn-helix dna binding protein
terminase small subunit
head-to-tail connector protein
head-to-tail connector protein
major tail subunit
terminase large subunit
integrase
hth dna binding protein
hth dna binding domain protein
dna polymerase iii beta subunit
dna polymerase iii beta subunit
hth dna binding domain protein
resolvase
hth dna binding 

lysm-like endolysin
merr-like helix-turn-helix dna binding domain protein
dnae-like dna polymerase iii
hica-like toxin
helix-turn-helix dna binding protein
rect-like ssdna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
terminase large subunit
rect-like ssdna binding protein
head-to-tail connector protein
dcmp deaminase
dnab-like helicase
helix-turn-helix dna binding domain protein
dna bridging protein
integrase
dna polymerase iii alpha
dnmp kinase
terminase large subunit
peptidyl-trna hydrolase
d-ala d-ala carboxypeptidase
dnaj-like chaperonin
rusa-like resolvase
pura-like adenylosuccinate synthetase
band-7-like membrane protein
helix-turn-helix dna-binding domain protein
ribbon-helix-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
parb-like nuclease domain protein
lysm-like endolysin
endonuclease
tapemeasure
baseplate j
d-ala-d

parb-like nuclease domain protein
n-acetyltransferase
parb-like nuclease domain protein
beta-lactamase
dnaj-like chaperonin
rusa-like resolvase
nicotinamide riboside transporter
pura-like adenylosuccinate synthetase
band-7-like membrane protein
nucleotidyltransferase
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
lysm-like endolysin
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
dnaq-like endonuclease
integrase
helix-turn-helix dna binding domain protein
g-i-y-y-i-g endonuclease
g-i-y-y-i-g endonuclease
terminase small subunit
terminase large subunit
erf family ssdna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
dna poly

terminase large subunit
hth dna binding protein
terminase small subunit
o-methyl transferase
major tail subunit
d-ala-d-ala-carboxypeptidase
lysa
lysb
hth dna binding protein
dna pol iii beta subunit
dna binding domain protein
ku protein
endo vii protein
parb-like protein
aaa atpase
capsid & capsid maturation protease
dna polymerase/primase
atp-dependent helicase
terminase large subunit
recb-like exonuclease
hth dna binding domain protein
head-to-tail connector complex protein
head-to-tail connector complex protein
head-to-tail connector complex protein
tail fiber protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
helix-turn-helix dna binding domain protein
erf family ssdna binding protein
terminase small subunit
ssdna-binding protein
rusa-like resolvase
terminase large subunit
helix-turn-helix dna binding domain protein
ribbon-helix-helix dna binding domain protein
d-ala-d-ala-carboxypeptidase
terminase large subunit
exonuclease/h

lysm-like endolysin
helix-turn-helix dna binding domain protein
dnae-like dna polymerase iii
helix-turn-helix dna binding domain protein
dnaq-like dna polymerase iii subunit
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
terminase large subunit
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding protein
terminase large subunit
terminase small subunit
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding protein
terminase small subunit
terminase large subunit
integrase
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rect-like ssdna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
helix-turn-helix dna binding domain protein
nucleotidyltransferase
cro protein
terminase large subunit
erf family ssdna binding protein
rusa-like resolvase
dnaq-like dna polymera

terminase large subunit
cro protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
helix-turn-helix dna binding domain protein
terminase large subunit
dnaq-like exonuclease
integrase
rele-like toxin
relb-like antitoxin
cro protein
helix-turn-helix dna binding domain protein
mpme 1 protein
merr-like helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
helix-turn-helix dna binding protein
hica-like toxin
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
tapemeasure protein
integrase
helix-turn-helix dna binding protein
dsdna helicase
helix-turn-helix dna binding protein
terminase small subunit
helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
queuine-trna ribosyltransferase
helix-turn-hel

terminase small subunit
endonuclease
hth dna binding protein
queuine trna-ribosyltransferase
hth dna binding protein
integrase
cro protein
terminase small subunit
hth dna binding protein
dna polymerase iii subunit
terminase large subunit
resolvase
hth dna binding protein
hth dna binding protein
dna binding domain protein
terminase small subunit
terminase large subunit
head-to-tail connector protein
tapemeasure protein
dna polymerase iii alpha subunit
5'-nucleotidase
toxin
antitoxin
arda-like antirestriction protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rect-like ssdna binding protein
helix-turn-helix dna binding domain protein
nucleotidyltransferase
head-to-tail connector protein
dsdna helicase
dna bridging protein
integrase
dna polymerase iii alpha
terminase large subunit
terminase large subunit
terminase small subunit
helix-turn-helix dna-binding p

dnaq-like dna polymerase iii subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
paps reductase-like domain protein
parb-like nuclease domain protein
helix-turn-helix dna binding protein
terminase small subunit
major tail subunit
terminase large subunit
integrase
xis
rece
rect
ftsk
rusa
hth dna binding protein
hth dna binding protein
hth dna binding protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
ribbon-helix-helix dna binding domain protein
terminase small subunit
terminase large subunit
dna polymerase iii subunit
cro protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
minor tail subunit
minor tail subunit
minor tail subunit
d-ala-d-ala carboxypeptidase
integrase
rdf protein
endovii
nrdh
dnab-like helicase
dnaj-like protein
terminase small subunit
termin

queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
hth dna binding protein
hnh homing endonuclease
dnaq-like exonuclease
reca
hth dna binding protein
head-tail connector
major tail subunit
tapemeasure
d-ala-d-ala carboxypeptidase
hth dna binding protein
nrdh-redoxin
integrase
dna binding domain protein
terminase small subunit
erf-like protein
dnab-like helicase
terminase large subunit
hth dna binding protein
whib
clp protease
primase/polymerase
whib
terminase large subunit
prohead protease
capsid
major tail subunit
minor tail subunit
minor tail subunit
minor tail subunit
d-ala-d-ala carboxypeptidase
minor tail subunit
minor tail subunit
lysa
lysb
dna polymerase iii beta subunit
ku protein
parb-like protein
aaa atpase
capsid protein
head-to-tail connector protein
tapemeasure protein
tail protein
tail protein
helix-turn-helix dna binding domain protein
nrdh glutaredoxin
terminase small subunit
dnaq-like dna polymerase i

terminase large subunit
merr-like helix-turn-helix dna binding protein
major tail subunit
tapemeasure
intergrase
cro
whib
dnaq
nrdh
primase
rusa
sprt
portal
rtcb
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
helx-turn-helix dna binding protein
hica-like toxin
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
tapemeasure protein
terminase large subunit
dna exonuclease
dsdna helicase
helix-turn-helix dna binding protein
n-acetyltransferase
d-ala-d-ala carboxypeptidase
rna binding domain protein
dnaj-like chaperonin
ssdna binding domain protein
dna polymerase iii (alpha)
rusa-like resolvase
galactosaminyltransferase
adenylosuccinate synthetase
helix-turn-helix dna binding protein
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain p

cytidine deaminase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
terminase large subunit
helix-turn-helix dna binding protein
terminase small subunit
helix-turn-helix dna-binding domain protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
hth dna binding domain protein
parb-like nuclease domain protein
dnaq-like exonuclease
hth dna binding domain protein
head-to-tail connector protein
head-to-tail connector protein
terminase large subunit
antitoxin
toxin
hth dna binding protein
integrase
hth dna binding protein
hth dna binding protein
resolvase
head-to-tail connector protein
head-to-tail connector protein
head-to-tail connector protein
helix-turn-helix dna binding domain protein
arda-like antirestriction protein
helix-turn-helix dna binding protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding protein
helix-tur

d-ala-d-ala carboxypeptidase
rna binding domain protein
dnaj-like chaperonin
ssdna binding domain protein
dna polymerase iii (alpha)
rusa-like resolvase
galactosaminyltransferase
adenylosuccinate synthetase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lysm domain protein
terminase small subunit
major tail subunit
tapemeasure protein
terminase large subunit
rece
rect
ruvc
dna polymerase iii sliding clamp
dna polymerase iii subunit
helix-turn-helix dna binding protein
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
hica-like toxin
head-to-tail connector complex protein
head-to-tail connector complex protein
head-to-tail connector complex protein
integrase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
terminase small subunit
helix-turn-

helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna recombinase
helix-turn-helix dna binding domain protein
dnaq-like exonuclease
terminase small subunit
cas4-like exonuclease
erf family ssdna binding protein
terminase large subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
terminase large subunit
helix-turn-helix dna-binding domain protein
terminase small subunit
head-to-tail connector complex protein
terminase large subunit
lipase
brnt-like toxin
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
dnaq-like exonuclease
hth dna binding domain protein
head-to-tail connector complex protein
head-to-tail connector complex protein
dna recombinase
terminase small subunit
majjor tail protein
tail asssembly chaperone
terminase large subunit
capsid matuarion protease
rec-e-like exonuclease
rect-like ssdna binging protein
asc-1 transcription coactivator
membraine protein
terminase large sub

helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
dna polymerase iii subunit
recombinase
head-to-tail connector protein
d-ala-d-ala carboxypeptidase
helix-turn-helix dna binding domain protein
dna bridging protein
terminase small subunit
integrase
erf family ssdna binding protein
terminase large subunit
dsdna helicase
helix-turn-helix dna binding protein
putative major tail protein
putative tail assembly protein
putative tail assembly protein
putative tape measure protein
putative large terminase subunit
putative lysin
putative holin
putative portal protein
putative integrase
putative structural protein
putative whib family transcription factor
putative terminase
putative structural protein
putative structural protein
putative methyltransferase
putative major tail protein
putative tail assembly protein
putative tail assembly protein
putative tape measure protein
putative peptidase
putative lysin
putative lysin


dnaj-like chaperonin
dnae-like dna polymerase iii
rusa-like resolvase
pura-like adenylosuccinate synthetase
band-7-like membrane protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
lysm-like endolysin
terminase large subunit
head-to-tail connector protein
head-to-tail connector protein
tapemeasure protein
helix-turn-helix dna binding domain protein
terminase small subunit
pre-tmp frameshift protein
pre-tmp frameshift protein
tail protein
galactose binding protein
integrase
heat shock protein
dna recombination protein
endonuclease
terl
prohead protease
capsid protein
main tail protein
tail protein
tail protein
putative lysin
terminase large subunit
tail fiber protein
vrr-nuc domain containing protein
putative deaminase
putative cutinase
phage-encoded dctp pyrophosphatase
guanylate

helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
terminase small subunit
major tail subunit
d-ala-d-ala carboxypeptidase
terminase large subunit
dna polymerase iii subunit
integrase
hth dna-binding-domain protein
whib
hindiii-like dna methylase
merr-like hth dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding domain protein
terminase large subunit
seine integrase
terminase small subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
methytransferase
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
primase/polymerase
tapemeasure
d-ala-d-ala carboxypeptidase
asp-rich protein
nucleotidase
fabg-like protein
dna polymerase iii alpha subunit
thyx
terminase large subunit
helix-turn-helix dna-binding domain p

dsdna helicase
helix-turn-helix dna binding domain protein
terminase large subunit
dnaq-like dna polymerase iii subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
erf family ssdna binding protein
dnab-like dna helicase
helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
integrase
merr-like helix-turn-helix dna binding domain protein
dna polymerase iii subunit
helix-turn-helix dna binding domain protein
d-ala-d-ala carboxypeptidase
helix-turn-helix dna binding domain protein
integrase
terminase small subunit
erf family ssdna binding protein
terminase large subunit
helix-turn-helix dna binding domain protein
major tail subunit
tail chaperone scaffold protein
tail chaperone scaffold protein
tapemeasure protein
integrase
repressor
rece
rect
ruvc
mpme1 protein
scaffold protein
hnh protein
capsid protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix

helix-turn-helix dna binding domain protein
erf family ssdna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
helix-turn-helix dna binding domain protein
d-ala-d-ala carboxypeptidase
helix-turn-helix dna binding domain protein
integrase
terminase small subunit
erf family ssdna binding protein
terminase large subunit
helix-turn-helix dna binding protein
terminase large subunit
head-to-tail connector protein
tapemeasure protein
integrase
terminase small subunit
mre11 double strand break endo/exonuclease
dsdna helicase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dnaj-like chaperonin
dna recombinase
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
major capsid subunit
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
hica

helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
ribbon-helix-helix dna binding protein
hica-like toxin
merr-like helix-turn-helix dna binding domain protein
recb-like exonuclease
head-to-tail connector complex protein
dnaq-like exonuclease
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dnaj-like chaperone
helix-turn-helix dna binding domain pro

hth dna binding domain protein
rdf protein
esterase/lipase
dnab-like helicase
repressor
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
hica-like toxin
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
lysm-like endolysin
rect-like ssdna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna-binding domain protein
lysm-like peptidoglycan-binding protein
rect-like ssdna binding protein
helix-turn-helix dna-binding domain protein
ssdna-binding protein
helix-turn-helix dna-binding domain protein
rusa-like resolvase
helix-turn-helix dna-binding domain protein
dna polymerase iii sliding clamp
terminase small subunit
terminase large subunit
rect-like ssdna bin

integrase
rect-like ssdna binding protein
mpme1 protein
terminase large subunit
terminase small subunit
integrase
helix-turn-helix dna binding domain protein
dna directed rna polymerase subunit alpha
helix-turn-helix dna binding protein
sprt-like protein
terminase large subunit
parb
hth dna binding domain protein
terminase small subunit
endovii
sprt-like protein
rusa-like resolvase
dna polymerase iii sliding clamp
dna polymerase iii subunit
helix-turn-helix dna binding protein
parb-like nuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding domain protein
terminase large subunit
terminase large subunit
putative head assembly protein
scaffold protein
capsid protein
whib/hnh endonuclease
tail scaffold protein
tail scaffold protein
tapemeasure protein
putative tail protein
tail fiber-like protein
dna topoisomerase-primase
endovii
dnab-like helicase
helix-turn-helix dna bi

helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
cro protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
integrase
helix-turn-helix dna binding domain protein
dna-directed rna polymerase subunit alpha
ribonuclease reductase
helix-turn-helix dna binding protein
sprt-like protein
terminase small subunit
helix-turn-helix dna binding domain protein
terminase large subunit
dna polymerase iii subunit
integrase
cro protein
hth dna binding protein
hth dna binding protein
dna recombinase
dna helicase/methylase
hth dna binding protein
dna cytosine methylase
dna cytosine methyla

erf family ssdna binding protein
rusa-like resolvase
dna polymerase iii subunit
dpra-like single stranded dna binding protein
dna recombinase
integrase
lysin a, peptidase domain
lysina, hydrolase domain
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
helix-turn-helix dna binding domain protein
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
helx-turn-helix dna binding protein
hica-like toxin
terminase small subunit
terminase large subunit
helix-turn-helix dna binding protein
cro protein
parb-like nuclease domain protein
lamd-like capsid decoration protein
ribbon-helix-helix dna binding protein
rusa-like resolvase
rusa-like resolvase
dna polymerase iii sliding clamp
dna polymerase iii subunit
helix-turn-helix dna binding protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding

terminase small subunit
nucleotidyltransferase
terminase large subunit
cro protein
band-7-like membrane protein
erf family ssdna binding protein
rusa-like resolvase
hnh domain protein
terminase large subunit
scaffold protein
terminase small subunit
tapemeasure protein
integrase
cytidine deaminiase
dna polymerase iii
hth domain protein
rdf protein
dnab-like helicase
repressor
dna methyltransferase
sprt
terminase large subunit
helix-turn-helix dna binding domain protein
acetyltransferase domain protein
terminase small subunit
terminase large subunit
terminase small subunit
major tail subunit
tapemeasure protein
terminase large subunit
integrase
repressor
rece
rect
scaffold protein
ruvc
hnh protein
capsid protein
helicase loader
dsdna helicase
dnaj-like chaperonin
dna polymerase iii (alpha)
dna recombinase
resolvase
pura-like adenylosuccinate synthetase
nucleotidyltransferase
band-7-like membrane protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain prote

ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lysm domain protein
terminase large subunit
putative head assembly protein
scaffold protein
capsid protein
whib/hnh endonuclease
tail scaffold protein
tail scaffold protein
tapemeasure protein
putative tail protein
tail fiber-like protein
dna topoisomerase-primase
endovii
dnab-like helicase
repressor
head-to-tail connector protein
head-to-tail connector protein
head-to-tail connector protein
vip2-like toxin
dna binding domain protein
dsdna partitioning protein
dsdna partitioning protein
helix-turn-helix dna binding protein
esterease
dsdna helicase
capsid morphogenesis protein
hydroxylase
helix-turn-helix dna binding domain protein
dnaj-like chaperonin
dnae-like dna polymerase iii
rusa-like resolvase
pura-like adenylosuccinate synthetase
band-7-like membrane protein
helix-turn-helix dna binding domain protein
ribbon-he

dna polymerase iii sliding clamp
helix-turn-helix dna binding domain protein
rect-like ssdna binding protein
rusa-like resolvase
parb-like nuclease domain protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
dna polymerase iii subunit
rect-like ssdna binding protein
terminase large subunit
terminase small subunit
helix-turn-helix dna binding domain protein
head-to-tail connector complex protein
head-to-tail connector protein
helix-turn-helix dna binding domain protein
rna polymerase sigma subunit
dpra-like dna processing chain
imm-like protein
major tail subunit
tapemeasure protein
queuine trna-ribosyltransferase
integrase
hth dna binding domain protein
whib
dnaq-like protein
rusa-like protein
terminase large subunit
sprt-like protein
hnh endonuclease domain protein
chitosanase
head-to-tail connector complex protein
terminase large subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna

dnaj-like chaperonin
helix-turn-helix dna binding domain protein
rusa-like resolvase
helix-turn-helix dna binding domain protein
head-to-tail connector complex protein
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
hica-like toxin
tail protein
integrase
hth dna binding protein
tail protein
hth dna binding protein
helix-turn-helix dna binding domain protein
esterase/lipase
dsdna helicase
parb-like nuclease domain protein
helix-turn-helix dna binding protein
exonuclease domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
endolysin, l-ala-d-glu peptidase domain
endolysin, n-acetylmuramoyl-l-alanine amidase domain
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna bindi

helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
rect-like ssdna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
rect-like ssdna binding protein
helix-turn-helix dna binding protein
dna polymerase iii subunit
helix-turn-helix dna binding protein
terminase small subunit
terminase large subunit
dnaq-like exonuclease
integrase
endonuclease
mpme2 protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
dna polymerase iii subunit
rect-like ssdna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domai

terminase large subunit
integrase
terminase small subunit
helix-turn-helix dna binding domain protein
rna polymerase sigma
tail protein
helix-turn-helix dna binding domain protein
integrase
cro protein
hica-like toxin
helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
head-to-tail connector
head-to-tail connector
head-to-tail connector
integrase
helix-turn-helix dna binding protein
dna binding domain protein
helix-turn-helix dna binding domain protein
esterase/lipase
helix-turn-helix dna binding protein
dna binding domain protein
helix-turn-helix dna binding protein
dna polymerase iii subunit
dna recombinase
helix-turn-helix dna binding domain protein
head-to-tail connector protein
tapemeasure protein
helix-turn-helix dna binding domain protein
dna bridging protein
terminase small subunit
membrane domain protein
integrase
erf family ssdna binding protein
terminase large subunit
dsdna helicase
helix-turn-helix dna binding domain protein
helix-tur

helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
lysin
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
head-to-tail connector protein
head-to-tail connector protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
scaffold protein
putative structural protein
tapemeasure protein
putative structural protein
tail protein
putative structural protein
putative structural protein
integrase
putative structural protein
endovii
putative structural protein
dnab-like helicase
putative structural protein
rna terminal phosphate cyclase
tyrosine phosphatase
integrase
terminase small subunit
terminase large subunit
dna bridging protein
nucleotidyltransferase
dsdna helicase
dna polymerase iii subunit
terminase large subunit
helix-turn-helix dna binding domain

helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
lipase
integrase
helix-turn-helix dna binding protein
integrase
terminase small subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
parb-like dna binding protein
terminase large subunit
hth dna binding protein
head-to-tail connector protein
dcmp deaminase
dnab-like helicase
transcriptional regulator
lsr2 dna bridging protein
integrase
dna polymerase iii alpha
dnmp kinase
terminase large subunit
n-acetyltransferase
d-ala-d-ala carboxypeptidase
dnaj-like chaperonin
dna polymerase iii (alpha)
rusa-like resolvase
galactosaminyltransferase
adenylosuccinate synthetase
nucleotidyltransferase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lysm domain protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
tapeme

acetyltransferase domain protein
atp-dependent helicase
helix-turn-helix dna binding domain protein
dna polymerase iii sliding clamp
dnae-like dna polymerase iii
thimidylate synthase
terminase small subunit
terminase large subunit
dnaq-like exonuclease
integrase
cro protein
dna recombinase
merr-like hth dna binding protein
rusa-like resolvase
dna polymerase iii sliding clamp
dna polymerase iii subunit
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
dnaq-like exonuclease
reca-like recombinase
major tail subunit
integrase
terminase small subunit
erf family ssdna binding protein
terminase large subunit
hth dna binding domain protein
terminase small subunit
merr-like helix-turn-helix dna binding protein
terminase large subunit
head-to-tail connector complex protein
head-to-tail connector complex protein
endonuclease
helix-turn-helix dna binding domain protein
terminase small subunit
helix-turn-helix dna binding domain protein
integrase
terminase large subunit
term

merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
terminase small subunit
cytidine deaminase
hth binding domain protein
major tail subunit
tapemeasure
integrase
terminase small subunit
cro protein
dna polymerase iii
dnaq-like exonuclease
nrdh
endonuclease
endonuclease
primase/polymerase
rusa
rtcb
terminase large subunit
terminase small subunit
tapemeasure protein
integrase
helix-turn-helix dna binding domain protein
hth dna binding protein
hth dna binding protein
helix-turn-helix dna binding domain protein
lipase
terminase small subunit
major tail subunit
terminase large subunit
hth dna-binding domain protein
dnaq-like exonuclease
integrase
hth protein
hth dna-binding protein
dna recombinase
hth

dsdna helicase
endonuclease
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
dna polymerase/primase
drpa-like dna processing chain a
endolysin, protease m23 domain
terminase large subunit
helix-turn-helix dna binding domain protein
terminase small subunit
parb-like nuclease domain protein
integrase
rect-like ssdna binding protein
dna-directed rna polymerase subunit
relb-like antitoxin
rele-like toxin
dna recombinase
terminase large subunit
head-to-tail connector protein
terminase small subunit
integrase
hth dna binding protein
hth dna binding protein
thyx
hth dna binding protein
endonuclease
phosphoribosyltransferase
dnab-like helicase
dcmp deanimase
dnab-like helicase
terminase large subunit
type 1 restriction enzyme
rect-like ssdna binding protein
tail assembly protein
tapemeasure protein
minor tail
membrane domain protein
helix-turn-helix dna binding protein
integrase
terminase small subunit
cro protein
helix-turn-helix dna binding pr

pnk
whib
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helicase subunit
terminase small subunit
major tail subunit
hnh homing endonuclease
tapemeasure
terminase large subunit
d-ala d-ala carboxypeptidase
hth dna-binding protein
dna-q like exonuclease
integrase
repressor
cro protein
xis
whib
whib
hth dna-binding protein
hth dna-binding protein
trna methyl transferase
giy-yig homing endonuclease
hth dna-binding protein
clp protease
dnaq-like exonuclease
reca
hnh endonuclease domain protein
major tail subunit
hnh endonuclease domain protein
tapemeasure
d-ala-d-ala carboxypeptidase
nrdh-like protein
lsr2
terminase small subunit
integrase, (y-int)
pentapeptide repeat family protein
terminase large subunit
erf protein
hnh endonuclease domain protein
dnab-like helicase
pnk
whib
terminase small subunit
head-to-tail connector
major tail subunit
terminase large subunit
integrase
whib
rece
rect
dna methyltransferase
hth dna

lysm-like endolysin
endonuclease
helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
queuine-trna ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
cro protein
terminase small subunit
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
terminase large subunit
resolvase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dsdna helicase
terminase large subunit
hicb-like antitoxin
hica-like toxin
rect-like ssdna binding protein
helicase loader
dna polymerase iii alpha subunit
putative rna polymerase sigma-70 factor
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
glycohydrolase
integrase
lysm domain

terminase small subunit
terminase large subunit
integrase
rect-like ssdna binding protein
mpme 2 protein
head-to-tail connector complex protein
hica-like toxin
tail protein
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
tapemeasure protein
minor tail subunit
minor tail subunit
baseplate j
hnh endonuclease domain protein
peptidyl-trna hydrolase domain protein
d-ala-d-ala carboxypeptidase
dnac-like protein
dnab-like helicase
dnaj-like domain protein
dna polymerase iii alpha subunit
release factor 1
reca
hnh endonuclease domain protein
rusa-like endodeoxyribonuclease
histidine triad domain protein
rna binding trove domain protein
glycosyl transferase
glycosyl transferase
galactosyl transferase
glycosyl transferase
pura
nrdc-like protein
band 7 protein
hth dna binding domain protein
parb-like protein
thyx-like protein
lysm domain protein
zinc-finger dna binding domain protein
queuine trna-ribosyltransferase
helix-turn-helix dna binding

helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
ssb protein
clp
polymerase ii
reca
capsid subunit
major tail subunit
endovii
tapemeasure protein
ppe protein
nrdh
lsr2-like protein
integrase
small terminase
dnab
terminase large subunit
pnk
whib
endonuclease
dnaj-like chaperonin
ssdna-binding protein
dnae-like dna polymerase iii subunit alpha
lysin a, peptidase domain
helix-turn-helix dna binding protein
hica-like toxin
helix-turn-helix dna binding protein
dna recombinase
terminase large subunit
terminase small subunit
integrase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lipase
terminase large subunit
hicb-like antitoxin
hica-like toxin
rect-like ssdna binding protein
terminase small subunit
terminase large subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain 

nrdh
lsr2
integrase
erf
dnab
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
phosphodiesterase
helix-turn-helix dna binding domain protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
rusa-like resolvase
helix-turn-helix dna binding domain protein
major tail subunit
tapemeasure protein
putative exo
putative excisionase
integrase
head assembly protein
whib
scaffold protein
hnhc protein
capsid protein
head-to-tail connector complex protein
head-to-tail connector complex protein
lysm-like endolysin
dsdna helicase
endonuclease
helix-turn-helix dna binding domain protein
dna bridging protein
terminase small subunit
integrase
thyx thymidylate synthase
dna polymerase iii alpha subunit
terminase large subunit
dnae-like dna polymerase iii
integrase
terminase small subunit
terminase large subunit
holiday junctio

queuine trna-ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding protein
chitosanase
tapemeasure protein
endoysin
dna polymerase/primase
terminase large subunit
atp-dependent helicase
hth dna binding domain protein
dnaq-like dna polymerase iii subunit
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
erf family ssdna binding protein
terminase large subunit
dnab-like dna helicase
helix-turn-helix dna binding domain protein
capsid maturation protease and minor capsid fusion protein
integrase
helix-turn-helix dna binding protein
recombination directionality factor protein
nrdh-like protein
dnab-like helicase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
dnaq-like exonuclease
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
dna polymerase iii 

nucleotidyltransferase
terminase large subunit
erf family ssdna binding protein
rusa-like resolvase
dnaq-like dna polymerase iii subunit
integrase
helix-turn-helix dna binding domain protein
dsdna helicase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
lysm-like endolysin
helix-turn-helix dna binding domain protein
dnae-like dna polymerase iii
terminase small subunit
terminase large subunit
terminase large subunit
integrase
transcriptional regulator
terminase small subunit
head-to-tail connector complex protein
head-to-tail connector complex protein
terminase large subunit
relb-like antitoxin
rele-like toxin
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
rusa-like resolvase
helix-turn-helix dna binding domain protein
head-to-tail connector complex protein
helix-turn-helix dna binding do

terminase large subunit
helix-turn-helix dna binding protein
terminase large subunit
helix-turn-helix dna-binding domain protein
terminase small subunit
terminase small subunit
terminase large subunit
integrase
terminase small subunit
helix-turn-helix dna binding domain protein
terminase large subunit
head-to-tail connector complex protein
head-to-tail connector complex protein
head-to-tail connector complex protein
integrase
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
hth dna-binding protein
parb-like nuclease domain protein
dnaq-like exonuclease
hth dna-binding protein
head-to-tail connector complex protein
head-to-tail connector complex protein
dna recombinase
terminase small subunit
terminase large subunit
hicb-like antitoxin
rect-like ssdna binding protein
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
m

d-ala-d-ala carboxypeptidase
rna binding domain protein
dnaj-like chaperonin
ssdna binding domain protein
dna polymerase iii (alpha)
rusa-like resolvase
galactosaminyltransferase
adenylosuccinate synthetase
nucleotidyltransferase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lysm-domain protein
homing endonuclease
atp-dependent helicase
helix-turn-helix dna-binding protein
terminase large subunit
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
helix-turn-helix dna binding protein
d-ala-d-ala carboxypeptidase
helix-turn-helix dna binding domain protein
integrase
terminase small subunit
erf family ssdna binding protein
helix-turn-helix dna binding protein
n-acetyltransferase
d-ala-d-ala carboxypeptidase
rna binding domain protein
dnaj-like chaperonin
ssdna binding domain protein
dna polymerase iii (alpha)
rusa-like resolvase
galactosaminyltransferase


d-ala-d-ala carboxypeptidase
rna binding domain protein
dnaj-like chaperonin
ssdna binding domain protein
dna polymerase iii (alpha)
rusa-like resolvase
galactosaminyltransferase
adenylosuccinate synthetase
nucleotidyltransferase
helix-turn-helix dna binding protein
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
lysm domain protein
terminase small subunit
terminase large subunit
erf family ssdna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
scaffold protein
cytidine deaminase
hth dna binding domain protein
helix-turn-helix dna-binding domain protein
drpa-like ssdna binding protein
terminase large subunit
repressor
arda-like antirestriction protein
terminase large subunit
terminase small subunit
integrase
helix-turn-helix dna bindi

homing endonuclease
amidase
n-acetyltransferase
d-ala-d-ala carboxypeptidase
dnaj-like chaperonin
ssdna binding domain protein
dna polymerase iii (alpha)
rusa-like resolvase
galactosaminyltransferase
adenylosuccinate synthetase
nucleotidyltransferase
helix-turn-helix dna binding protein
ribbon-helix-helix dna binding domain protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
lysm-like domain protein
head decoration protein
terminase small subunit
terminase large subunit
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna polymerase iii sliding clamp
dna polymerase iii sliding clamp
rect-like ssdna binding protein
rusa-like resolvase
head-to-tail connector complex protein
helix-turn-helix dna binding domain protein
p60 lipoprotein
dnac
dna polymerase iii alpha subunit
rf1 domain protein
reca
rusa
ro protein
pnk
pura
nrdc
thyx
tapemeasure protein
helix-turn-helix dna bindi

queuine trna-ribosyltransferase
glucosaminyl eacetylase
scaffold protein
tape measure
dcmp deaminase
dnab-like helicase
lsr2-like protein
integrase
thyx
dna polymerase iii subunit alpha
dnmp kinase
hnh domain protein
terminase large subunit
lamd-like capsid decoration protein
helix-turn-helix dna binding protein
dnae-like dna polymerase iii
hica-like toxin
tapemeasure protein
helix-turn-helix dna binding domain protein
integrase
cro protein
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
terminase small subunit
endonuclease
terminase large subunit
helix-turn-helix dna binding domain protein
membrane domain protein
hicb-like antitoxin
hica-like toxin
terminase large subunit
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
terminase large subunit
tapemeasure protein
dcmp deaminase
terminase sm

adp-ribosyltransferase domain and muf-like fusion protein
helix-turn-helix dna binding protein
terminase large subunit
clp-like protease
helix-turn-helix dna binding domain protein
dna polymerase iii subunit
helix-turn-helix dna binding domain protein
d-ala-d-ala carboxypeptidase
helix-turn-helix dna binding domain protein
integrase
terminase small subunit
erf family ssdna binding protein
helix-turn-helix dna binding domain protein
integrase
capsid protein
terminase small subunit
terminase large subunit
dnaq-like dna polymerase iii subunit
cro protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
capsid & capsid maturation protease
hnh endonuclease domain protein
dna polymerase/primase
terminase large subunit
atp-dependent helicase
hnh endonuclease domain protein
hnh endonuclease domain protein
recb-like exonuclease
hth dna binding domain protein
dna recombinase
rusa-like resolvase
dna polymerase iii 

helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
head-to-tail connector protein
tail protein
tail protein
integrase
hth dna binding protein
dcmp deaminase
hth dna binding protein
putative sigma factor
metallophsphatase
recombination endonuclease vii
esterase/lipase
dnab-like helicase
tapemeasure protein
cytidine deaminase
hth dna binding domain protein
arda-like antirestriction protein
terminase large subunit
helix-turn-helix dna binding domain protein
dehalogenase
capsid morphogenesis protein
head-to-tail connector complex protein
integrase
integrase
cro protein
rect-like ssdna binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
hth dna binding protein
terminase large subunit
lysm-like peptidoglycan-binding protein
baseplate j-like protein
rect-like ssdna binding protein
helix-turn-helix dna binding domain protein
ssdna-binding protein
helix-turn-helix dna binding domain protein
helix-turn-helix

6-pyruvoyl tetrahydropterin synthase
helix-turn-helix dna binding domain protein
hth dna binding protein
7-cyano-7-deazaguanosine
primase/helicase
replication regulatory protein
queuine trna-ribosyltransferase
integrase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
resolvase
helix-turn-helix domain protein
capsid maturation protein
helix-turn-helix dna-binding domain protein
helix-turn-helix dna-binding domain protein
band-7-like membrane protein
riia-like protein
riib-like protein
nucleotidyltransferase
lysm-like endolysin
nucleotidyltransferase
terminase small subunit
head-to-tail connector protein
minor tail subunit
cytidine deaminase
hth dna binding protein
recombination endonuclease vii
dnab-like helicase
terminase large subunit
dna polymerase iii alpha subunit
dna polymerase iii alpha subunit
dna recombinase
hth dna binding protein
band-7 like membrane protein
phosphohydrolase
riia-like protein
riib-like protein
helix-turn-helix dna binding domain prot

helix-turn-helix dna binding domain protein
rusa-like resolvase
helix-turn-helix dna binding domain protein
terminase small subunit
terminase large subunit
queuine trna-ribosyltransferase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
dna primase/helicae
ribbon-helix-helix dna binding domain oriteub
rnase e
terminase large subunit
terminase small subunit
helix-turn-helix dna-binding domain protein
terminase large subunit
arda-like antirestriction protein
dna polymerase iii subunit
dna recombinase
head-to-tail connector protein
head-to-tail connector protein
tape meaure protein
lsr2 dna bridging protein
integrase
terminase small subunit
erf family ssdna binding protein
dsdna helicase
terminase large subunit
clpp protease
terminase small subunit
terminase large subunit
helix-turn-helix dna binding protein
rect-like ssdna binding protein
queuine trna-ribosyltransferase
helix-turn-helix dna binding domain protein
hth dna binding domain protein
helic

dna polymerase iii
queuine trna-ribosyltransferase
minor tail subunit
helix-turn-helix dna binding domain protein
hth dna binding domain protein
helicase type iii subunit
hica-like toxin
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
merr-like helix-turn-helix dna binding domain protein
queuine trna-ribosyltransferase
minor tail subunit
minor tail subunit
minor tail subunit
minor tail subunit
helix-turn-helix dna binding domain protein
hth dna binding domain protein
rdf protein
pla2 protein
helix-turn-helix dna binding protein
rect-like ssdna binding protein
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
tail protein
tapemeasure
tail protein
integrase
hth dna binding
dcmp deaminase
collagen-like protein
hth dna binding
rdf protein
esterase/lipase
dnab-like helicase
repressor
terminase large subunit
head-to-tail connector protein
head-to-tail connector protein
tapemeasure protein
minor tail subunit
dcmp deaminase
hth dna binding prot

helix-turn-helix dna binding protein
dnae-like dna polymerase iii
hica-like toxin
helix-turn-helix dna binding protein
integrase
helix-turn-helix dna binding protein
helix-turn-helix dna binding protein
dna recombinase
helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
merr-like helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase small subunit
nucleotidyltransferase
terminase large subunit
cro protein
erf family ssdna binding protein
rusa-like resolvase
dnaq-like dna polymerase iii subunit
terminase large subunit
head-to-tail connector protein
head-to-tail connector protein
tapemeasure protein
helix-turn-helix dna binding domain protein
terminase small subunit
head-to-tail connector protein
integrase
helix-turn-helix dna binding domain protein
helix-turn-helix dna 

helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
parb-like nuclease domain protein
lysm-like endolysin
helix-turn-helix dna binding domain protein
queuine trna-ribosyltransferase
integrase
cro protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
whib
dnaq-like exonuclease
terminase small subunit
terminase large subunit
rusa-like resolvase
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
antitoxin
toxin
glycosylase/dna lyase domain protein
n-acetyl transferase
tail lysozyme/baseplate
peptidyl-trna hydrolase
aspartate aminotransferase
dsdna helicase
dnaj-like domain chaperonin
dna polymerase iii alpha
dna recombinase
resolvase
nicotinamide riboside transporter
hnh homing endonuclease
pura-like adenylosuccinate synthetase
band-7 like membrane protein
helix-turn helix dna binding prot

helix-turn-helix dna-binding domain protein
pura-like adenylosuccinate synthetase
helix-turn-helix dna-binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
helix-turn-helix dna binding domain protein
terminase large subunit
tapemeasure protein
merr-like hth dna-binding protein
terminase small subunit
merr-like hth dna-binding protein
n-acetyl transferase
tapemeasure
baseplate j
hnh endonuclease domain protein
peptidyl trna hydrolase domain protein
d-ala d-ala carboxypeptidase
argd
dnac
dnab-like helicase
dnaj
dna polymerase iii alpha subunit
if1 protein
reca
rusa
histidine triad domain protein
ro protein
putative casid decoration, structural
pnk
ser/thr kinase
pura
band 7 protein
nucleotidyltransferase
hth dna binding domain protein
parb-like protein
thyx-like protein
lysm
zinc-finger dna binding domain


In [7]:
# Distinct values found in `non_corresponding`, plus how many times each one occurs.
unique, counts = np.unique(non_corresponding, return_counts=True)

In [8]:
# Pair each distinct function string with its occurrence count ...
zipped = list(zip(unique, counts))

# ... and rank the pairs from most frequent to least frequent.
unapproved_functions = sorted(zipped, key=lambda pair: pair[1], reverse=True)

In [9]:
# Display the (function, count) pairs, highest count first.
unapproved_functions

[('helix-turn-helix dna binding domain protein', 2718),
 ('terminase large subunit', 1395),
 ('helix-turn-helix dna binding protein', 1230),
 ('terminase small subunit', 1051),
 ('integrase', 898),
 ('helix-turn-helix dna-binding domain protein', 525),
 ('head-to-tail connector protein', 384),
 ('rusa-like resolvase', 350),
 ('hth dna binding protein', 333),
 ('tapemeasure protein', 288),
 ('rect-like ssdna binding protein', 273),
 ('parb-like nuclease domain protein', 264),
 ('minor tail subunit', 259),
 ('dna polymerase iii subunit', 257),
 ('hth dna binding domain protein', 233),
 ('cro protein', 219),
 ('head-to-tail connector complex protein', 205),
 ('dnab-like helicase', 196),
 ('dsdna helicase', 183),
 ('major tail subunit', 177),
 ('d-ala-d-ala carboxypeptidase', 170),
 ('merr-like helix-turn-helix dna binding domain protein', 163),
 ('nucleotidyltransferase', 160),
 ('erf family ssdna binding protein', 156),
 ('queuine-trna ribosyltransferase', 156),
 ('queuine trna-ribosyltr

In [10]:
# Distinct values in `non_corresponding` (np.unique returns them sorted), without counts.
unique_func = np.unique(non_corresponding)

In [11]:
# Number of distinct entries in `non_corresponding`.
len(unique_func)

1482

In [12]:
import pickle

# Load the conversion table written by a previous session of this notebook.
# The `with` block guarantees the file handle is closed even if unpickling
# fails (the handle was previously left open).
# NOTE: pickle.load can execute arbitrary code while loading - only open
# conversion tables that were produced by this notebook / a trusted source.
with open("new_conversion_table.pkl", "rb") as a_file:
    conversion_table = pickle.load(a_file)

# Sanity check: how many entries were restored.
print(len(conversion_table))

0


In [13]:
# Uncomment to discard the loaded table and start again from an empty one:
# conversion_table = {}

In [27]:
# Progress report: walk over every entry of `non_corresponding` (duplicates
# included) and split them by whether the name already has an entry in
# `conversion_table`. Any entry counts as resolved, whatever its stored value.
count = 0
count_not_resolved = 0
for function_name in non_corresponding:
    if function_name not in conversion_table:
        count_not_resolved += 1
        continue
    count += 1
print("resolved protiens", count)
print("not resolved protiens",count_not_resolved)

resolved protiens 20165
not resolved protiens 1278


In [24]:
# Interactive resolution pass over the unapproved function names.
#
# For every name that has no entry in `conversion_table` yet, collect the
# approved functions annotated on genes that share a pham with it, list them
# ranked by frequency, and store the reviewer's pick in `conversion_table`
# (or -1 when "No visible choice" is selected). Names for which no approved
# function is found are left untouched. Interrupting the kernel stops the
# pass; decisions made so far stay in `conversion_table`.
approved_lookup = set(approved_functions)  # set membership instead of scanning the list for every gene
pham_approved_cache = {}  # pham -> approved functions of its genes, so each pham is filtered only once

for target_function, count in unapproved_functions:
    if target_function in conversion_table:
        continue  # already decided (possibly in an earlier, pickled session)

    # One list entry per gene carrying this function: a pham shared by several
    # such genes is deliberately tallied once per gene, as before.
    phams = list(df_genes[df_genes["Function"] == target_function]["Pham"])
    official_functions = {}
    for pham in phams:
        if pham not in pham_approved_cache:
            functions_pham = df_genes[df_genes["Pham"] == pham]["Function"]
            pham_approved_cache[pham] = [func for func in functions_pham if func in approved_lookup]
        for func in pham_approved_cache[pham]:
            official_functions[func] = official_functions.get(func, 0) + 1

    if not official_functions:
        continue  # nothing to offer for this name

    print("GIVEN:",target_function," --- ", count,"\n")
    choices = {-1:"skip"}
    print(-1,"---- No visible choice")
    ranked_functions = sorted(official_functions.items(), key=lambda item: item[1], reverse=True)
    for j, (func, frequency) in enumerate(ranked_functions, start=1):
        print(j,"--- Function:",func, "--- Frequency:", frequency)
        choices[j] = func

    # Keep asking until the answer is one of the listed numbers; a blank line,
    # a typo or an out-of-range number used to abort the whole session with
    # ValueError / KeyError.
    while True:
        x = input()
        try:
            selection = int(x)
        except ValueError:
            selection = None
        if selection in choices:
            break
        print("Please enter -1 or one of the listed numbers.")

    if selection != -1:
        conversion_table[target_function] = choices[selection]
    else:
        # -1 marks "reviewed, but no suitable approved function".
        conversion_table[target_function] = -1

GIVEN: dna replication protein  ---  3 

-1 ---- No visible choice
1 --- Function: dna primase --- Frequency: 527
-1
GIVEN: dna-directed rna polymerase subunit alpha  ---  3 

-1 ---- No visible choice
1 --- Function: rna polymerase sigma factor --- Frequency: 14
2 --- Function: helix-turn-helix dna binding domain --- Frequency: 6
3 --- Function: hnh endonuclease --- Frequency: 4
4 --- Function: serine integrase --- Frequency: 2
5 --- Function: reca-like dna recombinase --- Frequency: 2
-1
GIVEN: dna-q like exonuclease  ---  3 

-1 ---- No visible choice
1 --- Function: exonuclease --- Frequency: 36
2 --- Function: dnaq-like (dna polymerase iii subunit) --- Frequency: 12
3 --- Function: dna polymerase --- Frequency: 9
1
GIVEN: dnaj-like domain chaperonin  ---  3 

-1 ---- No visible choice
1 --- Function: chaperonin, dnaj-like --- Frequency: 12
1
GIVEN: dpra-like dna processing chain  ---  3 

-1 ---- No visible choice
1 --- Function: dpra-like dna processing chain a --- Frequency: 411

-1
GIVEN: putative aminotransferase  ---  3 

-1 ---- No visible choice
1 --- Function: acetyltransferase --- Frequency: 1
-1
GIVEN: putative atpase family protein  ---  3 

-1 ---- No visible choice
1 --- Function: aaa-atpase --- Frequency: 92
1
GIVEN: putative capsid decoration protein  ---  3 

-1 ---- No visible choice
1 --- Function: capsid decoration protein --- Frequency: 37
1
GIVEN: putative cutinase  ---  3 

-1 ---- No visible choice
1 --- Function: lysin b --- Frequency: 59
-1
GIVEN: putative deoxynucleotide monophosphate kinase  ---  3 

-1 ---- No visible choice
1 --- Function: deoxynucleoside monophosphate kinase --- Frequency: 264
2 --- Function: adenylate kinase --- Frequency: 12
1
GIVEN: putative dna methyltransferase  ---  3 

-1 ---- No visible choice
1 --- Function: dna methylase --- Frequency: 174
2 --- Function: methyltransferase --- Frequency: 25
3 --- Function: methylase --- Frequency: 1
2
GIVEN: putative dna polymerase iii alpha subunit  ---  3 

-1 ---- No vis

-1
GIVEN: atpase family protein  ---  2 

-1 ---- No visible choice
1 --- Function: aaa-atpase --- Frequency: 49
1
GIVEN: baseplate assembly protein, lysozyme domain  ---  2 

-1 ---- No visible choice
1 --- Function: baseplate wedge protein --- Frequency: 180
2 --- Function: minor tail protein --- Frequency: 2
3 --- Function: baseplate j protein --- Frequency: 2
-1
GIVEN: baseplate i protein  ---  2 

-1 ---- No visible choice
1 --- Function: minor tail protein --- Frequency: 142
2 --- Function: baseplate j protein --- Frequency: 2
2
GIVEN: calcineurin-like phosphoesterase family protein  ---  2 

-1 ---- No visible choice
1 --- Function: metallophosphoesterase --- Frequency: 457
2 --- Function: phosphoesterase --- Frequency: 71
3 --- Function: recombination directionality factor --- Frequency: 36
4 --- Function: mre11 double-strand break endo/exonuclease --- Frequency: 10
5 --- Function: hydrolase --- Frequency: 4
6 --- Function: esterase --- Frequency: 3
7 --- Function: dna binding 

1
GIVEN: exodeoxyribonuclease vii large subunit  ---  2 

-1 ---- No visible choice
1 --- Function: exonuclease --- Frequency: 8
1
GIVEN: fabg-like protein  ---  2 

-1 ---- No visible choice
1 --- Function: oxidoreductase --- Frequency: 16
2 --- Function: fabg-like reductase --- Frequency: 8
2
GIVEN: galactylsyl transferase  ---  2 

-1 ---- No visible choice
1 --- Function: glycosyltransferase --- Frequency: 120
2 --- Function: galactosyltransferase --- Frequency: 107
2
GIVEN: galctosyltransferase  ---  2 

-1 ---- No visible choice
1 --- Function: glycosyltransferase --- Frequency: 134
2 --- Function: galactosyltransferase --- Frequency: 114
2
GIVEN: giy-yig nuclease  ---  2 

-1 ---- No visible choice
1 --- Function: g-i-y y-i-g endonuclease --- Frequency: 24
2 --- Function: dna methylase --- Frequency: 1
3 --- Function: dna binding protein --- Frequency: 1
1
GIVEN: glutaredoxin-like protein  ---  2 

-1 ---- No visible choice
1 --- Function: nrdh-like glutaredoxin --- Frequency: 1

1
GIVEN: nicotinate ribsoyltransferase  ---  2 

-1 ---- No visible choice
1 --- Function: nicotinate ribosyltransferase --- Frequency: 132
2 --- Function: phosphoribosyl transferase --- Frequency: 2
1
GIVEN: nlp/p60-family domain protein  ---  2 

-1 ---- No visible choice
1 --- Function: minor tail protein --- Frequency: 236
2 --- Function: hydrolase --- Frequency: 2
3 --- Function: peptidase --- Frequency: 2
4 --- Function: lysin a --- Frequency: 2
-1
GIVEN: non-heme haloperoxidase  ---  2 

-1 ---- No visible choice
1 --- Function: hydrolase --- Frequency: 332
2 --- Function: esterase --- Frequency: 162
3 --- Function: acetyltransferase --- Frequency: 2
-1
GIVEN: nrdg  ---  2 

-1 ---- No visible choice
1 --- Function: ribonucleotide reductase --- Frequency: 58
-1
GIVEN: nrdh-redoxin  ---  2 

-1 ---- No visible choice
1 --- Function: nrdh-like glutaredoxin --- Frequency: 572
2 --- Function: glutaredoxin --- Frequency: 370
3 --- Function: thioredoxin --- Frequency: 92
4 --- Functio

1
GIVEN: putative t-fold protein  ---  2 

-1 ---- No visible choice
1 --- Function: gtp cyclohydrolase i --- Frequency: 34
2 --- Function: hydrolase --- Frequency: 1
-1
GIVEN: putative tail assembly/metal protease  ---  2 

-1 ---- No visible choice
1 --- Function: metalloprotease --- Frequency: 34
2 --- Function: capsid maturation protease --- Frequency: 8
3 --- Function: major capsid protein --- Frequency: 6
4 --- Function: protease --- Frequency: 6
5 --- Function: peptidase --- Frequency: 6
-1
GIVEN: putative tail chaperone  ---  2 

-1 ---- No visible choice
1 --- Function: tail assembly chaperone --- Frequency: 68
1
GIVEN: putative tail fiber  ---  2 

-1 ---- No visible choice
1 --- Function: capsid decoration protein --- Frequency: 17
-1
GIVEN: putative tail fibre protein  ---  2 

-1 ---- No visible choice
1 --- Function: minor tail protein --- Frequency: 4
-1
GIVEN: putative tail tape measure  ---  2 

-1 ---- No visible choice
1 --- Function: tape measure protein --- Frequen

KeyboardInterrupt: Interrupted by user

In [29]:
# Number of entries currently held in the conversion table. This includes
# names the interactive cell recorded as -1 ("No visible choice").
len(conversion_table)

623

In [30]:
# Persist the conversion table so the manual decisions survive a kernel
# restart. The `with` block closes (and flushes) the file even if pickling
# raises part-way through.
with open("new_conversion_table.pkl", "wb") as a_file:
    pickle.dump(conversion_table, a_file)

In [None]:
# Term substitutions intended for normalising function names: each key is an
# alternative wording and each value the wording to use instead.
# NOTE(review): the variable name is misspelled ("synomyms"); it is kept as-is
# because later cells may already refer to it.
# NOTE(review): "endo" is a prefix of "endonuclease" - apply this mapping to
# whole tokens only, otherwise "endonuclease" would be rewritten a second time.
synomyms = {"head":"capsid","connector":"adaptor", "single-stranded dna":"ssdna","endo":"endonuclease"}

# Filler words that carry no functional meaning. The correctly spelled
# "putative" and "protein" were missing (only the misspellings "punative" and
# "protien" were listed), so the words that actually occur in the data were
# never matched. The misspelled variants are kept so they are still caught if
# they show up in the annotations.
unimportant_words = ["putative", "punative", "domain", "protein", "protien"]