# Using `SingleProteinSearch()`

Requires a neo4j databases to be running at `localhost:7687`

In [50]:
import os
import logging
from rich import inspect
# Sometimes the neo4j debug logger is quite verbose, so it's disabled for this example
logging.getLogger("neo4j").disabled = True

Tell socialgene where it can talk to the neo4j database

In [51]:
os.environ["NEO4J_URI"] = "bolt://localhost:7687"

Provide the path to the HMMs used to create the database

In [52]:
hmm_filepath="/home/chase/Documents/socialgene_data/mibig_antismash/socialgene_per_run/hmm_cache/socialgene_nr_hmms_file_1_of_1.hmm"

Import the required module

In [53]:
from socialgene.findmybgc.findmybgc import FindMyBGC

Create a SingleProteinSearch() object. Requires two inputs:

1) A protein sequence
2) Path to a file of HMMs

In [54]:
sps_object = FindMyBGC()
# pikAI from https://mibig.secondarymetabolites.org/repository/BGC0000094/index.html#r1c1
sps_object.input_sg_object.add_protein(sequence = "MSSAGITRTGARTPVTGRGAAAWDTGEVRVRRGLPPAGPDHAEHSFSRAPTGDVRAELIRGEMSTVSKSESEEFVSVSNDAGSAHGTAEPVAVVGISCRVPGARDPREFWELLAAGGQAVTDVPADRWNAGDFYDPDRSAPGRSNSRWGGFIEDVDRFDAAFFGISPREAAEMDPQQRLALELGWEALERAGIDPSSLTGTRTGVFAGAIWDDYATLKHRQGGAAITPHTVTGLHRGIIANRLSYTLGLRGPSMVVDSGQSSSLVAVHLACESLRRGESELALAGGVSLNLVPDSIIGASKFGGLSPDGRAYTFDARANGYVRGEGGGFVVLKRLSRAVADGDPVLAVIRGSAVNNGGAAQGMTTPDAQAQEAVLREAHERAGTAPADVRYVELHGTGTPVGDPIEAAALGAALGTGRPAGQPLLVGSVKTNIGHLEGAAGIAGLIKAVLAVRGRALPASLNYETPNPAIPFEELNLRVNTEYLPWEPEHDGQRMVVGVSSFGMGGTNAHVVLEEAPGGCRGASVVESTVGGSAVGGGVVPWVVSAKSAAALDAQIERLAAFASRDRTDGVDAGAVDAGAVDAGAVARVLAGGRAQFEHRAVVVGSGPDDLAAALAAPEGLVRGVASGVGRVAFVFPGQGTQWAGMGAELLDSSAVFAAAMAECEAALSPYVDWSLEAVVRQAPGAPTLERVDVVQPVTFAVMVSLARVWQHHGVTPQAVVGHSQGEIAAAYVAGALSLDDAARVVTLRSKSIAAHLAGKGGMLSLALSEDAVLERLAGFDGLSVAAVNGPTATVVSGDPVQIEELARACEADGVRARVIPVDYASHSRQVEIIESELAEVLAGLSPQAPRVPFFSTLEGAWITEPVLDGGYWYRNLRHRVGFAPAVETLATDEGFTHFVEVSAHPVLTMALPGTVTGLATLRRDNGGQDRLVASLAEAWANGLAVDWSPLLPSATGHHSDLPTYAFQTERHWLGEIEALAPAGEPAVQPAVLRTEAAEPAELDRDEQLRVILDKVRAQTAQVLGYATGGQIEVDRTFREAGCTSLTGVDLRNRINAAFGVRMAPSMIFDFPTPEALAEQLLLVVHGEAAANPAGAEPAPVAAAGAVDEPVAIVGMACRLPGGVASPEDLWRLVAGGGDAISEFPQDRGWDVEGLYHPDPEHPGTSYVRQGGFIENVAGFDAAFFGISPREALAMDPQQRLLLETSWEAVEDAGIDPTSLRGRQVGVFTGAMTHEYGPSLRDGGEGLDGYLLTGNTASVMSGRVSYTLGLEGPALTVDTACSSSLVALHLAVQALRKGEVDMALAGGVAVMPTPGMFVEFSRQRGLAGDGRSKAFAASADGTSWSEGVGVLLVERLSDARRNGHQVLAVVRGSALNQDGASNGLTAPNGPSQQRVIRRALADARLTTSDVDVVEAHGTGTRLGDPIEAQALIATYGQGRDDEQPLRLGSLKSNIGHTQAAAGVSGVIKMVQAMRHGLLPKTLHVDEPSDQIDWSAGAVELLTEAVDWPEKQDGGLRRAAVSSFGISGTNAHVVLEEAPVVVEGASVVEPSVGGSAVGGGVTPWVVSAKSAAALDAQIERLAAFASRDRTDDADAGAVDAGAVAHVLADGRAQFEHRAVALGAGADDLVQALADPDGLIRGTASGVGRVAFVFPGQGTQWAGMGAELLDSSAVFAAAMAECEAALSPYVDWSLEAVVRQAPGAPTLERVDVVQPVTFAVMVSLARVWQHHGVTPQAVVGHSQGEIAAAYVAGALPLDDAARVVTLRSKSIAAHLAGKGGMLSLALNEDAVLERLSDFDGLSVAAVNGPTATVVSGDPVQIEELAQACKADGFRARIIPVDYASHSRQVEIIESELAQVLAGLSPQAPRVPFFSTLEGTWITEPVLDGTYWYRNLRHRVGFAPAIETLAVDEGFTHFVEVSAHPVLTMTLPETVTGLGTLRREQGGQERLVTSLAEAWVNGLPVAWTSLLPATASRPGLPTYAFQAERYWLENTPAALATGDDWRYRIDWKRLPAAEGSERTGLSGRWLAVTPEDHSAQAAAVLTALVDAGAKVEVLTAGADDDREALAARLTALTTGDGFTGVVSLLDGLVPQVAWVQALGDAGIKAPLWSVTQGAVSVGRLDTPADPDRAMLWGLGRVVALEHPERWAGLVDLPAQPDAAALAHLVTALSGATGEDQIAIRTTGLHARRLARAPLHGRRPTRDWQPHGTVLITGGTGALGSHAARWMAHHGAEHLLLVSRSGEQAPGATQLTAELTASGARVTIAACDVADPHAMRTLLDAIPAETPLTAVVHTAGALDDGIVDTLTAEQVRRAHRAKAVGASVLDELTRDLDLDAFVLFSSVSSTLGIPGQGNYAPHNAYLDALAARRRATGRSAVSVAWGPWDGGGMAAGDGVAERLRNHGVPGMDPELALAALESALGRDETAITVADIDWDRFYLAYSSGRPQPLVEELPEVRRIIDARDSATSGQGGSSAQGANPLAERLAAAAPGERTEILLGLVRAQAAAVLRMRSPEDVAADRAFKDIGFDSLAGVELRNRLTRATGLQLPATLVFDHPTPLALVSLLRSEFLGDEETADARRSAALPATVGAGAGAGAGTDADDDPIAIVAMSCRYPGDIRSPEDLWRMLSEGGEGITPFPTDRGWDLDGLYDADPDALGRAYVREGGFLHDAAEFDAEFFGVSPREALAMDPQQRMLLTTSWEAFERAGIEPASLRGSSTGVFIGLSYQDYAARVPNAPRGVEGYLLTGSTPSVASGRIAYTFGLEGPATTVDTACSSSLTALHLAVRALRSGECTMALAGGVAMMATPHMFVEFSRQRALAPDGRSKAFSADADGFGAAEGVGLLLVERLSDARRNGHPVLAVVRGTAVNQDGASNGLTAPNGPSQQRVIRQALADARLAPGDIDAVETHGTGTSLGDPIEAQGLQATYGKERPAERPLAIGSVKSNIGHTQAAAGAAGIIKMVLAMRHGTLPKTLHADEPSPHVDWANSGLALVTEPIDWPAGTGPRRAAVSSFGISGTNAHVVLEQAPDAAGEVLGADEVPEVSETVAMAGTAGTSEVAEGSEASEAPAAPGSREASLPGHLPWVLSAKDEQSLRGQAAALHAWLSEPAADLSDADGPARLRDVGYTLATSRTAFAHRAAVTAADRDGFLDGLATLAQGGTSAHVHLDTARDGTTAFLFTGQGSQRPGAGRELYDRHPVFARALDEICAHLDGHLELPLLDVMFAAEGSAEAALLDETRYTQCALFALEVALFRLVESWGMRPAALLGHSVGEIAAAHVAGVFSLADAARLVAARGRLMQELPAGGAMLAVQAAEDEIRVWLETEERYAGRLDVAAVNGPEAAVLSGDADAAREAEAYWSGLGRRTRALRVSHAFHSAHMDGMLDGFRAVLETVEFRRPSLTVVSNVTGLAAGPDDLCDPEYWVRHVRGTVRFLDGVRVLRDLGVRTCLELGPDGVLTAMAADGLADTPADSAAGSPVGSPAGSPADSAAGALRPRPLLVALLRRKRSETETVADALGRAHAHGTGPDWHAWFAGSGAHRVDLPTYSFRRDRYWLDAPAADTAVDTAGLGLGTADHPLLGAVVSLPDRDGLLLTGRLSLRTHPWLADHAVLGSVLLPGAAMVELAAHAAESAGLRDVRELTLLEPLVLPEHGGVELRVTVGAPAGEPGGESAGDGARPVSLHSRLADAPAGTAWSCHATGLLATDRPELPVAPDRAAMWPPQGAEEVPLDGLYERLDGNGLAFGPLFQGLNAVWRYEGEVFADIALPATTNATAPATANGGGSAAAAPYGIHPALLDASLHAIAVGGLVDEPELVRVPFHWSGVTVHAAGAAAARVRLASAGTDAVSLSLTDGEGRPLVSVERLTLRPVTADQAAASRVGGLMHRVAWRPYALASSGEQDPHATSYGPTAVLGKDELKVAAALESAGVEVGLYPDLAALSQDVAAGAPAPRTVLAPLPAGPADGGAEGVRGTVARTLELLQAWLADEHLAGTRLLLVTRGAVRDPEGSGADDGGEDLSHAAAWGLVRTAQTENPGRFGLLDLADDASSYRTLPSVLSDAGLRDEPQLALHDGTIRLARLASVRPETGTAAPALAPEGTVLLTGGTGGLGGLVARHVVGEWGVRRLLLVSRRGTDAPGADELVHELEALGADVSVAACDVADREALTAVLDAIPAEHPLTAVVHTAGVLSDGTLPSMTTEDVEHVLRPKVDAAFLLDELTSTPAYDLAAFVMFSSAAAVFGGAGQGAYAAANATLDALAWRRRAAGLPALSLGWGLWAETSGMTGELGQADLRRMSRAGIGGISDAEGIALLDAALRDDRHPVLLPLRLDAAGLRDAAGNDPAGIPALFRDVVGARTVRARPSAASASTTAGTAGTPGTADGAAETAAVTLADRAATVDGPARQRLLLEFVVGEVAEVLGHARGHRIDAERGFLDLGFDSLTAVELRNRLNSAGGLALPATLVFDHPSPAALASHLDAELPRGASDQDGAGNRNGNENGTTASRSTAETDALLAQLTRLEGALVLTGLSDAPGSEEVLEHLRSLRSMVTGETGTGTASGAPDGAGSGAEDRPWAAGDGAGGGSEDGAGVPDFMNASAEELFGLLDQDPSTD")
sps_object.annotate_input(use_neo4j_precalc = False, hmm_filepath=hmm_filepath)

First 5 HMM hash-ids:

In [55]:
inspect(sps_object.input_sg_object.proteins["l0XEibBqir1EE_0-MYQGaxObU8G5_xrk"].domains)

In [56]:
inspect(sps_object.input_sg_object.proteins["l0XEibBqir1EE_0-MYQGaxObU8G5_xrk"].domains[0])

Search Neo4j for proteins containing those HMMs

In [58]:
sps_object.query_neo4j_for_related_proteins()

  self.driver.verify_connectivity()


CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Unknown function 'gds.similarity.jaccard' (line 14, column 27 (offset: 860))
"WITH `input_protein_hash` AS `input_protein_hash`, `prot1` AS `prot1`"
                           ^}

Get the HMMs of those proteins (TODO: should be done with previous step)
Yeah, TODO: this step also computes all pairwise comparisons between the input protein HMMs and the previously found proteins

In [None]:
sps_object.query_neo4j_get_hmms_given_protein_ids()

AttributeError: 'SingleProteinSearch' object has no attribute 'query_neo4j_get_hmms_given_protein_ids'

Take the results, find the genomic locations and species info for each found protein, and sort by "mod_score"

In [None]:
sps_object.sg_object.protein_comparison_append_species_and_assemblies()
sps_object.sg_object.protein_comparison_sort()
sps_object.sg_object.protein_comparison

: 