diff --git a/sourmash_lib/test-data/benchmark.dna.sig b/sourmash_lib/test-data/benchmark.dna.sig new file mode 100644 index 000000000..0b1fa4b16 --- /dev/null +++ b/sourmash_lib/test-data/benchmark.dna.sig @@ -0,0 +1,137 @@ +class: sourmash_signature +email: '' +hash_function: 0.murmur64 +name: gi|556503834:337-2799 Escherichia coli str. K-12 substr. MG1655, complete genome +signatures: +- ksize: 21 + md5sum: 3b3f65dba0dad5429f45e7023e269d7c + mins: [4519324738618364, 5243256295096602, 15176520465972589, 15680769339693652, + 18497857373523252, 18755330737671003, 21624573369793591, 23062839395285483, 25725997040217348, + 34666218892018070, 55529519525316327, 63656212937639342, 75298225208683694, 80926437301475488, + 82402909138949714, 84251831438776403, 89592827748722822, 96489177501377939, 107016255164726803, + 110175257989954062, 117055601844984303, 119550996164252532, 120076047914471482, + 124833414081263892, 132504286561547362, 139936877543357875, 142399575232998988, + 156589708717876073, 157600036558613226, 170884286104686349, 180176045311510443, + 183173484525153304, 183191513011246829, 190851827918094774, 196816659105826119, + 200502895665513902, 206617899443568343, 208648556119029728, 214757880775076502, + 221439947106598531, 222377104477853357, 229634467206794208, 229858927555422443, + 230403662973256315, 237754028275169642, 249662518251433611, 252081069902523525, + 252795710958968226, 256338218563006981, 258290322120847078, 296740190453303443, + 299105143441753695, 312212126552623732, 314500313110380519, 338527558449223148, + 339105017353077094, 359206500033710227, 365718153590186420, 370930398442177501, + 403815396160101602, 414182643590260521, 415731708245736342, 419730058340927733, + 431255193450726462, 449959775652570015, 455665465786325870, 456545671551732504, + 456749481847923790, 460465271562564119, 464691993746954567, 471969973719897424, + 473292825271979324, 473947444832446512, 503900581727712533, 504893110543338426, + 516254700454249722, 
517558974040154693, 523042158632644287, 545925973186494361, + 573089772132552719, 573224337350869238, 586237090548481729, 588355878869538600, + 596427876319044649, 598478531121293759, 598766489696012982, 599121980285621763, + 603345455278234761, 604332158663999379, 605161394232150749, 627953349748057426, + 628288815373196979, 631467831068356910, 638930121032982754, 646268535348523426, + 648766646957260149, 651957585584691918, 653734716277023714, 671176783675442398, + 676449899983361696, 678087622994377643, 681209191264591886, 684442631447027859, + 685245532727773945, 686539290304492030, 701759044462468520, 701934435706719454, + 715706916902914970, 729109484741235585, 730252970558480668, 732263750898555337, + 734986380512471581, 736361759870714723, 739966764979519208, 742537096162251665, + 754742552548241672, 763733017463572138, 777085581844833631, 777453786599457901, + 778458373774438377, 789327661010622672, 793256482262031210, 796573987605825156, + 798410089280959798, 800975640267829726, 809713169356379329, 818462572733457560, + 834457982392276800, 836147994239719748, 848246813795393775, 848387196404680558, + 851455833321420577, 867503340479804435, 886670143014542260, 900865133144246679, + 920025939482583144, 922015725707714710, 923708994455073967, 923982578113854156, + 925099719285689242, 933319151978708573, 934263259985171652, 948528561267794041, + 959782827337428307, 980807913068975752, 984838048286703305, 984881157805812413, + 989280030569627198, 993660021418010680, 995298495947683416, 1026357438610600699, + 1026722615218255446, 1030331839741280415, 1031921553827356732, 1039994399985206279, + 1052972869877817167, 1077381073158598057, 1079047609887446506, 1086012621410603330, + 1092338124404754997, 1112394971301281651, 1117021080640735011, 1122162917877620266, + 1126566809884012978, 1129369092101879926, 1129433785712834018, 1145419176368039314, + 1153868899172633617, 1159980668304032197, 1160481518917506640, 1160512527053304343, + 1161440558231456077, 
1168069068490863968, 1173220178183773678, 1182258147703991755, + 1185994171719370556, 1189242596262087385, 1199717730168357222, 1211779716367591027, + 1216201133346450995, 1229679666622766367, 1230656182628890758, 1233101334762226954, + 1236354072127266198, 1243074495767543813, 1250949996412780851, 1259049375998863105, + 1268020944079106958, 1286560352906372869, 1291261055811158406, 1299151697678883820, + 1313774236907997207, 1313834872258564977, 1318142498175019026, 1336629913601611224, + 1337290337233910885, 1342031747028108183, 1360788566971668171, 1360823748991542733, + 1366745720828760834, 1367147206171898204, 1370259284900334087, 1391302969262236457, + 1400304515373525090, 1404898683857088883, 1406201825417622770, 1417608977539552916, + 1419999067354124723, 1427543737893659597, 1431352449477640883, 1432621209833260478, + 1436585481635219363, 1450386337653025343, 1451885381036080122, 1453945751327348380, + 1453948733063214166, 1465221936850960950, 1472164335029945743, 1475828201299850435, + 1485152019595080685, 1492002816348121811, 1497806259765365076, 1501529834913216072, + 1503557462202075564, 1510837352866191218, 1512642141967435480, 1532450820489707644, + 1536357234676705810, 1536523421102184370, 1548963393451603019, 1581543022901963402, + 1632981599015678946, 1637817473292590359, 1638901620191724364, 1650197718545038690, + 1657066856918059654, 1661205611933086711, 1678149397196678002, 1683974340679182183, + 1686068793380065392, 1687691611522851499, 1691889127065138930, 1692273012421555454, + 1713216530254314524, 1718903384853258023, 1720385054113819339, 1730740015476700529, + 1733968174857035959, 1758778781373204159, 1768140986650268438, 1771350911157917208, + 1774013696963221559, 1782219010572055917, 1788767327145849387, 1789938326448838642, + 1792421215274753274, 1795657369358693420, 1810819993167968413, 1816404656173349530, + 1817055066496933728, 1820568892276037836, 1822962071471400097, 1825639555904054811, + 1834053593334762228, 1837129341233539103, 
1870937570599479553, 1872129592214099523, + 1884339369858065032, 1893820897613243295, 1896232822216098555, 1906516256210041402, + 1915315830121528302, 1920013365529191181, 1930723238411242399, 1930876119636722515, + 1934093046569151778, 1936342417335652928, 1942727491988570656, 1961443182263664121, + 1970163189183629183, 1980227172794045180, 1983067792340983364, 1991226619193590967, + 1991757372586677703, 1994846379492089436, 1995397558673942110, 1998399501406778405, + 2010774597913166049, 2023664635576891030, 2026143885654018577, 2030337421815491074, + 2037951508415193077, 2053583989650168866, 2054292000947418619, 2058741158204567239, + 2071695137931598692, 2078622190162551156, 2084410813573936943, 2090929346261178679, + 2093937992541224605, 2107642136893914209, 2107802078510491904, 2119447471755743701, + 2126517280203148378, 2130598817322888508, 2131159775922834200, 2144290045741408074, + 2146550799732142880, 2174688491616269385, 2182350378775642431, 2182495780395207238, + 2208728934520504861, 2213063025125587798, 2216045211154768385, 2224959614675946807, + 2230462238420622356, 2258381747443868073, 2263796152233777888, 2278505586625664696, + 2283065431645155418, 2284441040843337957, 2286648900818241539, 2295448339139811872, + 2297526010900547541, 2299823172028285892, 2304554498529584840, 2332625609224629147, + 2339375756765724029, 2347491726430767933, 2348838375021109037, 2355840300283764012, + 2365452663940516736, 2378457784354669085, 2378516266347023145, 2384891006616031939, + 2388019106715793809, 2398473838049688103, 2403334196097591244, 2409882547993093130, + 2428072231424585984, 2428978814828024367, 2438570578138481180, 2440354108354493663, + 2449325024734085858, 2451665978596034090, 2457715532828910646, 2457991397581156850, + 2461594951732363213, 2474997665650744564, 2475869000158010761, 2504767299873430452, + 2508919575429465809, 2510268831944191490, 2553262946850694743, 2553554258639171396, + 2558022526516252085, 2558190745137184689, 2560097132748291323, 
2574742088441021427, + 2577436160301605322, 2580810685249191212, 2581436868379296736, 2582843240745954311, + 2597446662573902694, 2617949472256625355, 2647764891290911793, 2655503369095476012, + 2659756106331503134, 2659847566533896528, 2662764696090442853, 2670028013069951546, + 2671437285399770849, 2686339012258676821, 2693432221956163444, 2695076334640323459, + 2700230503858942259, 2702170362149250821, 2722618233472513355, 2735262227501225629, + 2735399790361765204, 2736401556206271574, 2737523794693703117, 2742696243539725477, + 2742725323803853318, 2750342749205377626, 2750962880808453593, 2767211794175800290, + 2770492724049357466, 2773698349126661089, 2778259439953088600, 2778536944289919472, + 2781055436339043258, 2784931508320045865, 2786068262360282733, 2788402065397740995, + 2802718305571670043, 2803688320404732440, 2805699247995141926, 2820871697800450499, + 2838638486493829008, 2839081500525257566, 2857684846089530341, 2869214032260177002, + 2897250615174470364, 2898611078401408876, 2907297808381692787, 2911826942737329011, + 2912399171850959001, 2917927014905288691, 2920318468275464055, 2928442303188940546, + 2938663159845756313, 2952961813886494608, 2957504796722892583, 2969855958158578090, + 2975008837852664025, 2979748479387304108, 2991108502810333073, 3001493793227284979, + 3006033081391935982, 3024430717092521848, 3039608360044500288, 3041396875938219007, + 3042780289115984639, 3042824712400253071, 3052059856694997649, 3057247052038313565, + 3058556019203661583, 3063227788524327270, 3072447605791036746, 3077216324837998741, + 3083704535159701417, 3094068587744684348, 3104465009534030626, 3109969722608734588, + 3132232315277986023, 3132251615142410129, 3137360231796404095, 3137866721725159653, + 3142530568901831050, 3143098026980194232, 3145357567219202415, 3147447324507890102, + 3151208410561986743, 3153497729307887725, 3158086840306995462, 3168182845767792139, + 3171311335767071310, 3172196483357758834, 3175770247360822520, 3176291981415390553, 
+ 3188967604342996040, 3204197026109730228, 3221232622671857919, 3231261365601695509, + 3242574103497111853, 3243086161551174470, 3249549060036738262, 3257485475186759484, + 3270771298921183734, 3271612627264331837, 3300146850093170921, 3310978389214829867, + 3317803713725542045, 3326237421573977254, 3342090293414468725, 3350878148353134881, + 3352651877374063578, 3358448729304128201, 3364218462146205870, 3364473376994669079, + 3388562240502716743, 3389915034257361798, 3392673999750762566, 3406760409136676844, + 3408073641908567893, 3412407728887187912, 3424289220282056215, 3432226410480394041, + 3445682647990091419, 3446456007295741065, 3446548541137504392, 3451658369223712908, + 3456156073753649946, 3460410899898532804, 3461253244569955031, 3475054620636748053, + 3477865973545127599, 3489035233087890043, 3495678376357637248, 3498724394328816097, + 3507337921191423932, 3514908858619879004, 3515615531953951985, 3523557567378948344, + 3527861831568843924, 3528387260590764442, 3540378653931509135, 3544374012788067253, + 3550178620588827665] + num: 500 +type: mrnaseq +version: 0.4 +--- + diff --git a/sourmash_lib/test-data/benchmark.input_prot.sig b/sourmash_lib/test-data/benchmark.input_prot.sig new file mode 100644 index 000000000..db14d0c44 --- /dev/null +++ b/sourmash_lib/test-data/benchmark.input_prot.sig @@ -0,0 +1,139 @@ +class: sourmash_signature +email: '' +hash_function: 0.murmur64 +name: NP_414543.1 Bifunctional aspartokinase/homoserine dehydrogenase 1 [Escherichia + coli str. K-12 substr. 
MG1655] +signatures: +- ksize: 21 + md5sum: 24edfd6d0f64fbf42dbccb93c8c134f7 + mins: [10682905355934881, 16150794695267721, 17753254750845259, 30600215099479913, + 37355819167244906, 99117500080430131, 148742487324602500, 156452886777712559, + 162993231509285979, 177156339781637304, 199644633977465045, 248660297482490269, + 275524170355095580, 306279910562423717, 317824467826482715, 334146531533784392, + 338954598756989975, 371237943370537022, 461272048108075377, 509559073518577653, + 514964961279308538, 555409858638553730, 559012644985242630, 564629042853406415, + 578389020288120519, 580927968176440031, 640685175721760486, 647257381747169744, + 667269471732250243, 679757486152190992, 714115706385378738, 714179555958514242, + 774823265519756860, 802266878492282344, 813357472592692718, 832075088954656659, + 832477166973914780, 912451346402178396, 914080979894810649, 933515068323871437, + 951668928585067579, 966178457477709088, 995689886798495593, 1053693672418204958, + 1091013499526403368, 1179328976221969524, 1184517288514596795, 1186189957295452652, + 1227847205811126400, 1229029543460938479, 1255420948030336233, 1283895421556384617, + 1296935676789255939, 1302087720582233216, 1317918508957148169, 1326601499826900251, + 1355749477150217269, 1361798894306609982, 1376724123239797770, 1414348706405148746, + 1442695857065839701, 1450660084951753509, 1451090465693166715, 1471923799250968368, + 1486070334767516997, 1516969265982372724, 1547054723527126900, 1584992181680204639, + 1622963802471222395, 1631266615519568758, 1674136429647052301, 1691309820217597611, + 1696562844328366695, 1708030168090250658, 1717819643912594421, 1743589029305956870, + 1771655412601674994, 1778278400935886558, 1784129938737364597, 1816477401854898339, + 1830197349432806211, 1835574158361255715, 1840139141309112121, 1872819847089977069, + 1928830113603677194, 1942655715581533824, 1958274760512308516, 1966174586242755824, + 1983997344318358229, 2002869112004408829, 2052190570375234917, 
2088848217688857006, + 2092465887645141157, 2100937241927186307, 2101278828258474053, 2104649593316675444, + 2127865378022778058, 2127904763801419130, 2146522062132315478, 2160018687318625174, + 2177724862156869064, 2197287409672687767, 2210360403395154568, 2220227813113749289, + 2223511396504267252, 2231035703519480707, 2245438164895505684, 2248165019026967227, + 2256139307671620478, 2259264852120975032, 2292840296171613936, 2313914626771657608, + 2320721340569174311, 2338075986182469252, 2360396866421391099, 2364640489573294744, + 2406758860365655434, 2423179013689189265, 2448737136244154411, 2469245351346833774, + 2490585878833786389, 2502246077591838540, 2503447407469093326, 2507865646425407636, + 2516277423064370213, 2544637260049948732, 2552152401669543747, 2573523216671730949, + 2637759166922663195, 2638961467562004302, 2651079224201793332, 2696839868650266664, + 2710438860000019483, 2715215810836295485, 2758295712746202773, 2815384402494189266, + 2841815180231149078, 2853598460744642604, 2856311587481226915, 2892546529257426503, + 2894645982995959252, 2904853741381490417, 2910316796615469870, 2924210133812825619, + 2943733023520325027, 2961026992157453852, 2998546266431629508, 3009523701523688283, + 3011472395057755731, 3051752022648099896, 3052278310187811484, 3077665012269213827, + 3099376943052401103, 3107519715256702866, 3121954577473086880, 3122519633641600227, + 3157925041194120293, 3210252803490660508, 3253393389305116451, 3259058747017660169, + 3273003052789038911, 3282559116083722539, 3307481318231602924, 3321721671798892010, + 3325140133014704960, 3341207037602999600, 3371843561886178694, 3430438264554867931, + 3459774423855211294, 3503461083668628471, 3504789141313687821, 3528904367971675805, + 3667622271919296765, 3714408163245540989, 3797496917690432950, 3842142973332268297, + 3853939819675085335, 3873571047980922540, 3885321966062409173, 3893490215851908667, + 3905666570210321004, 3915294668805145009, 3916696378515928959, 3933321036263514369, 
+ 3964664411117053821, 3970765502178381733, 4074845057322737146, 4085844107482595459, + 4135680798979854223, 4157514075656323402, 4184496312029191046, 4187542175917561508, + 4208175361337775664, 4216723998438826837, 4217508190587322622, 4225492579266106220, + 4235156983960623358, 4263121036425740475, 4264044860627471418, 4277691404883774673, + 4290314971703863389, 4290522266987821833, 4354826747022013355, 4380862014083095651, + 4383263468962423950, 4426208588814664078, 4533140091375218183, 4555995448512426959, + 4580217109070879712, 4581891790542852588, 4615986015408079491, 4625950252622820102, + 4627880944776459138, 4640544205922069509, 4649657587230910416, 4652293753008361171, + 4682590855545968534, 4691442461607592571, 4691740469769901802, 4706380254589513682, + 4736813867721059110, 4756390378408707637, 4789501280323959463, 4813023417634069056, + 4834735296063819621, 4841024451746036657, 4844055552052425553, 4900671908521565097, + 4901872871373054594, 4907218629943135539, 4925586423423431735, 4939115087512982986, + 4958423229960100909, 4984397460057634624, 5039095838064931048, 5126632529463293243, + 5128464984899434530, 5136622863011569579, 5144527921885725955, 5168541985875311392, + 5172470802772231825, 5187250934587230851, 5253734691652424841, 5303989220674830051, + 5341301685774604065, 5349418547268253864, 5357295807597463188, 5365557023607835204, + 5384479667509934210, 5396307450147762014, 5420232484017582518, 5444547919488848630, + 5453154609874621884, 5478222244085633547, 5494113693325283887, 5495442915250229597, + 5498472762435894913, 5505873302741345056, 5506440284818710276, 5515388545713973274, + 5540210507889289427, 5554644998006503884, 5573063099667294633, 5585876533220473132, + 5603741897743235763, 5618640394893581406, 5629095139064129438, 5634823002358759158, + 5679551921699448084, 5759547545508790040, 5761918910641162438, 5771113446298577572, + 5788650952639148935, 5834908356155715069, 5881715436940831548, 5911899423563711717, + 
5930845630573581870, 5954234669470430406, 5997702108378258810, 5999285198248577775, + 6011833446286024380, 6021675575123244533, 6045764855194090223, 6064611246722735205, + 6088982971846465839, 6184153387854396063, 6185567134918389168, 6189525558808830735, + 6240524902361544331, 6276381353721310608, 6312106194357626407, 6326730850981182171, + 6388803301528348089, 6398872565513096530, 6416029515770779849, 6422673840825994722, + 6455982297426559802, 6473187733945823449, 6519446247473132996, 6531104990266858446, + 6535455114807439429, 6561206837946627555, 6596545665778155295, 6613776294157801448, + 6639145899173784138, 6672809082272968660, 6678504039954018896, 6708256449228333129, + 6716161533038028074, 6722442219712337699, 6736922461561653881, 6741596921472276934, + 6817906399157341056, 6846267181638332537, 6856636408219833856, 6868094984889238653, + 6870842072963411977, 6871860687264191532, 6872351760937902499, 6882678219914848575, + 6898205070188796865, 6912581032040637274, 6924759048391579808, 6927329647982452587, + 6988428322282886496, 7104994969433370604, 7116687700559937402, 7129907715306245071, + 7136261932605298047, 7141543599901716499, 7174681492424595912, 7178492862712053782, + 7214318729724837022, 7227456528619666134, 7235946843860091794, 7280969018330879211, + 7284060732363196020, 7298968912864322874, 7310334837893458717, 7337769637189750102, + 7368327414875420580, 7406117571455886760, 7443740128134257237, 7462308476218008663, + 7465895078159000181, 7467049778016050261, 7523762790460175354, 7528523012183441586, + 7582930159101023118, 7599351062828362883, 7621036739567331536, 7621873726013925648, + 7632488240078199503, 7643211121595033053, 7648817208214795141, 7675726127750291040, + 7678787808093027697, 7680138541566608678, 7683316007508743225, 7721093125821650335, + 7765393596362526500, 7773228240684509978, 7790522190523653015, 7797517208235014758, + 7825274354998754442, 7853552151717946294, 7868054814615925352, 7870519410913985110, + 7876455960893299504, 
7944819911154275220, 7955274410962036180, 7982311525022890678, + 8005509899444692250, 8049200313090271204, 8066463256648704249, 8085241364297325811, + 8114293520720617818, 8114686021388301174, 8132482790631881290, 8133299828995226495, + 8193016553910765619, 8196459004927330156, 8228338824262146664, 8238335396100987360, + 8254520792823728383, 8267508385806720048, 8282682190643351649, 8308279964916478008, + 8379318025547649498, 8403490400769311159, 8416851901985843499, 8417363070153515180, + 8453560107119619057, 8497257221538987762, 8531308738386601760, 8549242562830344777, + 8579345429011704328, 8596561545926806237, 8625227684192126734, 8650701470926778333, + 8728796141572031261, 8731279620926496026, 8751627774811922702, 8758722957073166232, + 8776214222143903128, 8815284774457673906, 8827804030304790004, 8839249076508655955, + 8906066004466986695, 8923766898874241857, 8944225775389112781, 9022933641087310127, + 9077834736006363286, 9136133449756499376, 9139370841846854606, 9143890121108219909, + 9146742276846680925, 9196458087337652127, 9215306604237959443, 9229159508317029608, + 9249461362443313590, 9256205793144786034, 9270931489203274310, 9273523738422105681, + 9275341094120503612, 9276841587988795924, 9314431608357750327, 9317981569306923934, + 9351195765462630186, 9359040102657841760, 9360254905577931110, 9396720477031063647, + 9414813095083254288, 9419340602259326371, 9426590420986595133, 9446518403944637633, + 9449036294393201616, 9464391126292190460, 9465540378209144842, 9530316263968071769, + 9577403252477125595, 9578228711375383441, 9593634519537772852, 9651644386104343220, + 9666659681191317939, 9679623353040886628, 9706768900736517862, 9755416314182142925, + 9755824638877304590, 9780885421811977349, 9814139178364589153, 9861218950482175086, + 9925096123474164619, 9925823550962345642, 9935190611498362552, 9962799832056714067, + 9978056796243419534, 9993045118210673261, 10033732379317957552, 10041590538664485448, + 10053251600321106315, 
10064684699654777413, 10065942241274461421, 10069671417576739906, + 10098458141873339528, 10136607888794848211, 10161178784823577580, 10180209104701716371, + 10247352081644971210, 10253013902823404118, 10255332839713555453, 10265902899367828233, + 10271418788612591002, 10277532139121477345, 10293112315539013732, 10314673237547032458, + 10315896321381154606, 10331383062078191090, 10336074876329119286, 10351379625690399947, + 10424991294641334734, 10426986344957878387, 10443967685902779035, 10449207970243068709, + 10455760882159158987, 10483261648484642302, 10493096184640318205, 10509004757430497704, + 10522055032882165009, 10531172354070436133, 10535130355543259640, 10542966685829255665, + 10604843523450387477, 10616906715012981315, 10627672652373875951, 10646579636266397175, + 10662675238870589643, 10666054824191775732, 10676845122526014730, 10704087553262149467] + molecule: protein + num: 500 +type: mrnaseq +version: 0.4 +--- + diff --git a/sourmash_lib/test-data/benchmark.prot.sig b/sourmash_lib/test-data/benchmark.prot.sig new file mode 100644 index 000000000..a655117b0 --- /dev/null +++ b/sourmash_lib/test-data/benchmark.prot.sig @@ -0,0 +1,137 @@ +class: sourmash_signature +email: '' +hash_function: 0.murmur64 +name: gi|556503834:337-2799 Escherichia coli str. K-12 substr. 
MG1655, complete genome +signatures: +- ksize: 21 + md5sum: c567e8c26e673ec982a4939da58ffaa6 + mins: [7930275018672208, 9075874669311903, 10682905355934881, 16150794695267721, + 17547107236545012, 17640452842079392, 17753254750845259, 24086344812251351, 30600215099479913, + 37069982692728201, 37355819167244906, 38136249062647522, 48792208721158894, 51515867600534675, + 64121846258798986, 69272978335398059, 71447365833161612, 84697722076242271, 85226675969043304, + 86076375159296880, 87304621089807058, 87552204895837332, 91815393743222518, 98299384868090376, + 99117500080430131, 100005690218545377, 107242291917890717, 114332564217836134, + 115758415042321923, 130728973585524938, 136220243510272543, 139423480614605536, + 141299620217034366, 142659247104697677, 148742487324602500, 153398261005674271, + 153582898471423319, 153740319167598270, 156452886777712559, 159404793775705276, + 162993231509285979, 164530861997430387, 167223312653609338, 172449648878794955, + 174725370298669007, 177156339781637304, 180854471883513185, 182948694658587440, + 187931390792287220, 188657388719349073, 189203668757510815, 193427363112506194, + 199644633977465045, 209696514686008843, 210156161229902944, 210217965072815944, + 212546782500618980, 213300323685592848, 219640711827082781, 222563623307754875, + 230163658243834060, 243673009071578736, 244014645717890490, 244326451629595695, + 246691011532924839, 248660297482490269, 248803004283743663, 252060428502079956, + 257501498862743354, 265085958816863430, 273454530183615351, 274291630505652326, + 275524170355095580, 280219828303968928, 282568803879081068, 285231040798065803, + 286893707502532214, 294820520251591938, 296580323925665157, 301008597068707850, + 305208426903845864, 306279910562423717, 306373247270959975, 317824467826482715, + 320223347920727937, 321493068949469677, 334146531533784392, 335788862963809211, + 338954598756989975, 345017604344397019, 360631350052977373, 369089197074442757, + 371237943370537022, 388478704966647785, 
409112837573878651, 434160138521587441, + 437465851955225980, 446478558209411406, 450374717104586842, 457369813187887010, + 457574891201280709, 459127354660680566, 459298506844071775, 459359342711225619, + 461272048108075377, 465342684731111830, 467071769628805980, 474710144514664437, + 478519259951586381, 487158213246049430, 494336819974385508, 498813370643510604, + 509559073518577653, 513753510299612674, 514964961279308538, 523731454045110256, + 525612175220213964, 531428121182662005, 532438308846828098, 541596844237363094, + 541903771940552871, 546914239787460255, 552072950795747859, 555409858638553730, + 557875218626339430, 559012644985242630, 564629042853406415, 571017737818732892, + 574382367180953132, 578011197651451314, 578389020288120519, 580927968176440031, + 610969912445563441, 614951108496454510, 616268149116846731, 617981880663082183, + 627319388772321463, 634615205592931243, 635739925927571886, 640685175721760486, + 644688007577121740, 647257381747169744, 647657131651535430, 650129014256298588, + 653527914527060701, 655353885707454966, 661351732088590511, 661439198567158899, + 663095478084650357, 664378362974868310, 664753616847925812, 664907100791330707, + 667269471732250243, 679262193175437130, 679757486152190992, 684345808023604480, + 687594031558610541, 693446288284341759, 693685333281246574, 694219704957144457, + 700906162253649173, 702340636008528479, 708787615898557518, 714115706385378738, + 714179555958514242, 717066141939919148, 717997401125922957, 731119513899372297, + 733236297169758665, 736389262353633879, 736897801154300745, 739848224364080826, + 745699530327920170, 748398049432157404, 748423366827825553, 750752256714461266, + 753860669367984676, 759013876860825975, 762541325007093834, 768544302932269467, + 769496630124015702, 774823265519756860, 775267558667932797, 778684908731145305, + 784663010464042672, 784830375424576970, 787634490082956219, 790343245846285981, + 802266878492282344, 813357472592692718, 819401314378961358, 
822775846045504252, + 823091125492181447, 825302703243006423, 832075088954656659, 832477166973914780, + 835671135465233200, 837213868883701582, 840941746001279053, 844532823307214917, + 845183430099397111, 852908406506036645, 854319845067587352, 858088070267624393, + 861952355302862993, 865238026244482760, 865671781027662439, 872095281080796975, + 873863200607942491, 874718660088462986, 878205668413004813, 888236690830937992, + 890735580668930926, 893848437992939042, 895367907040252107, 898090261648046426, + 899322905470271421, 912451346402178396, 914080979894810649, 916066474244448952, + 919676067761027450, 920845663060947555, 922692784576881185, 923477021371880684, + 927781463654787788, 930279098891294778, 933515068323871437, 938888001844250670, + 945737001444486630, 951668928585067579, 952322578693013313, 953823420320576413, + 959845045799765234, 960769715277714890, 961052043857653833, 961055334761858887, + 961863742196027811, 965918454822341262, 966178457477709088, 966816163833510006, + 969256419536926103, 970242784469008018, 974236444389212521, 988000277524680681, + 993145613504368087, 995689886798495593, 997730867113778426, 1000651360734654728, + 1007796518391370345, 1017379829380910046, 1017625569006513008, 1019663095382838705, + 1020341232725799053, 1021356501812509000, 1023782547662502337, 1031213481269095776, + 1031485893867773446, 1033791049518572845, 1036000482160651075, 1036904520802410135, + 1049774441160092291, 1053693672418204958, 1055207377856037100, 1058922065923299210, + 1063279351662569135, 1068163740119232381, 1070810674783709304, 1074083801880988122, + 1074460933888705490, 1080621428616392430, 1083601088529209438, 1087938824524443703, + 1091013499526403368, 1097922429017537238, 1103067368153795190, 1104119206904782911, + 1108923567283337928, 1109085074190276484, 1119573159263373912, 1123271479243765954, + 1136189084055850542, 1138082755101186178, 1150920645445585469, 1155726786148959027, + 1162967605958108650, 1168967330079103636, 
1171213291108713442, 1171235020946715474, + 1171297622155675411, 1173736031602702309, 1178096420910080805, 1179328976221969524, + 1180678142236757953, 1184086107588279992, 1184517288514596795, 1186189957295452652, + 1192580095698879005, 1201727091857946167, 1202884457448665653, 1205934721598649129, + 1208050132845156847, 1223261048069811460, 1225142549441816887, 1226621959644478872, + 1227847205811126400, 1229029543460938479, 1229542673361065943, 1231383667163032268, + 1232951987501541828, 1236166748036181894, 1239455382117678722, 1246125225592788963, + 1249337217116493887, 1253408437317614022, 1255420948030336233, 1264134924050284435, + 1265368711289617852, 1269876512869377641, 1272420822849893571, 1278390010866966594, + 1283895421556384617, 1287023759829985472, 1287494383473243370, 1290561839469361214, + 1296935676789255939, 1299774739101094458, 1302087720582233216, 1304554361404049807, + 1317918508957148169, 1318152567793019328, 1318451001303251171, 1318584536332394538, + 1322806625358602405, 1326601499826900251, 1328460050836389738, 1331140925254453872, + 1338236326350880512, 1343283840411093815, 1343993775955035925, 1353903699780815167, + 1355749477150217269, 1361798894306609982, 1362696591503028932, 1367587738422289145, + 1368877693252671231, 1376724123239797770, 1377497866413068319, 1380479285915617230, + 1405111287461283811, 1413925919445815562, 1414348706405148746, 1419471906664392979, + 1423593465512194553, 1425402697630341159, 1426492352232335912, 1427370622400232590, + 1440942592488124182, 1442665877876752012, 1442695857065839701, 1450250793339871951, + 1450660084951753509, 1451090465693166715, 1455230264688992031, 1457962912778788673, + 1463570968439213059, 1466787260573801999, 1471923799250968368, 1484194422532907633, + 1486070334767516997, 1489510473296108253, 1491477729393925591, 1491721797504416373, + 1491974436736024343, 1493615002909706310, 1495580586922233524, 1501737315607475847, + 1502555833035007937, 1505721636508143449, 1516969265982372724, 
1518539287526864101, + 1523348621662415645, 1530479241075606611, 1535816983270442236, 1536064025206264829, + 1536441049178728569, 1536480424088365761, 1537361820865763291, 1541912665230852647, + 1542551006348033039, 1544939522422462064, 1547054723527126900, 1547104639155251003, + 1549062991711208478, 1556953999233205552, 1557429880601798031, 1562068153913511130, + 1564842859315667989, 1571609427611552814, 1572444396812775889, 1573252281972517468, + 1577329713929752959, 1578295166649786589, 1582311885861633278, 1584992181680204639, + 1585202802582949054, 1591598824000345826, 1591834498926280358, 1605038676808044086, + 1606903500708615174, 1610385274673544043, 1619008289754356641, 1622369398087891801, + 1622963802471222395, 1629132958847429415, 1629520886988487308, 1631266615519568758, + 1631405498241637938, 1635385886896492471, 1635534479137840166, 1638219935979085452, + 1641589709758615938, 1642998735747133682, 1656291265170637770, 1667865148272922146, + 1673216236812448808, 1674136429647052301, 1676120374315702638, 1677971948972977938, + 1678961278137565129, 1681587803218985506, 1691309820217597611, 1691464305686985269, + 1692903751113469813, 1696562844328366695, 1700308245307187515, 1703838023745664291, + 1705397137760768671, 1708030168090250658, 1710811828194744676, 1716908215538136219, + 1717819643912594421, 1720637010320377293, 1726944860583033672, 1730879700411978972, + 1731687599079095752, 1736216585571289129, 1737881241851299156, 1743589029305956870, + 1755048168471353146, 1757247487833520901, 1765636799815162689, 1767317456508383658, + 1769098531771118961, 1771655412601674994, 1775569209765601682, 1778278400935886558, + 1784129938737364597, 1798099240944700178, 1802837152077937248, 1803185195520887767, + 1808759982712059399, 1811024907616097743, 1813780555909033179, 1814800990757351932, + 1816477401854898339, 1818655136651526295, 1821482972855150524, 1830197349432806211, + 1835574158361255715, 1839789632490363928, 1840139141309112121, 1840853314953851001, 
+    1842923687549616332, 1861987392016553862, 1863215397271907868, 1872819847089977069,
+    1877730191472146234, 1879766569694078825, 1885784418365677532, 1887491736765616376,
+    1889581325281546723, 1892826395574424759, 1899850520181678491, 1905132713136897939,
+    1915276928258674542, 1919753889956605473, 1919834814610232616, 1920258488147739839,
+    1928794591850714246, 1928830113603677194, 1931512806935762710, 1932920981801307694,
+    1934577446354899030, 1938113130056821616, 1942655715581533824, 1943542560170639744,
+    1949049296707212374, 1950730322789272536, 1958274760512308516, 1959812964369970361]
+  molecule: protein
+  num: 500
+type: mrnaseq
+version: 0.4
+---
+
diff --git a/sourmash_lib/test_sourmash.py b/sourmash_lib/test_sourmash.py
index 788c80325..9557c259a 100644
--- a/sourmash_lib/test_sourmash.py
+++ b/sourmash_lib/test_sourmash.py
@@ -282,6 +282,77 @@ def test_do_sourmash_check_protein_comparisons():
         assert round(sig2_aa.similarity(sig2_trans), 3) == 0.0
 
 
+def test_do_sourmash_check_knowngood_dna_comparisons():
+    # Cross-check 'sourmash compute --dna' against the known good
+    # signature produced by utils/compute-dna-mh-another-way.py
+    with utils.TempDirectory() as location:
+        genes_fna = utils.get_test_data('ecoli.genes.fna')
+        cmdline = ['compute', '-k', '21', '--singleton', '--dna', genes_fna]
+        status, out, err = utils.runscript('sourmash', cmdline,
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'ecoli.genes.fna.sig')
+        assert os.path.exists(sigfile)
+
+        # exactly two signatures are expected; order them by name
+        loaded = list(signature.load_signatures(sigfile))
+        first, second = sorted(loaded, key=lambda sig: sig.name())
+
+        knowngood = utils.get_test_data('benchmark.dna.sig')
+        good = list(signature.load_signatures(knowngood))[0]
+
+        # the known good signature matches the one that sorts second
+        assert second.similarity(good) == 1.0
+
+
+def test_do_sourmash_check_knowngood_input_protein_comparisons():
+    # Cross-check 'sourmash compute --input-is-protein' against the known
+    # good signature produced by utils/compute-input-prot-another-way.py
+    with utils.TempDirectory() as location:
+        proteins_faa = utils.get_test_data('ecoli.faa')
+        cmdline = ['compute', '-k', '21', '--input-is-protein',
+                   '--singleton', proteins_faa]
+        status, out, err = utils.runscript('sourmash', cmdline,
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'ecoli.faa.sig')
+        assert os.path.exists(sigfile)
+
+        # exactly two signatures are expected; order them by name
+        loaded = list(signature.load_signatures(sigfile))
+        first_aa, second_aa = sorted(loaded, key=lambda sig: sig.name())
+
+        knowngood = utils.get_test_data('benchmark.input_prot.sig')
+        good_aa = list(signature.load_signatures(knowngood))[0]
+
+        # here the known good signature matches the one that sorts first
+        assert first_aa.similarity(good_aa) == 1.0
+
+
+def test_do_sourmash_check_knowngood_protein_comparisons():
+    # Cross-check 'sourmash compute --protein --no-dna' against the known
+    # good signature produced by utils/compute-prot-mh-another-way.py
+    with utils.TempDirectory() as location:
+        genes_fna = utils.get_test_data('ecoli.genes.fna')
+        cmdline = ['compute', '-k', '21', '--singleton', '--protein',
+                   '--no-dna', genes_fna]
+        status, out, err = utils.runscript('sourmash', cmdline,
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'ecoli.genes.fna.sig')
+        assert os.path.exists(sigfile)
+
+        # exactly two signatures are expected; order them by name
+        loaded = list(signature.load_signatures(sigfile))
+        first_trans, second_trans = sorted(loaded, key=lambda sig: sig.name())
+
+        knowngood = utils.get_test_data('benchmark.prot.sig')
+        good_trans = list(signature.load_signatures(knowngood))[0]
+
+        # the known good signature matches the one that sorts second
+        assert second_trans.similarity(good_trans) == 1.0
+
+
 def test_do_plot_comparison():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('short.fa')
diff --git a/utils/compute-dna-mh-another-way.py b/utils/compute-dna-mh-another-way.py
new file mode 100755
index 000000000..e9a8d996e
--- /dev/null
+++ b/utils/compute-dna-mh-another-way.py
@@ -0,0 +1,60 @@
+#! /usr/bin/env python
+"""
+Use the MurmurHash library mmh3 and separate Python code to calculate
+a MinHash signature for input DNA sequence, as a way to do an
+external check on our C++ implementation.
+
+The output of this is used in test_sourmash.py to verify our C++ code.
+"""
+
+from __future__ import print_function
+
+__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" }
+def complement(s):
+    """
+    Return complement of 's'.
+    """
+    c = "".join(__complementTranslation[n] for n in s)
+    return c
+
+
+def reverse(s):
+    """
+    Return reverse of 's'.
+    """
+    r = "".join(reversed(s))
+    return r
+
+
+def kmers(seq, k):
+    for start in range(len(seq) - k + 1):
+        yield seq[start:start + k]
+
+###
+
+K = 21
+
+import sys, screed
+import mmh3
+import sourmash_lib
+print('imported sourmash:', sourmash_lib, file=sys.stderr)
+from sourmash_lib import MinHash
+import sourmash_lib.signature
+
+record = next(iter(screed.open(sys.argv[1])))
+print('loaded', record.name, file=sys.stderr)
+
+E = sourmash_lib.Estimators(ksize=K, n=500, protein=False)
+mh = E.mh
+
+for fwd_kmer in kmers(record.sequence, K):
+    rev_kmer = reverse(complement(fwd_kmer))
+    # of the k-mer and its reverse complement, keep whichever sorts first
+    kmer = min(fwd_kmer, rev_kmer)
+    # NOTE(review): hash128() returns a 128-bit integer; confirm how add_hash()
+    # narrows it (the benchmark DNA signature is labelled 0.murmur64).
+    kmer_hash = mmh3.hash128(kmer, seed=42)
+    mh.add_hash(kmer_hash)
+
+s = sourmash_lib.signature.SourmashSignature('', E, name=record.name)
+print(sourmash_lib.signature.save_signatures([s]))
diff --git a/utils/compute-input-prot-another-way.py b/utils/compute-input-prot-another-way.py
new file mode 100755
index 000000000..18282677c
--- /dev/null
+++ b/utils/compute-input-prot-another-way.py
@@ -0,0 +1,47 @@
+#! /usr/bin/env python
+"""
+Use the MurmurHash library mmh3 and separate Python code to calculate
+a MinHash signature for input protein sequence, as a way to do an
+external check on our C++ implementation.
+
+The output of this is used in test_sourmash.py to verify our C++ code.
+"""
+
+from __future__ import print_function
+
+
+def kmers(seq, k):
+    """
+    Yield every length-'k' substring of 'seq', left to right.
+    """
+    for start in range(len(seq) - k + 1):
+        yield seq[start:start + k]
+
+###
+
+K = 21
+
+import sys, screed
+import mmh3
+import sourmash_lib
+print('imported sourmash:', sourmash_lib, file=sys.stderr)
+from sourmash_lib import MinHash
+import sourmash_lib.signature
+
+record = next(iter(screed.open(sys.argv[1])))
+print('loaded', record.name, file=sys.stderr)
+
+E = sourmash_lib.Estimators(ksize=K, n=500, protein=True)
+# the input is hashed as-is in windows a third of K long
+# (presumably K is expressed in nucleotides -- TODO confirm)
+prot_ksize = K // 3
+mh = E.mh
+
+for kmer in kmers(record.sequence, prot_ksize):
+    # NOTE(review): hash128() returns a 128-bit integer; confirm how add_hash()
+    # narrows it (the benchmark DNA signature is labelled 0.murmur64).
+    kmer_hash = mmh3.hash128(kmer, seed=42)
+    mh.add_hash(kmer_hash)
+
+s = sourmash_lib.signature.SourmashSignature('', E, name=record.name)
+print(sourmash_lib.signature.save_signatures([s]))
diff --git a/utils/compute-prot-mh-another-way.py b/utils/compute-prot-mh-another-way.py
new file mode 100755
index 000000000..b5f416e57
--- /dev/null
+++ b/utils/compute-prot-mh-another-way.py
@@ -0,0 +1,111 @@
+#! /usr/bin/env python
+"""
+Use the MurmurHash library mmh3 and separate Python code to calculate
+a MinHash signature for translated protein sequence, as a way to do an
+external check on our C++ implementation.
+
+The output of this is used in test_sourmash.py to verify our C++ code.
+"""
+
+from __future__ import print_function
+
+dna_to_aa={'TTT':'F','TTC':'F', 'TTA':'L','TTG':'L',
+           'TCT':'S','TCC':'S','TCA':'S','TCG':'S',
+           'TAT':'Y','TAC':'Y', 'TAA':'*','TAG':'*','TGA':'*',
+           'TGT':'C','TGC':'C', 'TGG':'W',
+           'CTT':'L','CTC':'L','CTA':'L','CTG':'L',
+           'CCT':'P','CCC':'P','CCA':'P','CCG':'P',
+           'CAT':'H','CAC':'H', 'CAA':'Q','CAG':'Q',
+           'CGT':'R','CGC':'R','CGA':'R','CGG':'R',
+           'ATT':'I','ATC':'I','ATA':'I', 'ATG':'M',
+           'ACT':'T','ACC':'T','ACA':'T','ACG':'T',
+           'AAT':'N','AAC':'N', 'AAA':'K','AAG':'K',
+           'AGT':'S','AGC':'S', 'AGA':'R','AGG':'R',
+           'GTT':'V','GTC':'V','GTA':'V','GTG':'V',
+           'GCT':'A','GCC':'A','GCA':'A','GCG':'A',
+           'GAT':'D','GAC':'D', 'GAA':'E','GAG':'E',
+           'GGT':'G','GGC':'G','GGA':'G','GGG':'G'}
+
+
+__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" }
+def complement(s):
+    """
+    Return complement of 's'.
+
+    Raises KeyError for any character other than A, C, G, T or N.
+    """
+    c = "".join(__complementTranslation[n] for n in s)
+    return c
+
+
+def reverse(s):
+    """
+    Return reverse of 's'.
+    """
+    r = "".join(reversed(s))
+    return r
+
+
+def peptides(seq, start):
+    """
+    Yield one amino acid letter per codon of 'seq', starting at 'start'.
+
+    Codons missing from dna_to_aa are reported as "X"; this includes a
+    trailing fragment shorter than three bases.
+    """
+    # NOTE(review): confirm the C++ code also emits "X" for a trailing
+    # partial codon; otherwise the last k-mers of a frame can differ.
+    for i in range(start, len(seq), 3):
+        yield dna_to_aa.get(seq[i:i+3], "X")
+
+
+def translate(seq):
+    """
+    Yield six translations of 'seq': offsets 0, 1 and 2 of the sequence
+    itself, then offsets 0, 1 and 2 of its reverse complement.
+    """
+    for i in range(3):
+        pep = peptides(seq, i)
+        yield "".join(pep)
+
+    revcomp = reverse(complement(seq))
+    for i in range(3):
+        pep = peptides(revcomp, i)
+        yield "".join(pep)
+
+def kmers(seq, k):
+    """
+    Yield every length-'k' substring of 'seq', left to right.
+    """
+    for start in range(len(seq) - k + 1):
+        yield seq[start:start + k]
+
+###
+
+K = 21
+
+import sys, screed
+import mmh3
+import sourmash_lib
+print('imported sourmash:', sourmash_lib, file=sys.stderr)
+from sourmash_lib import MinHash
+import sourmash_lib.signature
+
+record = next(iter(screed.open(sys.argv[1])))
+print('loaded', record.name, file=sys.stderr)
+
+E = sourmash_lib.Estimators(ksize=K, n=500, protein=True)
+# peptides() consumes three bases per amino acid, so the translated
+# sequence is hashed in windows of K // 3 residues
+prot_ksize = K // 3
+mh = E.mh
+
+for trans in translate(record.sequence):
+    for kmer in kmers(trans, prot_ksize):
+        # NOTE(review): hash128() returns a 128-bit integer; confirm how
+        # add_hash() narrows it (the benchmark DNA signature is labelled 0.murmur64).
+        kmer_hash = mmh3.hash128(kmer, seed=42)
+        mh.add_hash(kmer_hash)
+
+s = sourmash_lib.signature.SourmashSignature('', E, name=record.name)
+print(sourmash_lib.signature.save_signatures([s]))