From f8a6594a66d3c974a1134e4d4cbc749c3a4803ce Mon Sep 17 00:00:00 2001 From: "Dylan W. Schwilk" Date: Mon, 29 Aug 2022 09:06:17 -0500 Subject: [PATCH] Change in how fuzzy_name_matches expects data I might change this back but this is the workflow for the mycorrhizal state project. The fuzzy match function now expects name lists as two-column data with the first column being the name as a single string, followed by a tab, then the second column being the parsed name as returned by my modified version of Cam Webb's taxon-tools/parsenames gawk script. --- scripts/fuzzy_match.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/fuzzy_match.py b/scripts/fuzzy_match.py index 10860d1..8d2e4be 100755 --- a/scripts/fuzzy_match.py +++ b/scripts/fuzzy_match.py @@ -4,6 +4,11 @@ # Utilities for fuzzy matching taxon names +"""This version needs both lists to include the raw scientific name as well as a +parsed version. A tab separates the two versions on a line and the pipe +character separates fields within the parsed name (as returned by Cam Webb's +parsename gawk script)""" + from Levenshtein import jaro_winkler as jaro_winkler import re @@ -21,6 +26,7 @@ THRESHOLD_DIST_SE = 3 THRESHOLD_JW = 0.94 # Final pass Jaro-Winkler cutoff for match + def is_gender_switch(seA, seB): """Check if specific epithet difference is simply one of latin gender.""" # constants @@ -107,7 +113,6 @@ def get_matches(pattern, matcher, limit): return(matcher.search(pattern, limit)) - def best_match(pattern, m, limit, jw_threshold): """Return best match to pattern in Matcher object, m. This is a two step process: First all candidate matches are found up to limit edits from pattern. 
@@ -160,9 +165,10 @@ def fuzzy_match_name_list(dlist, elist, outfile=sys.stdout, bname = best_genus + " " + best_se name = genus + " " + se res[name] = bname + # check for simple latin gender switch in se if is_gender_switch(se, best_se) : gender_switch = "True" - else : gender_switch = "False" - outfile.write(name + "," + bname + "," + str(genus_jw) + "," + str(jw) + "," + gender_switch + "\n") + else : gender_switch = "False" + outfile.write(name + "\t" + bname + "\t" + str(genus_jw) + "\t" + str(jw) + "\t" + gender_switch + "\n") # else : # logger.info("Unmatched: " + genus + "\n")