Skip to content

Commit

Permalink
Change in how fuzzy_name_matches expects data
Browse files Browse the repository at this point in the history
I might change this back, but this is the workflow for the mycorrhizal
state project.  The fuzzy match function now expects name lists as
two-column data, with the first column being the name as a single string,
followed by a tab, then the second column being the parsed name as
returned by my modified version of Cam Webb's taxon-tools/parsenames
gawk script.
  • Loading branch information
dschwilk committed Aug 29, 2022
1 parent 2f0fba4 commit f8a6594
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions scripts/fuzzy_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

# Utilities for fuzzy matching taxon names

"""This version needs both lists to include the raw scientific name as well as a
parsed version. A tab separates the two versions on a line and the pipe
character separates fields within the parsed name (as returned by Cam Webb's
parsename gawk script)"""

from Levenshtein import jaro_winkler as jaro_winkler
import re

Expand All @@ -21,6 +26,7 @@
THRESHOLD_DIST_SE = 3
THRESHOLD_JW = 0.94 # Final pass Jaro-Winkler cutoff for match


def is_gender_switch(seA, seB):
"""Check if specific epithet difference is simply one of latin gender."""
# constants
Expand Down Expand Up @@ -107,7 +113,6 @@ def get_matches(pattern, matcher, limit):
return(matcher.search(pattern, limit))



def best_match(pattern, m, limit, jw_threshold):
"""Return best match to pattern in Matcher object, m. This is a two step
process: First all candidate matches are found up to limit edits from pattern.
Expand Down Expand Up @@ -160,9 +165,10 @@ def fuzzy_match_name_list(dlist, elist, outfile=sys.stdout,
bname = best_genus + " " + best_se
name = genus + " " + se
res[name] = bname
# check for simple latin gender switch in se
if is_gender_switch(se, best_se) : gender_switch = "True"
else : gender_switch = "False"
outfile.write(name + "," + bname + "," + str(genus_jw) + "," + str(jw) + "," + gender_switch + "\n")
else : gender_switch = "False"
outfile.write(name + "\t" + bname + "\t" + str(genus_jw) + "\t" + str(jw) + "\t" + gender_switch + "\n")

# else :
# logger.info("Unmatched: " + genus + "\n")
Expand Down

0 comments on commit f8a6594

Please sign in to comment.