Updates for python3

schwilklab · Aug 29, 2022 · 2f0fba4 · 2f0fba4
1 parent 7a4c751
commit 2f0fba4
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 26 deletions.
diff --git a/scripts/automata.py b/scripts/automata.py
@@ -128,7 +128,7 @@ def find_next_edge(self, s, x):
     if x is None:
       x = u'\0'
     else:
-      x = unichr(ord(x) + 1)
+      x = chr(ord(x) + 1)
     state_transitions = self.transitions.get(s, {})
     if x in state_transitions or s in self.defaults:
       return x

diff --git a/scripts/fuzzy_match.py b/scripts/fuzzy_match.py
@@ -56,27 +56,44 @@ def is_gender_switch(seA, seB):
     except :
         return False
 
-
-def genus_species(names):
-    """Split names into genus and set of species. Return dictionary of strings to
-sets."""
+
+# def parse_species_line(line):
+#     """Return as a three part tuple the original full name (first field before
+# tab), the genus, and a created specific epithet field that includes the specific
+#     epithet AND any infraspecific epithet."""
+#     # print(line)
+#     full_name, parsed_name = line.split("\t")
+#     if(parsed_name == "") return (
+#     gx, genus, sx, se, infra_rank, infra_e, author = parsed_name.split("|")
+#     result = (full_name, genus, ' '.join(filter(None, [se, infra_rank, infra_e])))
+#     return(result)
+
+
+def genus_species(lines):
+    """Return dictionary of strings to sets. sets contain parsed species name lines
+as three-part tuples. Each tuple holds, in order: the original full
+name (first field before tab), the genus, and a created specific epithet field
+that includes the specific epithet AND any infraspecific epithet."""
     genera = {}
-    for name in names:
-        try :
-            parts = name.split()  # ignore all parts after
-            genus = parts[0]
-            se = parts[1]
-            genera.setdefault(genus, set())
-            genera[genus].add(se)
-        except :
-            print parts
+    for l in lines:
+        sl = l.split("\t")
+        if(len(sl) > 1):
+            full_name, parsed_name = sl
+            gx, genus, sx, se, infra_rank, infra_e, author = parsed_name.split("|")
+            pname = (full_name, genus, ' '.join(filter(None, [se, infra_rank, infra_e])))     
+            genera.setdefault(pname[1], set())
+            genera[pname[1]].add(pname)
     return(genera)
 
+
+## TODO: need to wrap the tuple up as an object and define equality, comparison, etc.
+
+
 def best_jw_match(pattern, matches, jw_threshold):
     """Find match within list of candidates with highest Jaro-Winkler similarity.
 Return None if no match is higher than jw_threshold. Returns tuple (match,
 jw_similarity) """
-    jw_dists = map(lambda n : jaro_winkler(pattern,n), matches)
+    jw_dists = list(map(lambda n : jaro_winkler(pattern,n), matches))
     max_jw = max(jw_dists)
     if(max_jw >= jw_threshold) :
         bmatch = matches[jw_dists.index(max_jw)]
@@ -102,33 +119,39 @@ def best_match(pattern, m, limit, jw_threshold):
         return(best_jw_match(pattern, matches, jw_threshold))
     return((None,None))
 
-
 def fuzzy_match_name_list(dlist, elist, outfile=sys.stdout,
                           genus_dist = THRESHOLD_DIST_GENUS,
                           se_dist = THRESHOLD_DIST_SE,
                           threshold_jw = THRESHOLD_JW):
-    """Match all taxon names in dlist to best match in elist. Return dictionary
-with matchable names in dlist as keys and best match in elist as values. The
-function writes the output as it progresses so that state is saved (slow
-process), default output is stdout."""
+    """Match all taxon names in dlist to best match in elist.
+
+    This version needs both lists to include the raw scientific name as well as a
+parsed version. A tab separates the two versions on a line and the pipe
+character separates fields within the parsed name (as returned by Cam Webb's
+parsename gawk script).
+
+    Return dictionary with matchable names from dlist (unparsed) as keys and
+best match in elist (unparsed) as values. The function writes the output as it
+progresses so that state is saved (slow process), default output is stdout.
+
+    """
 
     ## Get genus->species dicts for both lists
     enames = genus_species(elist)
-    egenera = enames.keys()
-    egenera.sort()
-
+    egenera = sorted(enames.keys())
     dnames = genus_species(dlist)
 
     res = {}
     genus_matcher = Matcher(egenera, True)
     count=0
     # write header
-    outfile.write("dlist,elist,genus_jw,se_jw,gender_switch\n")
+    outfile.write("dlist\telist\tgenus_jw\tse_jw\tgender_switch\n")
     for genus in sorted(dnames.keys()) :
         best_genus, genus_jw = best_match(genus, genus_matcher, genus_dist, threshold_jw)
         if best_genus :
-            se_matcher = Matcher(sorted(enames[best_genus]), True)
-            for se in dnames[genus]:
+            se_matcher = Matcher(list(map(lambda x : x[2], enames[best_genus])))
+            for se in map(lambda x : x[2], dnames[genus]):
+#                print(se)
                 if count % 100 == 0 : logger.info(str(count) + ": " + genus)
                 count = count+1
                 (best_se, jw) = best_match(se, se_matcher, se_dist, threshold_jw)