In [1]:
import pandas as pd

import nltk
from nltk.corpus import wordnet
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

import spacy
from spacy.matcher import Matcher

from thefuzz import fuzz, process


# Data

In [2]:
# Lift pandas' row cap so displayed frames are rendered in full instead of truncated.
pd.options.display.max_rows = None

In [3]:
# Matched examples: read the reference pairs from disk and show them as the cell output.
df_matched = pd.read_csv(filepath_or_buffer='matched_data.csv')
df_matched

Unnamed: 0,source_1,source_2
0,"Cereals, barley",barley
1,Other fresh or chilled potatoes,potatoes
2,"Fruit, edible; cherries, fresh",cherries
3,"Bran, sharps and other residues, of maize",maize bran
4,"Cereals, millet",millets
5,"Oil seeds; sesamum seeds, whether or not broken",sesame seed
6,Cigarettes containing tobacco,cigarettes
7,"Tomato, fresh or chilled",tomatoes
8,"Pulp, bagasse and other waste of sugar manufac...",beet pulp
9,"Oil seeds; sunflower seeds, whether or not broken",sunflower seed


# Fuzzy string matching

In [None]:
# Shared NLTK normalisers used by the helper functions in this notebook:
# a Lancaster stemmer and a WordNet lemmatizer, each built once.
stemmer, lemmatizer = LancasterStemmer(), WordNetLemmatizer()

In [None]:
def root_words(string):
    """Return the lowercased lemma of every token in ``string``.

    The text is split with ``nltk.word_tokenize``; each token is lowercased and
    passed through the module-level ``lemmatizer``. The result is a list with
    one entry per token, in token order.
    """
    lemmas = []
    for token in nltk.word_tokenize(string):
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    # Alternative normalisation: stemmer.stem(token.lower()) instead of lemmatizing.
    return lemmas


root_words('computer')

In [None]:
def get_synonyms(string):
    """Collect the WordNet lemma names for ``string`` as a flat list of words.

    Surrounding whitespace is stripped before the lookup. Every lemma name of
    every synset returned by ``wordnet.synsets`` is included, in lookup order;
    names containing ``'_'`` are broken into their parts. Duplicates are kept.
    """
    lookup = string.strip()
    words = []
    for synset in wordnet.synsets(lookup):
        for lemma in synset.lemmas():
            # A name without '_' splits into just itself, so one extend covers both cases.
            words.extend(lemma.name().split('_'))
    return words


get_synonyms('mammals   ')

In [None]:
# Mirror of the pairs found by the most recent get_match() call. Kept at module
# level so cells that read this name keep working.
matched_data = []


def get_match(source_1, source_2, threshold=80, verbose=True, use_synonyms=False):
    """Greedily pair each name in ``source_2`` with one name in ``source_1``.

    Names from ``source_2`` are processed in order. Each one is stripped and
    paired with the first still-unpaired ``source_1`` name for which either

    * ``fuzz.token_set_ratio(name, candidate) == 100``, or
    * ``fuzz.partial_ratio`` between the first root word of the name (see
      ``root_words``) and the lowercased candidate is strictly above
      ``threshold``.

    A ``source_1`` name is used at most once.

    Differences from the earlier recursive version:

    * It runs in a single pass instead of recursing and rescanning after every
      match, so a large number of matches no longer hits the recursion limit.
    * The input lists are not modified. Matched entries are tracked by
      position in a private copy, which also avoids ``list.remove`` failing
      when a ``source_2`` entry carried surrounding whitespace.
    * Every call returns a fresh list, so calling twice does not accumulate
      duplicates.
    * Blank names and non-string entries (e.g. NaN from an empty CSV cell) are
      skipped instead of raising.
    * The root word is already lowercased by ``root_words``, so the candidate
      is lowercased as well before the partial comparison.
    * The old synonym check re-scored the same first root word and therefore
      could never add a match. It is replaced by a working fallback, off by
      default so the default results stay driven by the direct comparison.

    Parameters
    ----------
    source_1 : iterable of str
        Candidate names; these become ``s1`` in the result.
    source_2 : iterable of str
        Names to look up; the stripped form becomes ``s2`` in the result.
    threshold : int, default 80
        A partial ratio must be strictly greater than this to count.
    verbose : bool, default True
        Print the four fuzz scores for every direct comparison.
    use_synonyms : bool, default False
        When a single-word name has no direct match, also accept the first
        remaining candidate whose partial ratio against any ``get_synonyms``
        term exceeds ``threshold``. WordNet synonyms span every sense of the
        word, so expect looser matches.

    Returns
    -------
    list of dict
        One ``{'s1': candidate, 's2': name}`` entry per pair, in match order.
        The module-level ``matched_data`` list is refreshed with the same pairs.
    """

    def _direct_hit(query, root, candidate):
        # Only the partial and token-set scores decide; the other two are diagnostics.
        partial_ratio = fuzz.partial_ratio(root, candidate.lower())
        token_set_ratio = fuzz.token_set_ratio(query, candidate)
        if verbose:
            print(f"'{query}' vs '{candidate}'")
            print(f"Ratio : {fuzz.ratio(query, candidate)}")
            print(f"Partial Ratio : {partial_ratio}")
            print(f"Token sort Ratio : {fuzz.token_sort_ratio(query, candidate)}")
            print(f"Token set Ratio : {token_set_ratio}")
            print()
        return token_set_ratio == 100 or partial_ratio > threshold

    def _synonym_hit(terms, candidate):
        lowered = candidate.lower()
        return any(fuzz.partial_ratio(term, lowered) > threshold for term in terms)

    def _first_index(predicate, candidates):
        return next((pos for pos, cand in enumerate(candidates) if predicate(cand)), None)

    # Private working copy: the caller's list is left untouched.
    remaining = [name for name in source_1 if isinstance(name, str)]
    matches = []

    for raw_name in source_2:
        if not isinstance(raw_name, str):
            continue
        query = raw_name.strip()
        roots = root_words(query) if query else []
        if not roots:
            continue
        root = roots[0]  # loop-invariant for this query, so computed once

        hit = _first_index(lambda cand: _direct_hit(query, root, cand), remaining)

        if hit is None and use_synonyms and ' ' not in query:
            # The root itself was already tested by the direct pass.
            terms = {term.lower() for term in get_synonyms(query)} - {root, ''}
            if terms:
                hit = _first_index(lambda cand: _synonym_hit(terms, cand), remaining)

        if hit is not None:
            matches.append(dict(s1=remaining.pop(hit), s2=query))

    matched_data[:] = matches
    return matches
                    

In [None]:
# Matched examples
df_matched = pd.read_csv('matched_data.csv')
df_s1 = pd.read_csv('source_1.csv')
df_s2 = pd.read_csv('source_2.csv')

# Alternative input: the hand-matched example pairs instead of the full source lists.
#src_1 = df_matched.source_1.to_list()
#src_2 = df_matched.source_2.to_list()
src_1 = df_s1.name.to_list()
src_2 = df_s2.name.to_list()

# Start from an empty accumulator so re-running this cell does not duplicate pairs.
matched_data.clear()
# Pass copies: get_match may remove matched names from the lists it is given,
# and src_1 / src_2 are reused further down the notebook.
df_data = get_match(list(src_1), list(src_2))
df = pd.DataFrame(df_data)
print(df)

In [None]:
#src_1 = df_matched.source_1.to_list()
#src_2 = df_matched.source_2.to_list()

In [4]:
# Read both source tables and pull their `name` columns out as plain Python lists.
df_s1, df_s2 = (pd.read_csv(csv_path) for csv_path in ('source_1.csv', 'source_2.csv'))
src_1, src_2 = df_s1.name.to_list(), df_s2.name.to_list()

In [None]:
# For every src_1 name, print the best src_2 candidate under each thefuzz scorer.
# Labels are printed exactly as written here, in this order.
scorers = (
    ('partial token sort ratio', fuzz.partial_token_sort_ratio),
    ('partial token set ratio', fuzz.partial_token_set_ratio),
    ('token sort ratio', fuzz.token_sort_ratio),
    (' token set ratio', fuzz.token_set_ratio),
    ('partial ratio', fuzz.partial_ratio),
    ('ratio', fuzz.ratio),
)

for query_name in src_1:
    print(query_name)
    for label, scorer in scorers:
        print(label)
        print(process.extractOne(query_name, src_2, scorer=scorer))

    print()

Horses, asses, mules and hinnies; live, pure-bred breeding animals
partial token sort ratio
('hinnies', 100)
partial token set ratio
('goal 11 sustainable cities and communities', 100)
token sort ratio
('diagnosis and recommendation integrated system', 46)
 token set ratio
('pure breeding', 100)
partial ratio
('hinnies', 100)
ratio
('hong kong special administrative region of china', 44)

Horses; live, pure-bred breeding animals
partial token sort ratio
('animals', 100)
partial token set ratio
('dominance of animals', 100)
token sort ratio
('fur bearing animals', 60)
 token set ratio
('pure breeding', 100)
partial ratio
('animals', 100)
ratio
('fur bearing animals', 58)

Horses; live, other than pure-bred breeding animals
partial token sort ratio
('animals', 100)
partial token set ratio
('dominance of animals', 100)
token sort ratio
('psittacine beak and feather disease', 52)
 token set ratio
('pure breeding', 100)
partial ratio
('animals', 100)
ratio
('nutrient transport in animals', 