In [1]:
import pandas as pd

import nltk
from nltk.corpus import wordnet
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

import spacy
from spacy.matcher import Matcher

from thefuzz import fuzz, process


# Data

In [2]:
# Show every row when a DataFrame is displayed: passing None for
# 'display.max_rows' removes pandas' row limit. The option is set globally,
# so it applies to every frame rendered after this cell.
pd.set_option('display.max_rows', None)

In [3]:
# Matched examples: load the CSV (path is relative to the working directory).
# The matching code further down reads the `source_1` and `source_2` columns.
df_matched = pd.read_csv('matched_data.csv')
# Bare expression so the frame is rendered as this cell's output.
df_matched

Unnamed: 0,source_1,source_2
0,"Cereals, barley",barley
1,Other fresh or chilled potatoes,potatoes
2,"Fruit, edible; cherries, fresh",cherries
3,"Bran, sharps and other residues, of maize",maize bran
4,"Cereals, millet",millets
5,"Oil seeds; sesamum seeds, whether or not broken",sesame seed
6,Cigarettes containing tobacco,cigarettes
7,"Tomato, fresh or chilled",tomatoes
8,"Pulp, bagasse and other waste of sugar manufac...",beet pulp
9,"Oil seeds; sunflower seeds, whether or not broken",sunflower seed


# Fuzzy string matching

In [4]:
# Notebook-level text normalizers, created once and shared by the helpers below.
# `lemmatizer` is the one used by root_words().
lemmatizer = WordNetLemmatizer()
# NOTE(review): `stemmer` is not referenced by any active code in the visible
# part of this notebook — confirm whether it is still needed.
stemmer = LancasterStemmer()

In [5]:
def root_words(string):
    """Tokenize ``string`` and return the lemma of each lower-cased token.

    Tokens come from ``nltk.word_tokenize``; each one is lower-cased and
    passed to the notebook-level ``lemmatizer``.

    Parameters
    ----------
    string : str
        Text to split into tokens.

    Returns
    -------
    list of str
        One lemma per token, in token order.
    """
    lemmas = []
    for token in nltk.word_tokenize(string):
        # Alternative normalization: stemmer.stem(token.lower())
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return lemmas
# Quick check of root_words() on a single word; the result is shown as the cell output.
root_words('computer')

['computer']

In [6]:
def get_synonyms(string):
    """Collect the words of every WordNet lemma name found for ``string``.

    Every synset returned by ``wordnet.synsets(string)`` contributes the names
    of all its lemmas. Names made of several words joined by underscores are
    split into their individual words. Duplicates are kept and the order
    follows the synset/lemma order.

    Parameters
    ----------
    string : str
        Word to look up in WordNet.

    Returns
    -------
    list of str
        Flat list of synonym words; empty when no synset is found.
    """
    lemma_names = [
        lemma.name()
        for synset in wordnet.synsets(string)
        for lemma in synset.lemmas()
    ]
    # str.split('_') returns the name itself when it has no underscore, so a
    # single flattening pass covers both single- and multi-word names.
    return [word for name in lemma_names for word in name.split('_')]
# Quick check of get_synonyms() on a single word; the result is shown as the cell output.
get_synonyms('computer')

['computer',
 'computing',
 'machine',
 'computing',
 'device',
 'data',
 'processor',
 'electronic',
 'computer',
 'information',
 'processing',
 'system',
 'calculator',
 'reckoner',
 'figurer',
 'estimator',
 'computer']

In [7]:
def get_match(source_1, source_2, partial_threshold=80):
    """Fuzzy-match every entry of ``source_2`` against the entries of ``source_1``.

    For each (stripped) entry of ``source_2`` the entries of ``source_1`` are
    scanned in order. The scan stops at the first candidate for which either

    * the token-set ratio between the entry and the candidate is exactly 100, or
    * the entry is a single word with WordNet synonyms and the partial ratio
      computed for the "entry + synonyms" string exceeds ``partial_threshold``.

    The scores of every comparison are printed as they are computed.

    Parameters
    ----------
    source_1 : iterable of str
        Candidate descriptions to match against.
    source_2 : iterable of str
        Entries to look up; each one is stripped of surrounding whitespace.
    partial_threshold : int, optional
        Partial-ratio score that must be exceeded for a synonym-based match
        (default 80).

    Returns
    -------
    list of tuple
        One ``(entry, match)`` pair per element of ``source_2``, in order,
        where ``entry`` is the stripped string and ``match`` is the accepted
        ``source_1`` element or ``None`` when nothing was accepted. Entries
        that are empty after stripping are reported with ``None`` and are not
        compared.
    """
    def get_ratios(i, j):
        """Print the four fuzz scores for ``i`` vs ``j``.

        Returns ``(partial_ratio, token_set_ratio)``.
        """
        print(f"'{i}' vs '{j}'")

        ratio = fuzz.ratio(i, j)
        print(f"Ratio : {ratio}")

        # Only the first lemmatized token of ``i`` takes part in this score.
        partial_ratio = fuzz.partial_ratio(root_words(i)[0], j)
        print(f"Partial Ratio : {partial_ratio}")

        token_sort_ratio = fuzz.token_sort_ratio(i, j)
        print(f"Token sort Ratio : {token_sort_ratio}")

        token_set_ratio = fuzz.token_set_ratio(i, j)
        print(f"Token set Ratio : {token_set_ratio}")
        print()

        return partial_ratio, token_set_ratio

    matches = []
    for i in source_2:
        i = i.strip()
        if not i:
            # An empty entry has no first token, so get_ratios() would fail on
            # root_words(i)[0]; record it as unmatched instead of crashing.
            matches.append((i, None))
            continue

        # Synonyms are only looked up for single-word entries.
        synonyms = get_synonyms(i) if ' ' not in i else None
        all_synonyms = ''
        if synonyms:
            all_synonyms = i + ' ' + ' '.join(synonyms)

        matched = None
        for j in source_1:
            token_set_ratio = get_ratios(i, j)[1]
            if token_set_ratio == 100:
                matched = j
                break
            if all_synonyms:
                # NOTE(review): get_ratios() feeds only the first lemmatized
                # token of its first argument to partial_ratio, so the appended
                # synonyms appear to affect only the printed scores, not this
                # decision — confirm this is intended.
                partial_ratio = get_ratios(all_synonyms, j)[0]
                if partial_ratio > partial_threshold:
                    matched = j
                    break
        matches.append((i, matched))

    return matches

In [8]:
# Run the matcher over the two columns of the matched-examples frame:
# every `source_2` entry is compared against the `source_1` entries, and
# get_match() prints the scores of each comparison it makes.
src_1 = df_matched.source_1
src_2 = df_matched.source_2
get_match(src_1, src_2)

'barley' vs 'Cereals, barley'
Ratio : 57
Partial Ratio : 100
Token sort Ratio : 60
Token set Ratio : 100

'potatoes' vs 'Cereals, barley'
Ratio : 17
Partial Ratio : 17
Token sort Ratio : 27
Token set Ratio : 27

'potatoes potato white potato Irish potato murphy spud tater potato white potato white potato vine Solanum tuberosum' vs 'Cereals, barley'
Ratio : 14
Partial Ratio : 17
Token sort Ratio : 11
Token set Ratio : 17

'potatoes' vs 'Other fresh or chilled potatoes'
Ratio : 41
Partial Ratio : 100
Token sort Ratio : 41
Token set Ratio : 100

'cherries' vs 'Cereals, barley'
Ratio : 35
Partial Ratio : 33
Token sort Ratio : 45
Token set Ratio : 45

'cherries cherry cherry cherry tree cherry cerise cherry cherry red' vs 'Cereals, barley'
Ratio : 20
Partial Ratio : 33
Token sort Ratio : 20
Token set Ratio : 36

'cherries' vs 'Other fresh or chilled potatoes'
Ratio : 36
Partial Ratio : 50
Token sort Ratio : 36
Token set Ratio : 36

'cherries cherry cherry cherry tree cherry cerise cherry ch