In [8]:
# Display the value of every expression statement in a cell, not just the last
# one; later cells rely on this to show several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
# fuzz: pairwise string similarity scores; process: look a query up in a list of choices.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

Fuzzy string matching (or fuzzy searching) is the process of finding strings that approximately match a given pattern. 

**Note** that fuzzy matching returns an approximate result: there is no guarantee that the returned string matches the pattern exactly, although sometimes it does. How close a string is to a given pattern is measured by the edit distance. 

FuzzyWuzzy uses [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) to calculate the edit distance.

Source Links
- https://galaxydatatech.com/2017/12/31/fuzzy-string-matching-pandas-fuzzywuzzy/
- https://medium.com/@rtjeannier/combining-data-sets-with-fuzzy-matching-17efcb510ab2
- https://www.kaggle.com/rtatman/data-cleaning-challenge-inconsistent-data-entry

#### Simple Ratio

In [8]:
# Score the two complete strings against each other; they differ only by the trailing "!".
fuzz.ratio("Hello world", "Hello world!")

96

#### Partial Ratio

In [10]:
# Same pair as the simple-ratio example, scored with partial_ratio instead:
# "Hello world" occurs verbatim inside "Hello world!".
fuzz.partial_ratio("Hello world", "Hello world!")

100

#### Process 
**Extract and ExtractOne**

In [6]:
# Candidate sentences that the query is searched in.
choices = [
    "Hello world is an introductory phrase in programming",
    "Game of throne is a movie",
    "Albert Einstein developed the theory of relativity",
    "Asia is the largest continent in the world",
]

# The two best-scoring candidates, as (choice, score) pairs.
process.extract("Hello world", choices, limit=2)
# Only the single best-scoring candidate.
process.extractOne("Hello world", choices)

[('Hello world is an introductory phrase in programming', 90),
 ('Asia is the largest continent in the world', 86)]

('Hello world is an introductory phrase in programming', 90)

In [16]:
# Team names that free-text queries are matched against.
choices = [
    "Atlanta Falcons",
    "New York Jets",
    "New York Giants",
    "Dallas Cowboys",
]

# Two best candidates for a lower-cased full name.
process.extract("new york jets", choices, limit=2)

# Single best candidate: first for a partial name, then for the full name.
process.extractOne("cowboys", choices)
process.extractOne("new york jets", choices)

[('New York Jets', 100), ('New York Giants', 79)]

('Dallas Cowboys', 90)

('New York Jets', 100)

#### Working with Pandas

In [27]:
# Reference table with the correctly spelled country names.
# Rows with a missing name are kept as-is (no dropna is applied here).
correct_names = pd.read_csv("country-names.csv", encoding="ISO-8859-1")
correct_names.head()

Unnamed: 0,name
0,Åland Islands
1,Albania
2,Andorra
3,Austria
4,Belarus


In [33]:
# Table whose 'name' column holds the misspelled country names to be corrected.
wrong_names = pd.read_csv("wrong-country-names.csv", encoding="ISO-8859-1")
wrong_names.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,region-code,sub-region-code
0,Ålend Islends,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,150.0,154.0
1,elbenie,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,150.0,39.0
2,endorre,AD,AND,20,ISO 3166-2:AD,Europe,Southern Europe,150.0,39.0
3,eustrie,AT,AUT,40,ISO 3166-2:AT,Europe,Western Europe,150.0,155.0
4,Belerus,BY,BLR,112,ISO 3166-2:BY,Europe,Eastern Europe,150.0,151.0


In [43]:
# Defining function to get the right score and matches
def match_name(namedf, names_list, min_score=0):
    """Return the entry of ``names_list`` most similar to ``namedf['name']``.

    Similarity is the integer score returned by ``fuzz.ratio``.

    Parameters
    ----------
    namedf : mapping or pandas.Series
        Record whose ``'name'`` entry is the string to look up.
    names_list : iterable
        Candidate names to compare against.
    min_score : int, optional
        Exclusive threshold: a candidate only counts as a match when its
        score is strictly greater than this value. Defaults to 0.

    Returns
    -------
    tuple
        ``(best_name, best_score)``. When several candidates share the best
        score, the first one encountered wins. ``("", -1)`` is returned when
        no candidate qualifies.

    Notes
    -----
    Non-string values (for example the NaN that ``read_csv`` produces for an
    empty cell) cannot be scored, so a non-string ``namedf['name']`` yields
    the "no match" result and non-string candidates are skipped.
    """
    # Sentinel result: empty name and a score of -1 when nothing matches.
    no_match = ("", -1)

    # Look the query up once; it does not change between candidates.
    query = namedf['name']
    if not isinstance(query, str):
        return no_match

    best_name, best_score = no_match
    # Iterating over all names in the correct list
    for candidate in names_list:
        if not isinstance(candidate, str):
            continue
        # Finding fuzzy match score
        score = fuzz.ratio(query, candidate)
        # Keep the candidate only if it clears the threshold and beats the best so far
        if score > min_score and score > best_score:
            best_name, best_score = candidate, score
    return best_name, best_score

In [46]:
# Score every row of wrong_names against the reference names, then split the
# resulting (name, score) pairs into two new columns:
#   most_matching_name -> reference name with the highest score
#   score              -> the ratio score of that best match

wrong_names['most_matching_name'], wrong_names['score'] = zip(
    *wrong_names.apply(
        lambda row: match_name(row, names_list=correct_names['name'], min_score=0),
        axis=1,
    )
)

In [48]:
# Original spelling next to its best match and the match score.
wrong_names.loc[:, ['name', 'most_matching_name', 'score']]

Unnamed: 0,name,most_matching_name,score
0,Ålend Islends,Åland Islands,85
1,elbenie,Albania,57
2,endorre,Andorra,71
3,eustrie,Austria,71
4,Belerus,Belarus,86
5,Belgium,Belgium,100
6,Bosnie end Herzegovine,Bosnia and Herzegovina,86
7,Bulgerie,Bulgaria,75
8,Croetie,Croatia,71
9,Czech Republic,Czech Republic,100
