### Load the packages

In [2]:
# pip install jellyfish
# pip install fuzzywuzzy
# pip install xgboost
# conda install -c conda-forge python-levenshtein

import time
import re
import pandas as pd
import numpy as np
import jellyfish as jf
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

### Load the data

In [3]:
df = pd.read_csv('./00_data/LAUs_match.csv')
df = df.rename (columns = {'errors':'internal_name', 'correct':'external_name'})
df.head()

Unnamed: 0,internal_name,external_name,category,match
0,Massa di Faicchio,Faicchio,LAU,1
1,San Michele di Pratola Serra,Pratola Serra,LAU,1
2,Frasso Teleseino,Frasso Telesino,LAU,1
3,Forio d’Ischia,Forio,LAU,1
4,Macchia di Montecorvino Rovella,Montecorvino Rovella,LAU,1


### Feature engineering

In [4]:
def matching_numbers(external_name, internal_name):

    external_numbers = set(re.findall(r'[0-9]+', external_name))
    internal_numbers = set(re.findall(r'[0-9]+', internal_name))    
    union = external_numbers.union(internal_numbers)
    intersection = external_numbers.intersection(internal_numbers)

    if len(external_numbers)==0 and len(internal_numbers) == 0:
        return 1
    else:
        return (len(intersection)/ len(union))

In [5]:
def engineer_features(df):

    df['internal_name'] = df['internal_name'].str.lower()
    df['external_name'] = df['external_name'].str.lower()

    df['levenshtein_distance'] = df.apply(
    lambda x: jf.levenshtein_distance(x['external_name'], 
                                      x['internal_name']), axis=1)

    df['damerau_levenshtein_distance'] = df.apply(
    lambda x: jf.damerau_levenshtein_distance(x['external_name'], 
                                              x['internal_name']), axis=1)

    df['hamming_distance'] = df.apply(
    lambda x: jf.hamming_distance(x['external_name'], 
                                  x['internal_name']), axis=1)

    df['jaro_similarity'] = df.apply(
    lambda x: jf.jaro_similarity(x['external_name'], 
                                  x['internal_name']), axis=1)

    df['jaro_winkler_similarity'] = df.apply(
    lambda x: jf.jaro_winkler_similarity(x['external_name'], 
                                         x['internal_name']), axis=1)

    df['match_rating_comparison'] = df.apply(
    lambda x: jf.match_rating_comparison(x['external_name'], 
                                         x['internal_name']), axis=1).fillna(0).astype(int)

    df['ratio'] = df.apply(
    lambda x: fuzz.ratio(x['external_name'], 
                         x['internal_name']), axis=1)

    df['partial_ratio'] = df.apply(
    lambda x: fuzz.partial_ratio(x['external_name'], 
                                 x['internal_name']), axis=1)

    df['token_sort_ratio'] = df.apply(
    lambda x: fuzz.token_sort_ratio(x['external_name'], 
                                    x['internal_name']), axis=1)

    df['token_set_ratio'] = df.apply(
    lambda x: fuzz.token_set_ratio(x['external_name'], 
                                   x['internal_name']), axis=1)

    df['w_ratio'] = df.apply(
    lambda x: fuzz.WRatio(x['external_name'], 
                          x['internal_name']), axis=1)

    df['uq_ratio'] = df.apply(
    lambda x: fuzz.UQRatio(x['external_name'], 
                          x['internal_name']), axis=1)

    df['q_ratio'] = df.apply(
    lambda x: fuzz.QRatio(x['external_name'], 
                          x['internal_name']), axis=1)    

    df['matching_numbers'] = df.apply(
    lambda x: matching_numbers(x['external_name'], 
                               x['internal_name']), axis=1)

    df['matching_numbers_log'] = (df['matching_numbers']+1).apply(np.log)

    df['log_fuzz_score'] = (df['ratio'] + df['partial_ratio'] + 
                            df['token_sort_ratio'] + df['token_set_ratio']).apply(np.log)

    df['log_fuzz_score_numbers'] = df['log_fuzz_score'] + (df['matching_numbers']).apply(np.log)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(value=0, inplace=True)

    return df

In [6]:
df = engineer_features(df)

In [7]:
df

Unnamed: 0,internal_name,external_name,category,match,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,match_rating_comparison,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,massa di faicchio,faicchio,LAU,1,9,9,16,0.517157,0.517157,0,...,100,64,100,90,64,64,1,0.693147,5.793014,5.793014
1,san michele di pratola serra,pratola serra,LAU,1,15,15,28,0.508700,0.508700,1,...,100,63,100,90,63,63,1,0.693147,5.786897,5.786897
2,frasso teleseino,frasso telesino,LAU,1,1,1,4,0.979167,0.987500,1,...,93,97,97,97,97,97,1,0.693147,5.950643,5.950643
3,forio d’ischia,forio,LAU,1,9,9,9,0.785714,0.871429,0,...,100,53,100,90,53,53,1,0.693147,5.723585,5.723585
4,macchia di montecorvino rovella,montecorvino rovella,LAU,1,11,11,29,0.731720,0.758548,1,...,100,78,100,90,78,78,1,0.693147,5.874931,5.874931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,tufo,tufo,LAU,1,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1,0.693147,5.991465,5.991465
164,vallo della lucania,vallo della lucania,LAU,1,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1,0.693147,5.991465,5.991465
165,venticano,venticano,LAU,1,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1,0.693147,5.991465,5.991465
166,vietri sul mare,vietri sul mare,LAU,1,0,0,0,1.000000,1.000000,1,...,100,100,100,100,100,100,1,0.693147,5.991465,5.991465


### Examine correlations

In [8]:
df[df.columns[1:]].corr()['match'][:].sort_values(ascending=False)

match                           1.000000
partial_ratio                   0.820909
w_ratio                         0.796542
token_set_ratio                 0.770365
log_fuzz_score                  0.713695
log_fuzz_score_numbers          0.713695
token_sort_ratio                0.648725
uq_ratio                        0.564983
q_ratio                         0.564983
ratio                           0.561692
jaro_similarity                 0.492898
jaro_winkler_similarity         0.434090
match_rating_comparison         0.263574
hamming_distance               -0.458895
levenshtein_distance           -0.507955
damerau_levenshtein_distance   -0.507955
matching_numbers                     NaN
matching_numbers_log                 NaN
Name: match, dtype: float64

### Create test and train data

In [9]:
X = df[['levenshtein_distance', 'damerau_levenshtein_distance', 'hamming_distance',
       'jaro_similarity','jaro_winkler_similarity','matching_numbers_log',
       'matching_numbers','token_set_ratio','token_sort_ratio','partial_ratio',
       'ratio','log_fuzz_score','log_fuzz_score_numbers','match_rating_comparison',
       'q_ratio','uq_ratio','w_ratio']].values
y = df['match'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [10]:
print (X_train.shape, X_test.shape)

(117, 17) (51, 17)


### Select the model

In [11]:
def get_confusion_matrix_values(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    return(cm[0][0], cm[0][1], cm[1][0], cm[1][1])

classifiers = {
    "DummyClassifier_stratified":DummyClassifier(strategy='stratified', random_state=0),    
    "KNeighborsClassifier":KNeighborsClassifier(3),
    "XGBClassifier":XGBClassifier(n_estimators=1000, learning_rate=0.1),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "Perceptron": Perceptron(max_iter=40, eta0=0.1, random_state=1),
    "MLP": MLPClassifier(),
    "XGBClassifer tuned": XGBClassifier(colsample_bytree=0.8,
                      gamma=0.9,
                      max_depth=20,
                      min_child_weight=1,
                      scale_pos_weight=12,
                      subsample=0.9,
                      n_estimators=50, 
                      learning_rate=0.1)
}

df_results = pd.DataFrame(columns=['model', 'accuracy', 'mae', 'precision', 'recall','f1','roc','run_time','tp','fp', 'tn','fn'], dtype=object)

for key in classifiers:

    start_time = time.time()
    classifier = classifiers[key]
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_pred)
    classification = classification_report(y_test, y_pred, zero_division=0)
    run_time = format(round((time.time() - start_time)/60,2))
    tp, fp, fn, tn = get_confusion_matrix_values(y_test, y_pred)

    row = {'model': key,
           'accuracy': accuracy,
           'mae': mae,
           'precision': precision,
           'recall': recall,
           'f1': f1,
           'roc': roc,
           'run_time': run_time,
           'tp': tp,
           'fp': fp,
           'tn': tn,
           'fn': fn,
          }
    df_results = df_results.append(row, ignore_index=True)

df_results.head(10)



Unnamed: 0,model,accuracy,mae,precision,recall,f1,roc,run_time,tp,fp,tn,fn
0,DummyClassifier_stratified,0.843137,0.156863,0.914894,0.914894,0.914894,0.457447,0.0,0,4,43,4
1,KNeighborsClassifier,0.980392,0.019608,1.0,0.978723,0.989247,0.989362,0.0,4,0,46,1
2,XGBClassifier,0.941176,0.058824,0.958333,0.978723,0.968421,0.739362,0.01,2,2,46,1
3,DecisionTreeClassifier,0.960784,0.039216,0.978723,0.978723,0.978723,0.864362,0.0,3,1,46,1
4,RandomForestClassifier,0.960784,0.039216,0.978723,0.978723,0.978723,0.864362,0.0,3,1,46,1
5,AdaBoostClassifier,0.960784,0.039216,0.978723,0.978723,0.978723,0.864362,0.0,3,1,46,1
6,GradientBoostingClassifier,0.960784,0.039216,0.978723,0.978723,0.978723,0.864362,0.0,3,1,46,1
7,Perceptron,0.921569,0.078431,0.921569,1.0,0.959184,0.5,0.0,0,4,47,0
8,MLP,0.941176,0.058824,0.94,1.0,0.969072,0.625,0.0,1,3,47,0
9,XGBClassifer tuned,0.921569,0.078431,0.921569,1.0,0.959184,0.5,0.0,0,4,47,0


### Select the best model and Fit

In [12]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [13]:
y_pred = model.predict(X_test)

### Assess model performance

In [14]:
print(classification_report(y_test, y_pred, labels=[1, 0], 
                            target_names=['match', 'not match']))

              precision    recall  f1-score   support

       match       1.00      0.98      0.99        47
   not match       0.80      1.00      0.89         4

    accuracy                           0.98        51
   macro avg       0.90      0.99      0.94        51
weighted avg       0.98      0.98      0.98        51



In [15]:
results = pd.DataFrame(data={'predictions': y_pred, 'actual': y_test})
results['result'] = np.where(results['predictions']==results['actual'], 1, 0)
results.head(20)

Unnamed: 0,predictions,actual,result
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,1
4,1,1,1
5,1,1,1
6,1,1,1
7,1,1,1
8,1,1,1
9,0,0,1


### Generate new predictions

In [16]:
def get_closest_matches(external_name):

    unique_internal_names = df['internal_name'].unique().tolist()
    closest_matches = process.extract(external_name, 
                  unique_internal_names, 
                  scorer=fuzz.token_set_ratio)

    return closest_matches

### Preprocessing the data

In [17]:
def prepare_data(external_name):

    closest_matches = get_closest_matches(external_name)

    df = pd.DataFrame(columns=['external_name', 'internal_name'])

    for match in closest_matches:
        row = {'external_name': external_name, 'internal_name': match[0]}
        df = df.append(row, ignore_index=True)

    return df

In [18]:
closest_data = prepare_data("Aurunca")
closest_data.head()

Unnamed: 0,external_name,internal_name
0,Aurunca,s.carlo di sessa aurunca
1,Aurunca,piedimonte di sessa aurunca
2,Aurunca,sessa aurunca
3,Aurunca,aquara
4,Aurunca,taurasi


In [19]:
data = engineer_features(closest_data)
data = data[['levenshtein_distance', 'damerau_levenshtein_distance', 'hamming_distance',
       'jaro_similarity','jaro_winkler_similarity','matching_numbers_log',
       'matching_numbers','token_set_ratio','token_sort_ratio','partial_ratio',
       'ratio','log_fuzz_score','log_fuzz_score_numbers','match_rating_comparison',
       'q_ratio','uq_ratio','w_ratio']]

In [20]:
data.head()

Unnamed: 0,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,matching_numbers_log,matching_numbers,token_set_ratio,token_sort_ratio,partial_ratio,ratio,log_fuzz_score,log_fuzz_score_numbers,match_rating_comparison,q_ratio,uq_ratio,w_ratio
0,17,17,24,0.496032,0.496032,0.693147,1,100,45,100,45,5.669881,5.669881,1,45,45,90
1,20,20,27,0.453263,0.453263,0.693147,1,100,41,100,41,5.641907,5.641907,1,41,41,90
2,6,6,12,0.442002,0.442002,0.693147,1,100,70,100,70,5.828946,5.828946,1,70,70,90
3,4,4,6,0.746032,0.771429,0.693147,1,62,62,50,62,5.463832,5.463832,0,62,62,62
4,5,5,7,0.714286,0.714286,0.693147,1,57,57,62,57,5.451038,5.451038,0,57,57,57


### Generate the predictions

In [21]:
y_pred = model.predict_proba(data)[:,1]

In [22]:
data = data.assign(prediction=y_pred)
data = data.merge(closest_data)
data[['external_name','internal_name','prediction']].head()

Unnamed: 0,external_name,internal_name,prediction
0,aurunca,s.carlo di sessa aurunca,1.0
1,aurunca,piedimonte di sessa aurunca,1.0
2,aurunca,sessa aurunca,1.0
3,aurunca,aquara,0.333333
4,aurunca,taurasi,0.333333
