### Load the packages

In [1]:
# pip install jellyfish
# pip install fuzzywuzzy
# pip install xgboost
# conda install -c conda-forge python-levenshtein

import time
import re
import pandas as pd
import numpy as np
import jellyfish as jf
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import jaccard_score

### Load the data

In [2]:
df = pd.read_csv('./data/aux_municipality2.csv', sep = ",")
df

Unnamed: 0,Mun
0,Capaccio
1,Casale di Carinola
2,Casertavecchia
3,Celsi di Forino
4,Forio dâ€™Ischia
5,Forio d'Ischia
6,Frasso Teleseino
7,Frazione Capitignano
8,Frazione Capitignano\tTramonti
9,"Ischia, Fraz. Campagnano"


In [3]:
dfp = pd.read_csv('./data/lau.csv', sep = ",", encoding='latin1')
dfp

Unnamed: 0,nuts3_code,lau_code,lau_name,pop,area
0,ITC11,1001,Agliè,2548,13146200
1,ITC11,1002,Airasca,3569,15739300
2,ITC11,1003,Ala di Stura,448,46331500
3,ITC11,1004,Albiano d'Ivrea,1650,11731400
4,ITC11,1006,Almese,6448,17875600
...,...,...,...,...,...
7898,ITI45,60087,Vico nel Lazio,2110,45842100
7899,ITI45,60088,Villa Latina,1156,17022600
7900,ITI45,60089,Villa Santa Lucia,2547,17767200
7901,ITI45,60090,Villa Santo Stefano,1669,20098900


In [4]:
#df1 = pd.crosstab(df.Mun, dfp.lau_name)
from itertools import product

comb = list(product(df.Mun, dfp.lau_name))
df1 = pd.DataFrame(data=comb, columns=['internal_name','external_name'])
df1

Unnamed: 0,internal_name,external_name
0,Capaccio,Agliè
1,Capaccio,Airasca
2,Capaccio,Ala di Stura
3,Capaccio,Albiano d'Ivrea
4,Capaccio,Almese
...,...,...
158055,Sant'Agata dei Goti,Vico nel Lazio
158056,Sant'Agata dei Goti,Villa Latina
158057,Sant'Agata dei Goti,Villa Santa Lucia
158058,Sant'Agata dei Goti,Villa Santo Stefano


### Feature engineering

In [5]:
def jaccard_score(external_name, internal_name):

    external_chars = set(re.findall(r'[a-zA-Z0-9]+', external_name))
    internal_chars = set(re.findall(r'[a-zA-Z0-9]+', internal_name)) 
    union = external_chars.union(internal_chars)
    intersection = external_chars.intersection(internal_chars)

    if len(external_chars)==0 and len(internal_chars) == 0:
        return 1
    else:
        return (len(intersection)/ len(union))

In [10]:
def engineer_features(df):

    df['internal_name'] = df['internal_name'].str.lower()
    df['external_name'] = df['external_name'].str.lower()

    #df['levenshtein_distance'] = df.apply(
    #lambda x: jf.levenshtein_distance(x['external_name'], x['internal_name']), axis=1)

    #df['damerau_levenshtein_distance'] = df.apply(
    #lambda x: jf.damerau_levenshtein_distance(x['external_name'], x['internal_name']), axis=1)

    #df['hamming_distance'] = df.apply(
    #lambda x: jf.hamming_distance(x['external_name'], x['internal_name']), axis=1)

    #df['jaro_similarity'] = df.apply(
    #lambda x: jf.jaro_similarity(x['external_name'], x['internal_name']), axis=1)

    #df['jaro_winkler_similarity'] = df.apply(
    #lambda x: jf.jaro_winkler_similarity(x['external_name'], x['internal_name']), axis=1)

    #df['match_rating_comparison'] = df.apply(
    #lambda x: jf.match_rating_comparison(x['external_name'], x['internal_name']), axis=1).fillna(0).astype(int)

    df['jaccard_score'] = df.apply(
    lambda x: jaccard_score(x['external_name'], x['internal_name']), axis=1)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(value=0, inplace=True)

    return df

levenshtein_distance

In [11]:
df1 = engineer_features(df1)

In [12]:
df1.sort_values('levenshtein_distance', ascending=True).drop_duplicates(['internal_name'])

Unnamed: 0,internal_name,external_name,jaccard_score,levenshtein_distance
50992,frasso teleseino,frasso telesino,0.333333,1
106412,pollenatrocchia,pollena trocchia,0.0,1
153766,sant'agata dei goti,sant'agata de' goti,0.6,1
3008,capaccio,calascio,0.0,2
43139,forio d'ischia,barano d'ischia,0.5,4
23495,casertavecchia,civitavecchia,0.0,4
146051,santâ€™angelo allâ€™esca,sant'angelo all'esca,1.0,6
130239,san michele di pratola,san michele di serino,0.6,6
23858,celsi di forino,mombello di torino,0.2,7
14165,casale di carinola,casale di scodosia,0.5,7


jaccard_score

In [7]:
df1 = engineer_features(df1)

In [9]:
df1.sort_values('jaccard_score', ascending=False).drop_duplicates(['internal_name'])

Unnamed: 0,internal_name,external_name,jaccard_score
146051,santâ€™angelo allâ€™esca,sant'angelo all'esca,1.0
114439,s. angelo allâ€™esca,sant'angelo all'esca,0.6
131567,san michele di pratola,san michele di ganzaria,0.6
153766,sant'agata dei goti,sant'agata de' goti,0.6
134430,san michele di pratola serra,chiusa di san michele,0.5
35236,forio dâ€™ischia,barano d'ischia,0.5
3852,capaccio,capaccio paestum,0.5
98359,piedimonte di sessa aurunca,sessa aurunca,0.5
82930,macchia di montecorvino rovella,montecorvino rovella,0.5
14165,casale di carinola,casale di scodosia,0.5
