### Load the packages

In [1]:
# pip install jellyfish
# pip install fuzzywuzzy
# pip install xgboost
# conda install -c conda-forge python-levenshtein

import time
import re
import pandas as pd
import numpy as np
import jellyfish as jf
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import jaccard_score



### Load the data

In [2]:
df = pd.read_csv('product_matching_synthetic.csv')
df.head()

Unnamed: 0,external_name,internal_name,category_label,match
0,apple iphone 8 plus 64gb silver,Apple iPhone 8 Plus 64GB,Mobile Phones,1
1,apple iphone 8 plus 64 gb spacegrau,Apple iPhone 8 Plus 64GB,Mobile Phones,1
2,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,Apple iPhone 8 Plus 64GB,Mobile Phones,1
3,apple iphone 8 plus 64gb space grey,Apple iPhone 8 Plus 64GB,Mobile Phones,1
4,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,Apple iPhone 8 Plus 64GB,Mobile Phones,1


### Feature engineering

In [3]:
def matching_numbers(external_name, internal_name):

    external_numbers = set(re.findall(r'[0-9]+', external_name))
    internal_numbers = set(re.findall(r'[0-9]+', internal_name))    
    union = external_numbers.union(internal_numbers)
    intersection = external_numbers.intersection(internal_numbers)

    if len(external_numbers)==0 and len(internal_numbers) == 0:
        return 1
    else:
        return (len(intersection)/ len(union))

In [4]:
def engineer_features(df):

    df['internal_name'] = df['internal_name'].str.lower()
    df['external_name'] = df['external_name'].str.lower()

    df['levenshtein_distance'] = df.apply(
    lambda x: jf.levenshtein_distance(x['external_name'], 
                                      x['internal_name']), axis=1)

    df['damerau_levenshtein_distance'] = df.apply(
    lambda x: jf.damerau_levenshtein_distance(x['external_name'], 
                                              x['internal_name']), axis=1)

    df['hamming_distance'] = df.apply(
    lambda x: jf.hamming_distance(x['external_name'], 
                                  x['internal_name']), axis=1)

    df['jaro_similarity'] = df.apply(
    lambda x: jf.jaro_similarity(x['external_name'], 
                                  x['internal_name']), axis=1)

    df['jaro_winkler_similarity'] = df.apply(
    lambda x: jf.jaro_winkler_similarity(x['external_name'], 
                                         x['internal_name']), axis=1)

    df['match_rating_comparison'] = df.apply(
    lambda x: jf.match_rating_comparison(x['external_name'], 
                                         x['internal_name']), axis=1).fillna(0).astype(int)

    df['ratio'] = df.apply(
    lambda x: fuzz.ratio(x['external_name'], 
                         x['internal_name']), axis=1)

    df['partial_ratio'] = df.apply(
    lambda x: fuzz.partial_ratio(x['external_name'], 
                                 x['internal_name']), axis=1)

    df['token_sort_ratio'] = df.apply(
    lambda x: fuzz.token_sort_ratio(x['external_name'], 
                                    x['internal_name']), axis=1)

    df['token_set_ratio'] = df.apply(
    lambda x: fuzz.token_set_ratio(x['external_name'], 
                                   x['internal_name']), axis=1)

    df['w_ratio'] = df.apply(
    lambda x: fuzz.WRatio(x['external_name'], 
                          x['internal_name']), axis=1)

    df['uq_ratio'] = df.apply(
    lambda x: fuzz.UQRatio(x['external_name'], 
                          x['internal_name']), axis=1)

    df['q_ratio'] = df.apply(
    lambda x: fuzz.QRatio(x['external_name'], 
                          x['internal_name']), axis=1)    

    df['matching_numbers'] = df.apply(
    lambda x: matching_numbers(x['external_name'], 
                               x['internal_name']), axis=1)

    df['matching_numbers_log'] = (df['matching_numbers']+1).apply(np.log)

    df['log_fuzz_score'] = (df['ratio'] + df['partial_ratio'] + 
                            df['token_sort_ratio'] + df['token_set_ratio']).apply(np.log)

    df['log_fuzz_score_numbers'] = df['log_fuzz_score'] + (df['matching_numbers']).apply(np.log)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(value=0, inplace=True)

    return df

In [5]:
df = engineer_features(df)

In [6]:
df

Unnamed: 0,external_name,internal_name,category_label,match,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,match_rating_comparison,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,apple iphone 8 plus 64gb silver,apple iphone 8 plus 64gb,Mobile Phones,1,7,7,7,0.924731,0.954839,1,...,100,87,100,95,87,87,1.0,0.693147,5.924256,5.924256
1,apple iphone 8 plus 64 gb spacegrau,apple iphone 8 plus 64gb,Mobile Phones,1,11,11,13,0.895238,0.937143,1,...,96,75,88,84,81,81,1.0,0.693147,5.828946,5.828946
2,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,apple iphone 8 plus 64gb,Mobile Phones,1,46,46,63,0.655952,0.655952,1,...,83,51,100,86,51,51,0.4,0.336472,5.652489,4.736198
3,apple iphone 8 plus 64gb space grey,apple iphone 8 plus 64gb,Mobile Phones,1,11,11,11,0.895238,0.937143,1,...,100,81,100,95,81,81,1.0,0.693147,5.891644,5.891644
4,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,apple iphone 8 plus 64gb,Mobile Phones,1,30,30,34,0.800926,0.880556,1,...,88,62,100,86,62,62,0.5,0.405465,5.743003,5.049856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388416,smeg fab28 60cm retro style right hand hinge f...,beko lx5053s integrated,Fridges,0,57,57,68,0.551530,0.551530,0,...,35,15,15,37,13,13,0.0,0.000000,4.356709,0.000000
388417,smeg fab28 60cm retro style left hand hinge fr...,lec r5511w white,Fridges,0,57,57,65,0.458838,0.458838,0,...,38,15,15,34,20,20,0.0,0.000000,4.477337,0.000000
388418,smeg fab28 60cm retro style left hand hinge fr...,bosch kir81af30g white,Fridges,0,54,54,66,0.547465,0.547465,0,...,27,11,11,31,13,13,0.0,0.000000,4.127134,0.000000
388419,candy 60cm built under larder fridge cru160nek,russell hobbs rhucf55b black,Fridges,0,39,39,44,0.493918,0.493918,0,...,22,24,24,25,19,19,0.0,0.000000,4.488636,0.000000


### Examine correlations

In [7]:
df[df.columns[1:]].corr()['match'][:].sort_values(ascending=False)

match                           1.000000
matching_numbers                0.780947
matching_numbers_log            0.780905
token_set_ratio                 0.721434
log_fuzz_score_numbers          0.710926
partial_ratio                   0.663716
jaro_winkler_similarity         0.647494
ratio                           0.597708
jaro_similarity                 0.596322
q_ratio                         0.594323
uq_ratio                        0.594312
token_sort_ratio                0.593347
log_fuzz_score                  0.585893
match_rating_comparison         0.477818
w_ratio                         0.470886
hamming_distance               -0.143411
damerau_levenshtein_distance   -0.145683
levenshtein_distance           -0.145896
Name: match, dtype: float64

### Create test and train data

In [8]:
X = df[['levenshtein_distance', 'damerau_levenshtein_distance', 'hamming_distance',
       'jaro_similarity','jaro_winkler_similarity','matching_numbers_log',
       'matching_numbers','token_set_ratio','token_sort_ratio','partial_ratio',
       'ratio','log_fuzz_score','log_fuzz_score_numbers','match_rating_comparison',
       'q_ratio','uq_ratio','w_ratio']].values
y = df['match'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [9]:
print (X_train.shape, X_test.shape)

(271894, 17) (116527, 17)


### Select the model

In [10]:
def get_confusion_matrix_values(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    return(cm[0][0], cm[0][1], cm[1][0], cm[1][1])

classifiers = {
    "DummyClassifier_stratified":DummyClassifier(strategy='stratified', random_state=0),    
    "KNeighborsClassifier":KNeighborsClassifier(3),
    "XGBClassifier":XGBClassifier(n_estimators=1000, learning_rate=0.1),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "Perceptron": Perceptron(max_iter=40, eta0=0.1, random_state=1),
    "MLP": MLPClassifier(),
    "XGBClassifer tuned": XGBClassifier(colsample_bytree=0.8,
                      gamma=0.9,
                      max_depth=20,
                      min_child_weight=1,
                      scale_pos_weight=12,
                      subsample=0.9,
                      n_estimators=50, 
                      learning_rate=0.1)
}

df_results = pd.DataFrame(columns=['model', 'accuracy', 'mae', 'precision', 'recall','f1','roc','run_time','tp','fp', 'tn','fn'], dtype=object)

for key in classifiers:

    start_time = time.time()
    classifier = classifiers[key]
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_pred)
    classification = classification_report(y_test, y_pred, zero_division=0)
    run_time = format(round((time.time() - start_time)/60,2))
    tp, fp, fn, tn = get_confusion_matrix_values(y_test, y_pred)

    row = {'model': key,
           'accuracy': accuracy,
           'mae': mae,
           'precision': precision,
           'recall': recall,
           'f1': f1,
           'roc': roc,
           'run_time': run_time,
           'tp': tp,
           'fp': fp,
           'tn': tn,
           'fn': fn,
          }
    df_results = df_results.append(row, ignore_index=True)

df_results.head(10)









Unnamed: 0,model,accuracy,mae,precision,recall,f1,roc,run_time,tp,fp,tn,fn
0,DummyClassifier_stratified,0.831713,0.168287,0.090615,0.091459,0.091036,0.499152,0.0,95935,9855,982,9755
1,KNeighborsClassifier,0.980279,0.019721,0.907248,0.875477,0.89108,0.933197,8.96,104829,961,9400,1337
2,XGBClassifier,0.986861,0.013139,0.954124,0.900717,0.926652,0.948161,1.58,105325,465,9671,1066
3,DecisionTreeClassifier,0.979361,0.020639,0.884601,0.892428,0.888497,0.940306,0.07,104540,1250,9582,1155
4,RandomForestClassifier,0.987162,0.012838,0.952148,0.906212,0.928612,0.950795,1.37,105301,489,9730,1007
5,AdaBoostClassifier,0.980623,0.019377,0.938645,0.844929,0.889325,0.919662,0.27,105197,593,9072,1665
6,GradientBoostingClassifier,0.983206,0.016794,0.945776,0.867468,0.904931,0.93121,1.22,105256,534,9314,1423
7,Perceptron,0.963416,0.036584,0.752614,0.898202,0.818989,0.934119,0.02,102620,3170,9644,1093
8,MLP,0.983763,0.016237,0.940883,0.879016,0.908898,0.936706,2.85,105197,593,9438,1299
9,XGBClassifer tuned,0.985368,0.014632,0.913629,0.92903,0.921265,0.960058,0.33,104847,943,9975,762


### Select and tune the best model

In [11]:
def get_scale_pos_weight(target, square_root=False, gridsearch=False):
    """Return the scale_pos_weight parameter for the XGBoost model when data are imbalanced.
    The scale_pos_weight parameter is calculated from the ratio of the negative class over
    the positive class. The exact scale_pos_weight sometimes does not give the best result,
    so by passing the gridsearch=True parameter you can return a list of values to test with
    GridSearchCV. In addition, passing square_root=True changes the scale_pos_weight to the
    square root value, which can sometimes be beneficial on extremely imbalanced data.

    :param target: Pandas dataframe column containing the binary target
    :param square_root: Optional boolean parameter to convert to square root on extremely unbalanced data
    :param gridsearch: Optional boolean parameter to return a bracketed list for use in GridSearchCV

    Usage:
        scale_pos_weight = get_scale_pos_weight(df['target'], square_root=False, gridsearch=True)

    """

    import math

    scale_pos_weight = round((len(target) - sum(target)) / sum(target))

    if square_root:
        scale_pos_weight = round(math.sqrt(scale_pos_weight))

    if gridsearch:
        scale_pos_weight = [scale_pos_weight-2, scale_pos_weight-1, scale_pos_weight, 
                            scale_pos_weight+1, scale_pos_weight+2]

    return scale_pos_weight

In [12]:
scale_pos_weight = get_scale_pos_weight(df['match'], square_root=False, gridsearch=True)
scale_pos_weight

[8, 9, 10, 11, 12]

In [13]:
n_estimators = [50]
learning_rate = [0.1]
max_depth = [5, 10, 20]
min_child_weight = [1, 2]
scale_pos_weight = [8, 9, 10, 11, 12]
gamma = [0.9, 1.0]
subsample = [0.9]
colsample_bytree = [0.8, 1.0]

start = time.perf_counter()

param_grid = dict(
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                min_child_weight=min_child_weight,
                scale_pos_weight=scale_pos_weight,
                gamma=gamma,
                subsample=subsample,
                colsample_bytree=colsample_bytree,
)

model = XGBClassifier(random_state=0)

grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           )

print('Running GridSearchCV...')
best_model = grid_search.fit(X_train, y_train)
best_score = round(best_model.score(X_test, y_test), 4)
best_params = best_model.best_params_

print('Score:', best_score)
print('Optimum parameters', best_params)

finish = time.perf_counter()
run_time = (finish - start / 60)
print(f"Completed task in {run_time:0.4f} minutes")

Running GridSearchCV...




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































Score: 0.9899
Optimum parameters {'colsample_bytree': 0.8, 'gamma': 0.9, 'learning_rate': 0.1, 'max_depth': 20, 'min_child_weight': 1, 'n_estimators': 50, 'scale_pos_weight': 10, 'subsample': 0.9}
Completed task in 7911.8102 minutes


### Fit selected model

In [14]:
model = XGBClassifier(colsample_bytree=0.8,
                      gamma=0.9,
                      max_depth=20,
                      min_child_weight=1,
                      scale_pos_weight=12,
                      subsample=0.9,
                      n_estimators=50, 
                      learning_rate=0.1)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)





### Assess model performance

In [15]:
print(classification_report(y_test, y_pred, labels=[1, 0], 
                            target_names=['match', 'not match']))

              precision    recall  f1-score   support

       match       0.91      0.93      0.92     10737
   not match       0.99      0.99      0.99    105790

    accuracy                           0.99    116527
   macro avg       0.95      0.96      0.96    116527
weighted avg       0.99      0.99      0.99    116527



In [16]:
results = pd.DataFrame(data={'predictions': y_pred, 'actual': y_test})
results['result'] = np.where(results['predictions']==results['actual'], 1, 0)
results.head(20)

Unnamed: 0,predictions,actual,result
0,0,0,1
1,0,0,1
2,1,1,1
3,0,0,1
4,0,0,1
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,0,1


### Generate new predictions

In [17]:
def get_closest_matches(external_name):

    unique_internal_names = df['internal_name'].unique().tolist()
    closest_matches = process.extract(external_name, 
                  unique_internal_names, 
                  scorer=fuzz.token_set_ratio)

    return closest_matches

### Preprocessing the data

In [18]:
def prepare_data(external_name):

    closest_matches = get_closest_matches(external_name)

    df = pd.DataFrame(columns=['external_name', 'internal_name'])

    for match in closest_matches:
        row = {'external_name': external_name, 'internal_name': match[0]}
        df = df.append(row, ignore_index=True)

    return df

In [19]:
closest_data = prepare_data("apple iphone x")
closest_data.head()

Unnamed: 0,external_name,internal_name
0,apple iphone x,apple iphone x 64gb
1,apple iphone x,apple iphone x 256gb
2,apple iphone x,apple iphone 8 plus 64gb
3,apple iphone x,apple iphone 7 plus 32gb
4,apple iphone x,apple iphone 7 32gb


In [20]:
data = engineer_features(closest_data)
data = data[['levenshtein_distance', 'damerau_levenshtein_distance', 'hamming_distance',
       'jaro_similarity','jaro_winkler_similarity','matching_numbers_log',
       'matching_numbers','token_set_ratio','token_sort_ratio','partial_ratio',
       'ratio','log_fuzz_score','log_fuzz_score_numbers','match_rating_comparison',
       'q_ratio','uq_ratio','w_ratio']]

In [21]:
data.head()

Unnamed: 0,levenshtein_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,jaro_winkler_similarity,matching_numbers_log,matching_numbers,token_set_ratio,token_sort_ratio,partial_ratio,ratio,log_fuzz_score,log_fuzz_score_numbers,match_rating_comparison,q_ratio,uq_ratio,w_ratio
0,5,5,5,0.912281,0.947368,0.0,0.0,100,85,100,85,5.913503,0.0,1,85,85,95
1,6,6,6,0.9,0.94,0.0,0.0,100,82,100,82,5.897154,0.0,1,82,82,95
2,11,11,11,0.823413,0.894048,0.0,0.0,92,68,93,68,5.771441,0.0,1,68,68,86
3,11,11,11,0.823413,0.894048,0.0,0.0,92,68,93,68,5.771441,0.0,1,68,68,86
4,6,6,6,0.870927,0.922556,0.0,0.0,92,73,93,79,5.820083,0.0,1,79,79,87


### Generate the predictions

In [22]:
y_pred = model.predict_proba(data)[:,1]

In [23]:
data = data.assign(prediction=y_pred)
data = data.merge(closest_data)
data[['external_name','internal_name','prediction']].head()

Unnamed: 0,external_name,internal_name,prediction
0,apple iphone x,apple iphone x 64gb,0.995631
1,apple iphone x,apple iphone x 256gb,0.995631
2,apple iphone x,apple iphone 8 plus 64gb,0.035315
3,apple iphone x,apple iphone 7 plus 32gb,0.035315
4,apple iphone x,apple iphone 8 plus 64gb,0.035315
