### Goal: identify any relationship between Levenshtein distance and the distance between image vectors

* I wanted to see whether a shorter distance between two molecules' image vectors correlates with a lower Levenshtein distance between the molecules' InChI strings, and vice versa.
* I've checked several metrics used to measure the distance between vectors: Euclidean, cosine, Minkowski, Dice, etc.

See results on a small set below


In [None]:
from __future__ import absolute_import
from __future__ import print_function

import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import Levenshtein
import cv2
from PIL import Image
from matplotlib import pyplot as plt
import seaborn as sns
from numpy import save, load
import warnings
warnings.filterwarnings('ignore')

from scipy.spatial import distance
from sklearn.decomposition import PCA

import tensorflow as tf
from tensorflow.python.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

from scipy import spatial

# Show what is available under ../input, the directory the label CSV and the
# cached feature file are read from further down.
print(os.listdir("../input"))

In [None]:
# Levenshtein scoring function

def get_score(y_true, y_pred):
    """Return the mean Levenshtein distance between paired strings.

    Args:
        y_true: One reference string, or an iterable of reference strings.
        y_pred: One predicted string, or an iterable of predicted strings,
            paired positionally with ``y_true``.

    Returns:
        The mean edit distance over all pairs. For two plain strings this is
        simply their edit distance. Pairing uses ``zip``, so extra items in
        the longer iterable are ignored; with no pairs at all the result is
        NaN (``np.mean`` of an empty list).
    """
    # A bare string is itself iterable, so zip() would pair the two strings
    # character by character and score single letters. Treat a string pair
    # as exactly one (true, pred) pair instead.
    if isinstance(y_true, str) and isinstance(y_pred, str):
        y_true, y_pred = [y_true], [y_pred]

    scores = [Levenshtein.distance(true, pred) for true, pred in zip(y_true, y_pred)]
    return np.mean(scores)

# Sanity check: for a single pair of strings, get_score should agree with a
# direct Levenshtein.distance call.
print(get_score('alex', 'axel'))
print(Levenshtein.distance('alex', 'axel'))

# Data

In [None]:
nrows = 1100

# Only the image location and its InChI label are used downstream.
wanted_cols = ['image_path', 'InChI']
rawDF = pd.read_csv(
    '../input/bms-arranged-label/arranged_bms_train_labels.csv',
    index_col=[0],
    nrows=nrows,
)
labelled = rawDF[wanted_cols]

# The first 1000 rows become the reference set whose feature vectors are
# searched; whatever remains (up to nrows) is held out as query images.
play = labelled[:1000]
playTest = labelled[1000:]
print(play.shape)
play.head()

In [None]:
# Renumber the held-out rows from 0 so they can be addressed as
# playTest.<column>[i] for i in range(playTest.shape[0]).
playTest = playTest.reset_index(drop=True)
print(playTest.shape)
playTest.head()

# Feature extraction ... from images to vectors

In [None]:
%%time

# Model predict = feat extract from images going thru VGG16
# 10k images = 1Gb features.npy ... 7:30 mins prep feat ... 9:50 mins predict ... average on test Lev = 78.4

model = VGG16(weights='imagenet', include_top=False)

features = []

for row in tqdm(range(play.shape[0])):
    img_path = play.image_path[row]
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    
    #features.append(model.predict(x).squeeze())
    features.append(model.predict(x).ravel())

features = np.array(features)
features.shape

### Save / Load features npy

# SAVE: persist the feature matrix so the extraction step need not be rerun
np.save('features.npy', features)

%%time

#LOAD
features = load('../input/bms-euclidean-to-levenshtein/features.npy')
features.shape

# Predict = find the nearest image vector

In [None]:
%%time

# Predict ONE img w Euclidean np.linalg.norm

imgNum = 23

model = VGG16(weights='imagenet', include_top=False)

img_path = playTest.image_path[imgNum]
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)

#featOne = model.predict(x)
featOne = model.predict(x).ravel()

# Find the min Euclidean distance to one image vector

MyDist = []

for v in range(features.shape[0]):
    #Dist = np.linalg.norm(features[v]-featOne)
    Dist = spatial.distance.euclidean(features[v],featOne)   
    MyDist.append(Dist)
    
MyDist=np.array(MyDist)  

# When the image we're searching with is in the features already - take the second smallest dist
# as 0 is the first arg = same vector we search and dist = 0
#nearVec = MyDist.argsort()[1] 
#vecDis = np.amin(MyDist[MyDist != np.amin(MyDist)]) # for picking the second smallest

nearVec = MyDist.argmin()
vecDis = MyDist.min()
levDis = Levenshtein.distance(playTest.InChI[imgNum], play.InChI[nearVec])


print('get_score ', get_score(playTest.InChI[imgNum], play.InChI[nearVec]))
print('levDis ',levDis)
print('vecDis ',vecDis)

In [None]:
# Quick look at the held-out query rows: (rows, columns), then the first records.
print(playTest.shape)
playTest.head()

In [None]:
%%time

# Predict on many img
model = VGG16(weights='imagenet', include_top=False)
realInchi = []
predInchi = []

LevDist = []
VecDist = []

for imgNum in tqdm(range(playTest.shape[0])):

    img_path = playTest.image_path[imgNum]
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    #featOne = model.predict(x)
    featOne = model.predict(x).ravel()

    # Find the min Euclidean distance to one image vector
    MyDist = []
    for v in range(features.shape[0]):
        
        #Dist = np.linalg.norm(features[v]-featOne)
        #Dist = spatial.distance.cosine(features[v],featOne) 
        #Dist = spatial.distance.braycurtis(features[v],featOne) 
        #Dist = spatial.distance.canberra(features[v],featOne) 
        #Dist = spatial.distance.chebyshev(features[v],featOne) # Takes 10 mins for 100 samples...
        #Dist = spatial.distance.cityblock(features[v],featOne)
        #Dist = spatial.distance.correlation(features[v],featOne)
        #Dist = spatial.distance.jensenshannon(features[v],featOne)
        #Dist = spatial.distance.minkowski(features[v],featOne)
        #Dist = spatial.distance.dice(features[v],featOne)
        #Dist = spatial.distance.hamming(features[v],featOne)
        #Dist = spatial.distance.jaccard(features[v],featOne)
        Dist = spatial.distance.kulsinski(features[v],featOne)
        #Dist = spatial.distance.rogerstanimoto(features[v],featOne)
        #Dist = spatial.distance.russellrao(features[v],featOne)
        #Dist = spatial.distance.sokalmichener(features[v],featOne)
        #Dist = spatial.distance.sokalsneath(features[v],featOne)
        #Dist = spatial.distance.yule(features[v],featOne)
        
        MyDist.append(Dist)
    MyDist=np.array(MyDist)  
    nearVec = MyDist.argmin()
    vecDis = MyDist.min()

    realInchi.append(playTest.InChI[imgNum])
    predInchi.append(play.InChI[nearVec])

    levDis = Levenshtein.distance(playTest.InChI[imgNum], play.InChI[nearVec])
    LevDist.append(levDis)
    VecDist.append(vecDis)


print('Average Lev dist ',get_score(realInchi, predInchi))


In [None]:
# Feature-space distance (x) against InChI edit distance (y), one point per
# held-out image.
ax = plt.gca()
ax.scatter(VecDist, LevDist)
ax.set_xlabel("Dist between Vectors ")
ax.set_ylabel("Dist between InChI - Levenshtein ")
ax.set_title("Relationship between Levenshtein and Vector distances")

# Average Levenshtein distance on the test set for each metric:

* vectors (7,7,512) with np.linalg.norm Lev = 83.1
* vectors flattened with np.linalg.norm Lev = 83.1
* scipy spatial distance euclidean Lev = 83.1
* cosine Lev = 79.4
* braycurtis ... 79.2
* canberra ... 81.9
* cityblock ... 84.5
* correlation ... 79.4
* jensenshannon ... 78.5 
* minkowski ... 83.1
* dice ... 77.4
* hamming ... 92.4
* jaccard ... 78.7
* **kulsinski ... 75.7**
* rogerstanimoto ... 76.8
* russellrao ... 76.7
* sokalmichener ... 76.8
* sokalsneath ... 77.4
* yule ... 170
