For details, please visit https://www.kaggle.com/code/jimkaihuang/location-matching-siamese-network.

In [None]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import tensorflow as tf
import time
import spacy
import string
import re

In [None]:
def clean_data(train):
    """Normalise the text columns and split off rows with spam-like addresses.

    For each of ``name``, ``address``, ``country`` and ``categories``:
    punctuation is stripped, missing values become the string ``"nan"``,
    values that are empty (after stripping) or equal to ``"ERROR"`` become
    the string ``"0"``, and the column is converted to pandas ``string`` dtype.

    Note: the cleaned columns are written back into the ``train`` frame that
    was passed in.

    Args:
        train: DataFrame containing at least the four text columns above.

    Returns:
        A ``(clean, noise)`` tuple of DataFrames. ``noise`` holds the rows
        whose address contains one of the spam keywords; ``clean`` holds all
        remaining rows.
    """
    col = ["name", "address", "country", "categories"]
    # Escape the punctuation so that "\", "]", "^" and "-" are treated as
    # literal characters inside the character class.
    punctuation = "[{}]".format(re.escape(string.punctuation))

    for i in col:
        cleaned = train[i].str.replace(punctuation, "", regex=True).fillna("nan")
        # Build the column in one pass instead of chained assignment
        # (`train[i][mask] = 0`), which does not update the frame under
        # pandas Copy-on-Write.
        cleaned = cleaned.mask(cleaned.isin(["", "ERROR"]), "0")
        train[i] = cleaned.astype("string")

    # Addresses containing these keywords are treated as noise because such
    # addresses can hardly be found on a real map.
    is_noise = train["address"].str.contains("高仿|微信|精仿")
    noise = train[is_noise]
    train = train[~is_noise]

    return train, noise


In [None]:
# Module-level spaCy pipeline; embed_test_fun calls it on each text field and reads `.vector`.
nlp=spacy.load("en_core_web_lg")
def embed_test_fun(i, test_data):
    """Assemble the feature row for the record at position ``i``.

    The returned object array holds, in order: the record id, the spaCy
    document vectors of ``name``, ``country`` and ``categories``, and a
    two-element float array ``[latitude, longitude]``.

    Args:
        i: Positional (``iloc``) index of the record in ``test_data``.
        test_data: DataFrame with the columns listed above.

    Returns:
        A 1-D numpy array with ``dtype=object``.
    """
    row = test_data.iloc[i]
    text_columns = ("name", "country", "categories")
    numeric_columns = ["latitude", "longitude"]

    parts = [np.array(row["id"])]
    # One document vector per text field, taken from the module-level pipeline.
    parts.extend(np.array(nlp(row[column]).vector) for column in text_columns)
    parts.append(np.array([float(value) for value in row[numeric_columns]]))

    return np.array(parts, dtype="object")
    
def create_test_date(j, embed_data, w=3):
    """Slice the window of rows around row ``j`` out of ``embed_data``.

    The window spans rows ``j - w`` through ``j + w`` inclusive, clipped to
    the bounds of the array. Column 0 is treated as the id column and the
    remaining columns as features.

    Args:
        j: Row index of the anchor.
        embed_data: 2-D array whose first column holds ids.
        w: Number of neighbours taken on each side of the anchor.

    Returns:
        ``(anchor, features, ID)``: the anchor's feature columns, the feature
        columns of every row in the window, and the ids of those rows.
    """
    lower = j - w if j > w else 0
    upper = min(j + w + 1, embed_data.shape[0])
    window = embed_data[lower:upper]

    return embed_data[j, 1:], window[:, 1:], window[:, 0]

In [None]:
def BaseLine():
    """Build the shared branch of the Siamese network.

    The branch is an identity mapping: each of the four inputs (``name``,
    ``country``, ``categories`` as 300-dimensional vectors and
    ``input_lat_long`` as a 2-dimensional vector) is returned unchanged.

    Returns:
        A ``tf.keras.Model`` with four inputs and four outputs, in the order
        name, country, categories, lat/long.
    """
    # Shapes are written as real tuples: `(300)` is just the int 300, which
    # newer Keras versions refuse to interpret as a shape.
    input_name = tf.keras.Input(shape=(300,), name="name")
    output_name = input_name

    input_country = tf.keras.Input(shape=(300,), name="country")
    output_country = input_country

    input_categories = tf.keras.Input(shape=(300,), name="categories")
    output_categories = input_categories

    input_lat_long = tf.keras.Input(shape=(2,), name="input_lat_long")
    output_lat_long = input_lat_long

    model = tf.keras.Model(
        inputs=[input_name, input_country, input_categories, input_lat_long],
        outputs=[output_name, output_country, output_categories, output_lat_long],
    )
    return model

# Instantiate the shared branch once; it is passed to siamese_model below.
baseline=BaseLine()
baseline.summary()

In [None]:
def siamese_model(baseline):
    """Build the Siamese network that scores an anchor/candidate record pair.

    Both records are passed through the shared ``baseline`` model. For each of
    the four features the element-wise squared difference is reduced to a
    single score, and the four scores are combined into one sigmoid output.

    Args:
        baseline: Keras model taking ``[name, country, categories, lat_long]``
            and returning four tensors in the same order.

    Returns:
        A ``tf.keras.Model`` with eight inputs (the four anchor features
        followed by the four candidate features) and one output named
        ``"score"``.
    """
    # NOTE(review): pretrained weights are loaded into this model right after
    # construction, so the layer creation order below is kept unchanged.
    # Confirm before reordering or inserting layers.

    def difference_square(vec):
        # Penalty function; other choices such as absolute error are possible.
        x, y = vec
        output = (x - y) ** 2
        return output

    # Shapes are written as real tuples: `(300)` is just the int 300, which
    # newer Keras versions refuse to interpret as a shape.
    anchor_name = tf.keras.Input(shape=(300,), name="anchor_name")
    anchor_country = tf.keras.Input(shape=(300,), name="anchor_country")
    anchor_categories = tf.keras.Input(shape=(300,), name="anchor_categories")
    anchor_lat_long = tf.keras.Input(shape=(2,), name="anchor_lat_long")

    input_name = tf.keras.Input(shape=(300,), name="name")
    input_country = tf.keras.Input(shape=(300,), name="country")
    input_categories = tf.keras.Input(shape=(300,), name="categories")
    input_lat_long = tf.keras.Input(shape=(2,), name="lat_long")

    inputs1 = baseline([anchor_name, anchor_country, anchor_categories, anchor_lat_long])
    inputs2 = baseline([input_name, input_country, input_categories, input_lat_long])

    x0 = tf.keras.layers.Lambda(difference_square, name="diff_name")([inputs1[0], inputs2[0]])
    x1 = tf.keras.layers.Lambda(difference_square, name="diff_country")([inputs1[1], inputs2[1]])
    x2 = tf.keras.layers.Lambda(difference_square, name="diff_categories")([inputs1[2], inputs2[2]])
    x3 = tf.keras.layers.Lambda(difference_square, name="diff_lat_long")([inputs1[3], inputs2[3]])

    # The per-feature heads use Dense layers without activation (a linear
    # combination): the layers should respond accurately to the difference
    # between the two records rather than "twist" it.
    x0 = tf.keras.layers.Dense(150)(x0)
    x0 = tf.keras.layers.Dense(1, name="score_name")(x0)

    # tanh bounds this score so the model does not pay too much attention to it.
    x1 = tf.keras.layers.Dense(150)(x1)
    x1 = tf.keras.layers.Dense(1, activation="tanh", name="score_country")(x1)

    x2 = tf.keras.layers.Dense(150)(x2)
    x2 = tf.keras.layers.Dense(1, name="score_categories")(x2)

    # tanh bounds this score as well, for the same reason as the country score.
    x3 = tf.keras.layers.Dense(1)(x3)
    x3 = tf.keras.layers.Dense(1, activation="tanh", name="score_lat_long")(x3)

    x = tf.keras.layers.Concatenate()([x0, x1, x2, x3])
    x = tf.keras.layers.Dense(4)(x)
    # Residual layer: add a ReLU-transformed copy back onto the signal.
    res = tf.keras.layers.Dense(4, activation="relu")(x)
    x = tf.keras.layers.Add()([x, res])
    x = tf.keras.layers.Dense(2)(x)
    # Second residual layer.
    res = tf.keras.layers.Dense(2, activation="relu")(x)
    x = tf.keras.layers.Add()([x, res])
    output = tf.keras.layers.Dense(1, activation="sigmoid", name="score")(x)

    model = tf.keras.Model(
        inputs=[anchor_name, anchor_country, anchor_categories, anchor_lat_long,
                input_name, input_country, input_categories, input_lat_long],
        outputs=output,
    )
    return model

# Build the Siamese network on top of the shared branch and load the stored weights into it.
# `model` is read as a module-level global inside Submission.
model=siamese_model(baseline)
model.load_weights("../input/siamese-network-weights/siamese_model")
model.summary()

In [None]:
# Load the test set and order it by longitude: create_test_date later picks
# candidates from a positional window, so row order decides which records are compared.
test=pd.read_csv("../input/foursquare-location-matching/test.csv")
test=test.sort_values(by="longitude")
# `noise` receives the rows whose address contains one of the spam keywords.
test, noise=clean_data(test)

In [None]:
# Embed every cleaned test row (by position) and stack the rows into one array.
data = np.array([embed_test_fun(row_idx, test) for row_idx in range(test.shape[0])])

In [None]:
# Seed the submission with the noise rows: each one is matched only to itself.
# Iterating the id column directly avoids building a row Series twice per row,
# and an empty `noise` frame simply yields an empty dict.
submission = {noise_id: noise_id for noise_id in noise["id"]}

In [None]:
def Submission(test,submission,W):
    """Score every record against its neighbours and build the submission table.

    For each row ``J`` of ``test`` the window of rows ``J - W .. J + W`` is
    scored against the anchor with the module-level ``model``; every candidate
    scoring above 0.5 is recorded as a match of the anchor. The anchor's own
    id always comes first in its match list.

    Args:
        test: Embedded data array as consumed by ``create_test_date`` (ids in
            column 0, four feature columns after it).
        submission: Dict mapping id -> space-separated matches. It is updated
            in place with one entry per row of ``test``.
        W: Number of neighbours on each side of the anchor to consider.

    Returns:
        DataFrame with columns ``id`` and ``matches``, sorted by ``id``.
    """
    for J in range(test.shape[0]):
        # Use W for the window too; a hard-coded width would disagree with the
        # anchor position computed below whenever W != 3.
        anchor, features, ID = create_test_date(J, test, w=W)
        k = len(ID)
        # Anchor features are repeated k times so they pair with each candidate.
        inputs = [tf.constant([anchor[c]] * k) for c in range(4)]
        inputs += [tf.constant(list(features[:, c])) for c in range(4)]
        scores = model(inputs).numpy().reshape(-1)

        # The window starts at max(0, J - W), so this is the anchor's offset in it.
        anchor_id = str(ID[J - max(0, J - W)])

        matches = [anchor_id]
        for score, candidate in zip(scores, ID):
            candidate_id = str(candidate)
            if score > 0.5 and candidate_id != anchor_id:
                matches.append(candidate_id)
        submission[anchor_id] = " ".join(matches)

    result = pd.DataFrame(list(submission.items()), columns=["id", "matches"])
    result = result.sort_values(by="id").reset_index(drop=True)
    return result

# Run the matching over the embedded test data, report the wall-clock time
# spent, and write the result to disk.
t0 = time.time()
submission = Submission(test=data, submission=submission, W=3)
elapsed = time.time() - t0
print("total spent time:", f"{elapsed:.4f}")

submission.to_csv("submission.csv", index=False)


In [None]:
# Bare expression: shows the submission table as the notebook cell output.
submission

In [None]:
import re

# Scratch experiment: insert a space at every letter/digit boundary.
# The pattern matches the empty position between a digit and a following
# letter, or between a letter and a following digit (case-insensitive).
regex = r"(?i)(?<=\d)(?=[a-z])|(?<=[a-z])(?=\d)"
test_str = "Law Lecture Theatres L1 – 5 G20"
result = re.compile(regex).sub(" ", test_str)

In [None]:
# Scratch experiment: append a space after a lowercase letter that is followed
# by an uppercase one, and after an uppercase letter that is followed by an
# uppercase + lowercase pair. Applied to `result` from the previous cell.
regex = r"([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))"
re.compile(regex).sub(r"\1 ", result)