> # 1. Read and Clean Data

Import necessary modules:

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time
import matplotlib as plt
import string
import re
import spacy
import random

Read and clean train dataset:

In [None]:
#load the competition training set from the Kaggle input directory
train=pd.read_csv("../input/foursquare-location-matching/train.csv")
#ordering by longitude because two close points are very likely to share the same PoI;
#the cells below compare each row with the rows next to it in this order.
train=train.sort_values(by="longitude")

In [None]:
def clean_data(train):
    """Normalise the text and phone columns and split off spam-like rows.

    The frame is modified in place: the ``name``, ``address``, ``country``,
    ``categories`` and ``phone`` columns are overwritten. The two returned
    frames are row subsets of it.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``name``, ``address``, ``country``,
        ``categories`` and ``phone``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(kept, noise)``: the rows whose address does not / does contain one
        of the markers ``高仿``, ``微信`` or ``精仿``.
    """
    #re.escape keeps "\", "]" and "^" literal inside the character class;
    #with the unescaped class "\]" was read as an escape and backslashes were never removed.
    punctuation="[{}]".format(re.escape(string.punctuation))
    for col in ["name","address","country","categories"]:
        #removing punctuation, then filling nan values with the text "nan"
        cleaned=train[col].str.replace(punctuation,"",regex=True).fillna("nan")
        #blank and "ERROR" entries become "0". Series.mask replaces the chained
        #assignment train[col][...]=0, which pandas does not guarantee to write back.
        cleaned=cleaned.mask(cleaned.isin(["","ERROR"]),"0")
        train[col]=cleaned.astype("string")

    phone=[]
    for raw in train["phone"]:
        #reserving only the ASCII digits
        digits=re.sub(r"[^0-9]","",str(raw))
        #missing numbers and numbers longer than 20 digits are treated as unknown
        if digits=="" or len(digits)>20:
            digits="0"
        #padding all up to 20 digits
        phone.append(digits.rjust(20,"0"))
    train["phone"]=phone

    #noise because addresses comprised of these words can hardly be found on a real map.
    is_noise=train["address"].str.contains("高仿|微信|精仿")
    noise=train[is_noise]
    train=train[~is_noise]

    return train, noise

#clean the frame and split it: `train` keeps the usable rows, `noise` holds the spam-like ones
train, noise=clean_data(train)

In [None]:
#the desired data
#report the (rows, columns) shape of the cleaned data and preview its first rows
print("train data shape:",train.shape)
train.head()

In [None]:
#report the (rows, columns) shape of the rows filtered out as noise and preview them
print("noise data shape:",noise.shape)
noise.head()

Embed words:

In [None]:
#load the spaCy pipeline "en_core_web_lg"; embed_fun below uses its document vectors
nlp=spacy.load("en_core_web_lg")
#we use spaCy to embed words (only English is considered in this case).
def embed_fun(i,train_data=train):
    """Build the feature record for the row at position ``i`` of ``train_data``.

    The returned object array holds, in order: the row id, the spaCy vectors
    of ``name``, ``country`` and ``categories``, the ``[latitude, longitude]``
    pair, the phone characters as a column of ints, and the point-of-interest
    id.
    """
    row=train_data.iloc[i]

    record_id=np.array(row["id"])
    #the text columns are embedded as whole-document vectors
    text_vectors=[np.array(nlp(row[column]).vector) for column in ("name","country","categories")]
    coordinates=np.array([float(value) for value in row[["latitude","longitude"]]])
    #one single-element row per phone character
    phone_digits=np.array([[int(digit)] for digit in row["phone"]])
    poi=np.array(row["point_of_interest"])

    pieces=[record_id,*text_vectors,coordinates,phone_digits,poi]
    return np.array(pieces,dtype="object")

In [None]:
#Embedding every row is slow, so the embeddings were computed once and saved to disk;
#later cells load the saved file instead. The block is disabled with `if False`.
if False:
    t0=time.time()

    #embed the first 600000 rows of the sorted frame as the training split
    train_data=[]
    for i in range(600000):
        train_data.append(embed_fun(i,train_data=train))
        #progress report every 20000 rows
        if i>0 and i%20000==0:
            print("train data-%s rows has finished"%i)
            print("spent time:","{:.4f}".format(time.time()-t0))

    #np.save appends ".npy" to this name
    path="embed_train_data"
    train_data=np.array(train_data,dtype="object")
    np.save(path,train_data)
    print("data is saved")

    print("total spent time:","{:.4f}".format(time.time()-t0))

In [None]:
#Same pre-embedding step for the test split; disabled with `if False` because the result is already saved.
if False:
    t0=time.time()

    #embed the last 500000 rows of the sorted frame as the test split
    test_data=[]
    for i in range(train.shape[0]-500000,train.shape[0]):
        test_data.append(embed_fun(i,train_data=train))
        #progress report every 20000 rows
        if i>0 and i%20000==0:
            print("test data-%s rows has finished"%i)
            print("spent time:","{:.4f}".format(time.time()-t0))

    #np.save appends ".npy" to this name
    path="embed_test_data"
    test_data=np.array(test_data,dtype="object")
    np.save(path,test_data)
    print("data is saved")

    print("total spent time:","{:.4f}".format(time.time()-t0))

In [None]:
def create_date(j,embed_data,w=3):
    """Return row ``j`` (the anchor) together with its window of nearby rows.

    The window spans rows ``j-w`` .. ``j+w``, clipped to the array bounds.
    Columns 1..4 of ``embed_data`` are used as features (with the layout
    written by ``embed_fun``: name, country, categories, lat/long), column 0
    is the id and the last column is the PoI.

    The phone column is left out on purpose: the model tended to overfit it.
    Two points with the same phone number are almost surely the same place,
    but points with different numbers can still share a PoI.

    Returns ``(anchor, features, labels, ID)`` where ``labels`` holds ``[1]``
    for every window row sharing the anchor's PoI and ``[0]`` otherwise.
    """
    lo=max(0,j-w)
    hi=min(embed_data.shape[0],j+w+1)

    window=embed_data[lo:hi]
    features=window[:,1:5]
    #anchor is the reference the rows in "features" are compared with
    anchor=embed_data[j,1:5]

    ID=window[:,0]
    poi=window[:,-1]
    #the anchor sits at offset j-lo inside the window
    labels=np.array([[1] if p==poi[j-lo] else [0] for p in poi])

    return anchor, features, labels, ID

> # 2. Filter Data (Optional)

In [None]:
t0=time.time()
#If two points are distant, then they are less likely to have the same PoI.
#This method can identify the solitary points and thus reduce needed computing resources and time.
def neighbor(x,window):
    """List, for every row, which adjacent rows lie within ``window`` of it.

    Only the rows directly before and after a row are examined, so ``x`` is
    expected to be ordered such that nearby points are adjacent (the notebook
    sorts by longitude). An adjacent row counts as a neighbour when both its
    longitude and its latitude differ by at most ``window``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with ``longitude`` and ``latitude`` columns.
    window : float
        Largest accepted absolute difference, in the units of those columns.

    Returns
    -------
    list of list of int
        One list per row: empty when the row has no neighbour, otherwise the
        positions of its neighbours followed by the row's own position.
    """
    #plain arrays avoid a pandas .iloc lookup for every comparison
    lon=x["longitude"].to_numpy()
    lat=x["latitude"].to_numpy()
    last=x.shape[0]-1

    def close(a,b):
        return abs(lon[a]-lon[b])<=window and abs(lat[a]-lat[b])<=window

    indices=[]
    for i in range(x.shape[0]):
        lst=[]
        #i>=1 rather than i>1: row 1 has to be compared with row 0 as well
        if i>=1 and close(i,i-1):
            lst.append(i-1)
        if i<last and close(i,i+1):
            lst.append(i+1)
        if lst:lst.append(i)
        indices.append(lst)
    return indices

#When a point differs from its closest neighbors by more than 0.1 in longitude/latitude,
#it is treated as a solitary point (a point with a unique PoI).
#the tolerance is approximately 10~11 km in low latitude areas and 6~8 km in high latitude areas.
indices=neighbor(x=train,window=0.1)
#count[i] is the length of the neighbour list of row i; 0 marks a solitary point
count=[len(i) for i in indices]
indices=np.array(indices,dtype="object")
count=np.array(count)

print("spent time:","{:.4f}".format(time.time()-t0))

In [None]:
#summary of the neighbour search: number of rows, and how many of them have at least one neighbour
print("the number of total sample:",indices.shape[0])
print("the number of count:",count.shape[0])
print("the number of none-0 count:",count[count>0].shape[0])
#largest neighbour-list length found
print("max count:",count.max())

In [None]:
t0=time.time()
#however, due to noise, some points actually refer to the same PoI as other points even though they are far away.
#so we need to compute the false negative rate and define a desired distance.

def FN_lst(x,count):
    """Flag solitary rows whose PoI still occurs within two rows of them.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``point_of_interest`` column, in the same row order that
        ``count`` was computed for.
    count : array-like of int
        Number of neighbours found per row; rows with ``0`` are the solitary
        ones that get checked.

    Returns
    -------
    numpy.ndarray
        One entry per solitary row, in row order: ``1`` when one of the rows
        at offsets -2, -1, +1, +2 has the same PoI (a false negative of the
        distance filter), ``0`` otherwise.
    """
    #read the column of the frame that was passed in (the previous version used the global `train`)
    poi=x["point_of_interest"].to_numpy()
    total=poi.shape[0]

    false_negative_lst=[]
    for i in np.where(np.asarray(count)==0)[0]:
        #offsets that fall outside the frame are simply skipped
        nearby=[poi[k] for k in (i-1,i+1,i-2,i+2) if 0<=k<total]
        false_negative_lst.append(1 if poi[i] in nearby else 0)

    return np.array(false_negative_lst)

#one 0/1 flag per solitary row (count==0)
false_negative_lst=FN_lst(x=train,count=count)
#share of flagged rows among the solitary rows
#NOTE(review): the denominator is 0 when no row is solitary - confirm that cannot happen for the chosen window
false_negative=false_negative_lst.sum()/false_negative_lst.shape[0]

print("spent time:","{:.4f}".format(time.time()-t0))

In [None]:
#suspended, kept only as a note: the whole block is disabled with `if False`
if False:
    t0=time.time()

    #For every row that has neighbours, compute the share of rows in its
    #neighbour list (which includes the row itself) that have the same PoI.
    #NOTE(review): the parameters `x` and `count` are both reassigned inside the
    #loop and the PoI values are read from the global `train`, not from `x`.
    def IoU_lst(x,count,indices):
        IoU=[]
        for i in np.where(count>0)[0]:
            x=train.iloc[i]["point_of_interest"]
            length=len(indices[i])
            #from here on `count` is the per-row match counter, no longer the argument
            count=0
            for j in indices[i]:
                if x==train.iloc[j]["point_of_interest"]:count=count+1
            IoU.append(count/length)
        IoU=np.array(IoU)
        return IoU

    IoU=IoU_lst(x=train,count=count,indices=indices)

    print("spent time:","{:.4f}".format(time.time()-t0))

The number of false negative points is about 22k. That is roughly 2% of the training set, while the filter removes more than a quarter of the total dataset.
If we choose a larger distance threshold *(w)*, both the number of false negatives and the number of filtered data points go down; there is therefore a trade-off between the number of false negatives and the size of the remaining dataset.

In [None]:
#false negatives as a share of the solitary rows
print("false negative Rate:","{:.2%}".format(false_negative))
#false negatives as a share of all rows in `train`
print("false negative Rate (in whole train data set):","{:.2%}".format(np.where(false_negative_lst==1)[0].shape[0]/train.shape[0]))
#absolute number of false negatives
print("the number of false negitive:",np.where(false_negative_lst==1)[0].shape[0])
#the IoU figures below are disabled together with the IoU computation above
#print("average IoU:","{:.2%}".format(IoU.mean()))
#print("IoU not less than 50%:",np.where(IoU>=0.5)[0].shape[0])

> # 3. Siamese Network

Because PoIs are numerous and varied, it is not a good idea to encode them all as a label vector. However, we do not need to identify the PoI of each data point; all we need is to tell whether two or more points share the same one.

But sometimes the data structure can be complicated, as it is here: we have NLP-embedded words (name, country, categories) as well as numeric features. The question is, for any two data points, how "close" is close, and how should the different features be weighted?

Luckily, a Siamese network is the answer. It is a special DNN structure whose main idea is to tell how similar two data points are. When a data point (e.g., a photo or a sentence) passes through a network and features are generated, two data points with an identical label should produce features that are very "close". So we can build a network that connects two (either different or identical) networks and then computes the closeness of the data points.

In [None]:
#display a static image from the notebook inputs; the printed URL credits its source
from IPython.display import Image
print("source:https://www.youtube.com/watch?v=4S-XDefSjTM")
Image("../input/images/2022-05-17 8.06.36.png")

* ## Baseline model
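The first step is to create a network that can extract features from our raw dataset. Because spaCy has already done the word embedding, very little processing is left: in the version below the baseline only forwards the embedded text features and the coordinates, and the phone numbers are not fed to the model.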

In [None]:
def BaseLine():
    """Build the shared feature branch of the Siamese network.

    The branch is an identity mapping: it takes the three 300-dimensional
    text embeddings (name, country, categories) and the 2-dimensional
    latitude/longitude pair, and returns them unchanged.

    Returns
    -------
    tf.keras.Model
        Model with inputs ``[name, country, categories, input_lat_long]`` and
        four outputs in the same order.
    """
    #shape has to be a tuple: (300) is just the int 300, (300,) is a 1-tuple
    input_name=tf.keras.Input(shape=(300,),name="name")
    output_name=input_name

    input_country=tf.keras.Input(shape=(300,),name="country")
    output_country=input_country

    input_categories=tf.keras.Input(shape=(300,),name="categories")
    output_categories=input_categories

    input_lat_long=tf.keras.Input(shape=(2,),name="input_lat_long")
    output_lat_long=input_lat_long

    model=tf.keras.Model(inputs=[input_name,input_country,input_categories,input_lat_long],
                         outputs=[output_name,output_country,output_categories,output_lat_long])
    return model

#instantiate the shared branch and print its layer summary
baseline=BaseLine()
baseline.summary()

The baseline model is a small network: it simply passes the name, country, categories and lat_long features through unchanged.

In [None]:
#draw the layer graph of the baseline model
tf.keras.utils.plot_model(baseline)

* ## Siamese Network
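The final step of building our model is to create Lambda layers that connect the two baseline models. Each Lambda layer is simply a squared-error function: it computes the element-wise squared difference between the anchor's feature and the other sample's feature.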

In [None]:
def siamese_model(baseline):
    """Build the Siamese comparison network on top of ``baseline``.

    ``baseline`` is applied to the four anchor inputs and to the four
    candidate inputs. Each pair of features is reduced to its element-wise
    squared difference, scored by a small dense stack, and the four scores
    are merged into one sigmoid output.

    Parameters
    ----------
    baseline : tf.keras.Model
        Model taking ``[name, country, categories, lat_long]`` tensors and
        returning four tensors in the same order.

    Returns
    -------
    tf.keras.Model
        Model with eight inputs (the anchor features followed by the
        candidate features) and a single output layer named ``score``.
    """
    def difference_square(vec):
        x, y=vec
        output=(x-y)**2 #penalty function; one can also try others such as the absolute error.
        return output

    #shape has to be a tuple: (300) is just the int 300, (300,) is a 1-tuple
    anchor_name=tf.keras.Input(shape=(300,),name="anchor_name")
    anchor_country=tf.keras.Input(shape=(300,),name="anchor_country")
    anchor_categories=tf.keras.Input(shape=(300,),name="anchor_categories")
    anchor_lat_long=tf.keras.Input(shape=(2,),name="anchor_lat_long")

    input_name=tf.keras.Input(shape=(300,),name="name")
    input_country=tf.keras.Input(shape=(300,),name="country")
    input_categories=tf.keras.Input(shape=(300,),name="categories")
    input_lat_long=tf.keras.Input(shape=(2,),name="lat_long")

    #the same baseline instance processes both sides
    inputs1=baseline([anchor_name,anchor_country,anchor_categories,anchor_lat_long])
    inputs2=baseline([input_name,input_country,input_categories,input_lat_long])

    x0=tf.keras.layers.Lambda(difference_square,name="diff_name")([inputs1[0],inputs2[0]])
    x1=tf.keras.layers.Lambda(difference_square,name="diff_country")([inputs1[1],inputs2[1]])
    x2=tf.keras.layers.Lambda(difference_square,name="diff_categories")([inputs1[2],inputs2[2]])
    x3=tf.keras.layers.Lambda(difference_square,name="diff_lat_long")([inputs1[3],inputs2[3]])

    #linear combination works best
    #because the layers should respond accurately to the difference between two data points rather than "twist" it.
    x0=tf.keras.layers.Dense(150)(x0)
    x0=tf.keras.layers.Dense(1,name="score_name")(x0)

    x1=tf.keras.layers.Dense(150)(x1)
    x1=tf.keras.layers.Dense(1,activation="tanh",name="score_country")(x1)

    x2=tf.keras.layers.Dense(150)(x2)
    x2=tf.keras.layers.Dense(1,name="score_categories")(x2)

    #the hyperbolic tangent bounds this score so the model won't pay too much attention to it.
    x3=tf.keras.layers.Dense(1)(x3)
    x3=tf.keras.layers.Dense(1,activation="tanh",name="score_lat_long")(x3)

    x=tf.keras.layers.Concatenate()([x0,x1,x2,x3])
    x=tf.keras.layers.Dense(4)(x)
    #residual layer
    res=tf.keras.layers.Dense(4,activation="relu")(x)
    x=tf.keras.layers.Add()([x,res])
    x=tf.keras.layers.Dense(2)(x)
    #second residual layer
    res=tf.keras.layers.Dense(2,activation="relu")(x)
    x=tf.keras.layers.Add()([x,res])
    output=tf.keras.layers.Dense(1,activation="sigmoid",name="score")(x)

    model=tf.keras.Model(
        inputs=[anchor_name,anchor_country,anchor_categories,anchor_lat_long,
               input_name,input_country,input_categories,input_lat_long],
        outputs=output
    )
    return model

#build the full network on top of the shared baseline branch
model=siamese_model(baseline)
#uncomment to start from previously saved weights instead of a fresh model
#model.load_weights("../input/siamese-network-weights/siamese_model")
model.summary()

As shown below, the complete network has 8 inputs (4 from the anchor and 4 from the other sample) and 1 output. Each time, a sample is selected and passed through the Lambda layers together with the anchor, which yields the penalty of the two data points for each feature. The dense layers then combine these per-feature penalties into a single similarity score for the two samples.

In [None]:
#draw the layer graph of the complete Siamese model
tf.keras.utils.plot_model(model)

> # 4. Training and Testing Data

For two points that are distant, there is no need to compare them, so we only consider the anchor's neighborhood. In the following training, we select a data point as the anchor and its seven closest points (including itself) as a training group.

In [None]:
#binary classification setup: each (anchor, candidate) pair is labelled 1 (same PoI) or 0
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    #the metric names below are the keys read back from history.history and from the callback logs
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),
             tf.keras.metrics.Precision(name='precision'),
             tf.keras.metrics.Recall(name='recall')]
)

class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as an epoch reports an accuracy of exactly 1."""
    def on_epoch_end(self, epoch, logs=None):
        #logs=None instead of a shared mutable {} default, matching the Keras callback signature
        logs=logs or {}
        if(logs.get('accuracy')==1):
            self.model.stop_training=True
#early-stopping callback passed to every model.fit call below
callbacks = myCallback()

#per-group metric histories collected by the training loop (one list per fitted group)
accuracy=[]
precision=[]
recall=[]

In [None]:
#load the pre-computed training embeddings (object array, hence allow_pickle=True)
#NOTE(review): allow_pickle=True unpickles arbitrary objects - only load files from a trusted source
train_data=np.load("../input/foursquare-embedding-train-and-test-data/embed_train_data.npy",allow_pickle=True)

In [None]:
#draw 40000 anchor positions at random (random.choices samples with replacement)
lst=random.choices(range(train_data.shape[0]),k=40000) 
#only a sample is used to save time; the model had already been trained.

t0=time.time()
for l in range(len(lst)):
    #anchor row plus its window of up to 3 rows on each side
    anchor, features, labels, ID=create_date(lst[l],train_data,w=3)
    k=len(labels)
    #because the anchor itself is also in the neighbor group, there must be at least one "1" label.
    #the anchor features are repeated k times so that they pair up with the k window rows
    inputs=[
            tf.constant([anchor[0]]*k),tf.constant([anchor[1]]*k),tf.constant([anchor[2]]*k),tf.constant([anchor[3]]*k),
            tf.constant(list(features[:,0])),tf.constant(list(features[:,1])),tf.constant(list(features[:,2])),tf.constant(list(features[:,3]))
        ]                                                        
    #fit on this single group for up to 20 epochs; the callback stops earlier at accuracy 1
    history=model.fit(x=inputs,y=labels,epochs=20,verbose=0,callbacks=[callbacks])

    #keep the per-epoch metric lists of this group
    accuracy.append(history.history["accuracy"])
    precision.append(history.history["precision"])
    recall.append(history.history["recall"])
    
    #progress report every 2000 groups, checkpoint every 10000 groups
    if l>0 and l%2000==0:
        print("%sth group is finished"%l)
        print("spent time:","{:.4f}".format(time.time()-t0))
    if l>0 and l%10000==0:
        model.save_weights("siamese_model")

print("total spent time:","{:.4f}".format(time.time()-t0))
model.save_weights("siamese_model")
#del train_data

In [None]:
#keep only the last-epoch value of every training group
Acc=np.array([i[-1] for i in accuracy])
Pre=np.array([i[-1] for i in precision])
Rec=np.array([i[-1] for i in recall])

def moving_average(a, n=300):
    """Return the simple moving average of ``a`` over windows of ``n`` values.

    Parameters
    ----------
    a : array-like
        One-dimensional sequence of numbers.
    n : int, optional
        Window length, at least 1.

    Returns
    -------
    numpy.ndarray
        ``len(a) - n + 1`` averages (empty when ``a`` is shorter than ``n``).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n<1:
        raise ValueError("n must be a positive integer")
    #accumulate in float64: a float32 running sum loses precision on long inputs
    ret = np.cumsum(a, dtype=np.float64)
    #turn the running totals into sums over the last n values
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n-1:]/n

In [None]:
#mean of the last-epoch metrics over all training groups, shown as percentages
print("average accuracy:","{:.4f}%".format(Acc.mean()*100))
print("average precision:","{:.4f}%".format(Pre.mean()*100))
print("average recall:","{:.4f}%".format(Rec.mean()*100))

#`plt` is bound to the top-level matplotlib package in the import cell, hence plt.pyplot
#NOTE(review): this relies on matplotlib.pyplot having been imported by the environment - confirm
fig, axe=plt.pyplot.subplots(3,1,figsize=(18,15))

#one panel per metric, smoothed with a 300-group moving average; x ticks every 5000 groups
axe[0].set_title("moving average(300)-accuracy")
axe[0].set_xticks(np.arange(0,len(moving_average(Acc)),5000))
axe[0].plot(moving_average(Acc))

axe[1].set_title("moving average(300)-precision")
axe[1].set_xticks(np.arange(0,len(moving_average(Pre)),5000))
axe[1].plot(moving_average(Pre),"g")

axe[2].set_title("moving average(300)-recall")
axe[2].set_xticks(np.arange(0,len(moving_average(Rec)),5000))
axe[2].plot(moving_average(Rec),"m")
fig.show()

Test our model.

In [None]:
#per-group metrics collected while scoring the test split
test_accuracy=[]
test_precision=[]
test_recall=[]

#load the pre-computed test embeddings (object array, hence allow_pickle=True)
#NOTE(review): allow_pickle=True unpickles arbitrary objects - only load files from a trusted source
test_data=np.load("../input/foursquare-embedding-train-and-test-data/embed_test_data.npy",allow_pickle=True)

t0=time.time()
for l in range(test_data.shape[0]):
    #build the group from test_data; the previous version read train_data here,
    #so the reported "test" metrics were computed on training rows
    anchor, features, labels, ID=create_date(l,test_data,w=3)
    k=len(labels)
    #the anchor features are repeated k times so that they pair up with the k window rows
    inputs=[
            tf.constant([anchor[0]]*k),tf.constant([anchor[1]]*k),tf.constant([anchor[2]]*k),tf.constant([anchor[3]]*k),
            tf.constant(list(features[:,0])),tf.constant(list(features[:,1])),tf.constant(list(features[:,2])),tf.constant(list(features[:,3]))
        ]
    #forward pass only, no fitting
    lst=model(inputs).numpy()


    #fresh metric objects for every group, so each result covers this group alone
    BinaryAccuracy=tf.keras.metrics.BinaryAccuracy()
    Precision=tf.keras.metrics.Precision()
    Recall=tf.keras.metrics.Recall()

    BinaryAccuracy.update_state(labels, lst)
    test_accuracy.append(BinaryAccuracy.result().numpy())

    Precision.update_state(labels, lst)
    test_precision.append(Precision.result().numpy())

    Recall.update_state(labels, lst)
    test_recall.append(Recall.result().numpy())
    
    #progress report every 2000 groups
    if l>0 and l%2000==0:
        print("%sth group is finished"%l)
        print("spent time:","{:.4f}".format(time.time()-t0))

print("total spent time:","{:.4f}".format(time.time()-t0))

#release the test embeddings
del test_data

In [None]:
#turn the per-group metric lists into arrays
test_accuracy=np.array(test_accuracy)
test_precision=np.array(test_precision)
test_recall=np.array(test_recall)

#mean of each metric over all test groups (printed as fractions, not percentages)
print("average accuracy:",test_accuracy.mean())
print("average precision:",test_precision.mean())
print("average recall:",test_recall.mean())

#`plt` is bound to the top-level matplotlib package in the import cell, hence plt.pyplot
fig, axe=plt.pyplot.subplots(3,1,figsize=(18,15))

#one panel per metric, smoothed with a 300-group moving average; x ticks every 10000 groups
axe[0].set_title("moving average(300)-test accuracy")
axe[0].set_xticks(np.arange(0,len(moving_average(test_accuracy,n=300)),10000))
axe[0].plot(moving_average(test_accuracy))

axe[1].set_title("moving average(300)-test precision")
axe[1].set_xticks(np.arange(0,len(moving_average(test_precision,n=300)),10000))
axe[1].plot(moving_average(test_precision),"g")

axe[2].set_title("moving average(300)-test recall")
axe[2].set_xticks(np.arange(0,len(moving_average(test_recall,n=300)),10000))
axe[2].plot(moving_average(test_recall),"m")
fig.show()