In [4]:
import json
import pandas as pd
from sentence_transformers.util import cos_sim  
from sentence_transformers import SentenceTransformer as SBert
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from numpy import sqrt
import numpy as np
from tqdm import trange
import os
# modelName = 'princeton-nlp/sup-simcse-bert-base-uncased'
# Sentence-embedding model shared by the cells below.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Names of the two sources; each one is stored as "<name>_addr.json".
dataName1 = "Ctrip"
dataName2 = "Mafengwo"

# Read both sources with the same statement, then unpack them in order.
loaded = []
for source in (dataName1, dataName2):
    with open("../train_data/%s_addr.json" % source, "r", encoding="utf-8") as f:
        loaded.append(json.load(f))
data1, data2 = loaded
print(len(data1), len(data2))

# Accumulators for the labelled pairs.
positive = []
negative = []
# Keys (id1 + id2) of pairs that have already been extracted.
matched = set()
# ! Attention !
# `matched` is shared by the whole notebook and grows as the cells run; it is what
# keeps the same entity pair from being extracted twice.
# A single cell therefore must not be re-run in isolation: to repeat any step,
# start again from this first cell and run the cells one by one, in order,
# so that `matched` is rebuilt correctly.

def combineEnt(d1, d2, label):
    """Merge two entity records into a single labelled pair record.

    Args:
        d1: First entity, a dict containing an "id" key.
        d2: Second entity; must contain "id" and every other key of ``d1``.
        label: Value stored under "label" (callers pass True or False).

    Returns:
        A new dict whose "id" is "<d1 id>-<d2 id>", whose "label" is ``label``,
        and in which every other key of ``d1`` maps to ``[d1[key], d2[key]]``.

    Raises:
        KeyError: If "id" is missing, or ``d2`` lacks a key present in ``d1``.
    """
    rnt = {"id": "%s-%s" % (d1["id"], d2["id"]), "label": label}
    for attr in d1:
        # Skip "id" by name rather than by position, so the result does not
        # depend on "id" being the first key of the record.
        if attr == "id":
            continue
        rnt[attr] = [d1[attr], d2[attr]]
    return rnt

1080 259


In [3]:
# Embed every distinct attribute string once, in advance, and cache the vectors
# on disk so that the later cells only need dictionary lookups.
if os.path.exists("temp/pre_dict.json"):
    print("Preprocessing dictionary file already exists!")
else:
    # Create the cache directory before the slow encoding loop: opening a file
    # for writing does not create missing parent directories, and failing only
    # at the final dump would throw away all the computed embeddings.
    os.makedirs("temp", exist_ok=True)
    eDict = {}
    cols = ["Name", "Address", "District", "RoadInfo", "Poi", "RoomInfo"]
    data = data1 + data2
    for i in trange(len(data)):
        d = data[i]
        for col in cols:
            target = d[col]
            # Missing values (None or empty string) get no embedding.
            if target is None or target == "":
                continue
            # Each distinct string is encoded only once.
            if target not in eDict:
                eDict[target] = model.encode(target).tolist()
    with open("./temp/pre_dict.json", "w", encoding="utf-8") as f:
        json.dump(eDict, f, ensure_ascii=False, indent=2)

100%|██████████| 1339/1339 [00:33<00:00, 40.18it/s]


In [None]:
# Name - Generate similar top 200
# Scores every (data1, data2) pair by the cosine similarity of the name embeddings
# and writes the 200 most similar pairs to ./temp/name.csv for manual labelling.

if os.path.exists("temp/name.csv"):
    print("Top 200 file (Name) already exist!")
else:
    with open("./temp/pre_dict.json", "r", encoding="utf-8") as f:
        eDict = json.load(f)
    lst_name = []
    for i in trange(len(data1)):
        d1 = data1[i]
        name1 = d1["Name"]
        # The embedding cache only holds non-empty strings, so a missing name
        # (None as well as "") has no vector and must be skipped, not looked up.
        if name1 is None or name1 == "":
            continue
        vec1 = eDict[name1]  # looked up once per d1 instead of once per pair
        for d2 in data2:
            name2 = d2["Name"]
            if name2 is None or name2 == "":
                continue
            # scipy's cosine() is a distance; 1 - distance is the similarity.
            cossim = 1 - cosine(vec1, eDict[name2])
            lst_name.append([d1["id"]+"-"+d2["id"], name1, name2, cossim])
    # Most similar pairs first, keep the best 200.
    lst_name = sorted(lst_name, key=lambda x: x[3], reverse=True)
    lst_name = lst_name[:200]
    df_name = pd.DataFrame(lst_name, columns=["id", "Name1", "Name2", "Cos_sim_name"])
    df_name.to_csv("./temp/name.csv", encoding="utf-8")
    # Next, in the CSV file generated above, create a new label column with 1/0 representing match/mismatch

In [5]:
# Name - label and extract the result
# Reads the hand-labelled ./temp/name.csv and splits its pairs into namePos / nameNeg.

with open("./temp/pre_dict.json", "r", encoding="utf-8") as f:
    eDict = json.load(f)
df_name = pd.read_csv("./temp/name.csv", encoding="utf-8")
namePos, nameNeg = [], []
if "label" not in df_name.columns:
    print("Please label the CSV file generated above first!")
else:
    for i in range(len(df_name)):
        # Pair ids have the form "<id1>-<id2>". The parts are kept in `pair_ids`
        # rather than `id`, so the builtin id() is not shadowed for later cells.
        pair_ids = df_name.loc[i, "id"].split("-")
        # The digits after the first character of each id are used as a list position.
        id1, id2 = int(pair_ids[0][1:]), int(pair_ids[1][1:])
        d1, d2 = data1[id1], data2[id2]
        # Sanity check: the looked-up records must carry the names stored in the CSV row.
        if d1["Name"] != df_name.loc[i, "Name1"] or d2["Name"] != df_name.loc[i, "Name2"]:
            print("There was an error in the process of responding to the dataset! Please check")
        # Remember the pair so that later candidate lists skip it.
        matched.add(d1["id"]+d2["id"])
        # Any label other than 1 is treated as a mismatch.
        if df_name.loc[i, "label"] == 1:
            namePos.append(combineEnt(d1, d2, True))
        else:
            nameNeg.append(combineEnt(d1, d2, False))

In [5]:
# Distance - Generate the closest top 200
# Ranks the not-yet-matched (data1, data2) pairs by geodesic distance and writes
# the 200 closest pairs to ./temp/distance.csv for manual labelling.

if os.path.exists("temp/distance.csv"):
    print("Top 200 file (Distance) already exist!")
else:
    from geopy.distance import geodesic
    with open("./temp/pre_dict.json", "r", encoding="utf-8") as f:
        eDict = json.load(f)
    lst_distance = []
    for i in trange(len(data1)):
        d1 = data1[i]
        name1, loc1 = d1["Name"], d1["Location"]
        for d2 in data2:
            name2, loc2 = d2["Name"], d2["Location"]
            # Both records need coordinates, and the pair must not have been extracted already.
            has_coords = loc1[0] is not None and loc2[0] is not None
            if not has_coords or d1["id"] + d2["id"] in matched:
                continue
            # The two stored components are swapped before being passed to geodesic.
            # NOTE(review): presumably Location is [longitude, latitude] - confirm against the data.
            point1 = (loc1[1], loc1[0])
            point2 = (loc2[1], loc2[0])
            distance = geodesic(point1, point2).meters
            # Identical coordinates may be incorrect values
            if distance == 0:
                continue
            lst_distance.append([d1["id"]+"-"+d2["id"], name1, name2, distance])
    # Closest pairs first, keep the best 200.
    lst_distance.sort(key=lambda row: row[3])
    lst_distance = lst_distance[:200]
    df_distance = pd.DataFrame(lst_distance, columns=["id", "Name1", "Name2", "Distance"])
    df_distance.to_csv("./temp/distance.csv", encoding="utf-8")
    # Next, in the CSV file generated above, create a new label column with 1/0 representing match/mismatch

距离Top200文件已经存在！


In [6]:
# Distance - label and extract the result
# Reads the hand-labelled ./temp/distance.csv and splits its pairs into distancePos / distanceNeg.

with open("./temp/pre_dict.json", "r", encoding="utf-8") as f:
    eDict = json.load(f)
df_distance = pd.read_csv("./temp/distance.csv", encoding="utf-8")
distancePos, distanceNeg = [], []
if "label" not in df_distance.columns:
    print("Please label the CSV file generated above first!")
else:
    for i in range(len(df_distance)):
        # Pair ids have the form "<id1>-<id2>". The parts are kept in `pair_ids`
        # rather than `id`, so the builtin id() is not shadowed for later cells.
        pair_ids = df_distance.loc[i, "id"].split("-")
        # The digits after the first character of each id are used as a list position.
        id1, id2 = int(pair_ids[0][1:]), int(pair_ids[1][1:])
        d1, d2 = data1[id1], data2[id2]
        # Sanity check: the looked-up records must carry the names stored in the CSV row.
        if d1["Name"] != df_distance.loc[i, "Name1"] or d2["Name"] != df_distance.loc[i, "Name2"]:
            print("There was an error in the process of responding to the dataset! Please check")
        # Remember the pair so that later candidate lists skip it.
        matched.add(d1["id"]+d2["id"])
        # Any label other than 1 is treated as a mismatch.
        if df_distance.loc[i, "label"] == 1:
            distancePos.append(combineEnt(d1, d2, True))
        else:
            distanceNeg.append(combineEnt(d1, d2, False))

In [8]:
# Road - Generate similar top 100
# Scores the not-yet-matched (data1, data2) pairs by the cosine similarity of their
# RoadInfo embeddings and writes the 100 most similar pairs to ./temp/road.csv.

if os.path.exists("temp/road.csv"):
    print("Top 200 file (Road) already exist!")
else:
    with open("./temp/pre_dict.json", "r", encoding="utf-8") as f:
        eDict = json.load(f)
    lst_road = []
    for i in trange(len(data1)):
        d1 = data1[i]
        name1 = d1["Name"]
        road1 = d1["RoadInfo"]
        # The embedding cache only holds non-empty strings, so a missing value
        # (None as well as "") has no vector and must be skipped, not looked up.
        if road1 is None or road1 == "":
            continue
        vec1 = eDict[road1]  # looked up once per d1 instead of once per pair
        for d2 in data2:
            name2 = d2["Name"]
            road2 = d2["RoadInfo"]
            if road2 is None or road2 == "":
                continue
            # Pairs already extracted by an earlier step are not offered again.
            if d1["id"] + d2["id"] in matched:
                continue
            # scipy's cosine() is a distance; 1 - distance is the similarity.
            cossim = 1 - cosine(vec1, eDict[road2])
            lst_road.append([d1["id"]+"-"+d2["id"], name1, name2, road1, road2, cossim])
    # Most similar pairs first, keep the best 100.
    lst_road = sorted(lst_road, key=lambda x: x[5], reverse=True)
    lst_road = lst_road[:100]
    df_road = pd.DataFrame(lst_road, columns=["id", "Name1", "Name2", "Road1", "Road2", "Cos_sim_road"])
    df_road.to_csv("./temp/road.csv", encoding="utf-8")
    # Next, in the CSV file generated above, create a new label column with 1/0 representing match/mismatch

100%|██████████| 1080/1080 [00:04<00:00, 253.19it/s]


In [7]:
# Road - label and extract the result
# Reads the hand-labelled ./temp/road.csv and splits its pairs into roadPos / roadNeg.

with open("./temp/pre_dict.json", "r", encoding="utf-8") as f:
    eDict = json.load(f)
df_road = pd.read_csv("./temp/road.csv", encoding="utf-8")
roadPos, roadNeg = [], []
if "label" not in df_road.columns:
    print("Please label the CSV file generated above first!")
else:
    for i in range(len(df_road)):
        # Pair ids have the form "<id1>-<id2>". The parts are kept in `pair_ids`
        # rather than `id`, so the builtin id() is not shadowed for later cells.
        pair_ids = df_road.loc[i, "id"].split("-")
        # The digits after the first character of each id are used as a list position.
        id1, id2 = int(pair_ids[0][1:]), int(pair_ids[1][1:])
        d1, d2 = data1[id1], data2[id2]
        # Sanity check: the looked-up records must carry the names stored in the CSV row.
        if d1["Name"] != df_road.loc[i, "Name1"] or d2["Name"] != df_road.loc[i, "Name2"]:
            print("There was an error in the process of responding to the dataset! Please check")
        # Remember the pair so that later candidate lists skip it.
        matched.add(d1["id"]+d2["id"])
        # Any label other than 1 is treated as a mismatch.
        if df_road.loc[i, "label"] == 1:
            roadPos.append(combineEnt(d1, d2, True))
        else:
            roadNeg.append(combineEnt(d1, d2, False))

In [11]:
# Room - Generate similar top 20
# Scores the not-yet-matched (data1, data2) pairs by the cosine similarity of their
# RoomInfo embeddings and writes the 20 most similar pairs to ./temp/room.csv.

if os.path.exists("temp/room.csv"):
    print("Top 200 file (Room) already exist!")
else:
    with open("./temp/pre_dict.json", "r", encoding="utf-8") as f:
        eDict = json.load(f)
    lst_room = []
    for i in trange(len(data1)):
        d1 = data1[i]
        name1 = d1["Name"]
        room1 = d1["RoomInfo"]
        # The embedding cache only holds non-empty strings, so a missing value
        # (None as well as "") has no vector and must be skipped, not looked up.
        if room1 is None or room1 == "":
            continue
        vec1 = eDict[room1]  # looked up once per d1 instead of once per pair
        for d2 in data2:
            name2 = d2["Name"]
            room2 = d2["RoomInfo"]
            if room2 is None or room2 == "":
                continue
            # Pairs already extracted by an earlier step are not offered again.
            if d1["id"] + d2["id"] in matched:
                continue
            # scipy's cosine() is a distance; 1 - distance is the similarity.
            cossim = 1 - cosine(vec1, eDict[room2])
            lst_room.append([d1["id"]+"-"+d2["id"], name1, name2, room1, room2, cossim])
    # Most similar pairs first, keep the best 20.
    lst_room = sorted(lst_room, key=lambda x: x[5], reverse=True)
    lst_room = lst_room[:20]
    df_room = pd.DataFrame(lst_room, columns=["id", "Name1", "Name2", "Room1", "Room2", "Cos_sim_room"])
    df_room.to_csv("./temp/room.csv", encoding="utf-8")
    # Next, in the CSV file generated above, create a new label column with 1/0 representing match/mismatch

100%|██████████| 1080/1080 [00:00<00:00, 9290.03it/s]


In [8]:
# Room - label and extract the result
# Reads the hand-labelled ./temp/room.csv and splits its pairs into roomPos / roomNeg.

with open("./temp/pre_dict.json", "r", encoding="utf-8") as f:
    eDict = json.load(f)
df_room = pd.read_csv("./temp/room.csv", encoding="utf-8")
roomPos, roomNeg = [], []
if "label" not in df_room.columns:
    # The original line ended with a stray extra quote, which made the whole cell a SyntaxError.
    print("Please label the CSV file generated above first!")
else:
    for i in range(len(df_room)):
        # Pair ids have the form "<id1>-<id2>". The parts are kept in `pair_ids`
        # rather than `id`, so the builtin id() is not shadowed for later cells.
        pair_ids = df_room.loc[i, "id"].split("-")
        # The digits after the first character of each id are used as a list position.
        id1, id2 = int(pair_ids[0][1:]), int(pair_ids[1][1:])
        d1, d2 = data1[id1], data2[id2]
        # Sanity check: the looked-up records must carry the names stored in the CSV row.
        if d1["Name"] != df_room.loc[i, "Name1"] or d2["Name"] != df_room.loc[i, "Name2"]:
            print("There was an error in the process of responding to the dataset! Please check")
        # Record the pair in the shared set of already-extracted pairs.
        matched.add(d1["id"]+d2["id"])
        # Any label other than 1 is treated as a mismatch.
        if df_room.loc[i, "label"] == 1:
            roomPos.append(combineEnt(d1, d2, True))
        else:
            roomNeg.append(combineEnt(d1, d2, False))

In [10]:
# Collect positive and negative data
# Merge the per-criterion lists, shuffle them reproducibly and write them to ../train_data.

positive = [*namePos, *distancePos, *roadPos, *roomPos]
negative = [*nameNeg, *distanceNeg, *roadNeg, *roomNeg]
print(len(positive), len(negative))

import random
# Fixed seed so the shuffled order is the same on every run.
random.seed(1023)
# Order matters for reproducibility: positives are shuffled first, then negatives.
for samples in (positive, negative):
    random.shuffle(samples)

outputs = (
    ("../train_data/positive.json", positive),
    ("../train_data/negative.json", negative),
)
for out_path, samples in outputs:
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)
# print(len(namePos))
# print(len(distancePos))
# print(len(roadPos))
# print(len(roomPos))

160 560
