# Linking articles between editions

In [107]:
from neural_searcher import NeuralSearcher
from qdrant_client import QdrantClient
from tqdm import tqdm
import json

# Names of the two edition collections. They are passed as `collection_name`
# when scrolling through e1 and when building the searcher for e2.
e1 = 'e1'
e2 = 'e2'


In [None]:
# Start Qdrant client
# Targets a Qdrant server on localhost, port 6333; the scroll calls further
# down go through this client.
client = QdrantClient(host="localhost", port=6333)

In [91]:
# Fetching entries
# Page through the whole e1 collection with scroll(), collecting every record
# (payload and vector included) into `vectors`.
vectors = []
batch_size = 50 # This value has to be chosen carefully (Experience from testing)
offset = None  # None asks scroll() to start from the first point
while True:
    response = client.scroll(
        collection_name=e1,
        with_payload=True,
        with_vectors=True,
        limit=batch_size,
        offset=offset,
    )
    # scroll() returns (records, next_page_offset).
    records, offset = response
    vectors.extend(records)
    # Stop on the "no next page" signal rather than on a short page: when the
    # collection size is an exact multiple of batch_size the last page is full
    # and its offset is None, so a length check would never fire and the next
    # call (offset=None) would start over from the beginning, duplicating
    # entries forever.
    if offset is None:
        print("All entries fetched.")
        break

All entries fetched.


In [105]:
# Searcher bound to the e2 collection; every e1 vector is queried against it.
neural_searcher_e2 = NeuralSearcher(collection_name=e2)

# Create links from entry ids of e1 to e2
links_e1_to_e2 = {}
for record in tqdm(vectors):
    source_id = record.payload['entryid']
    hits = neural_searcher_e2.search(record.vector)  # Can search with threshold instead
    # Run hits through a NN perhaps
    if not hits:
        # No candidate in e2: this e1 entry stays unlinked.
        continue
    # Only the first hit is used (presumably the closest one -- confirm in
    # NeuralSearcher). Changing the search limit to 1 would make it a lot faster.
    first_hit = hits[0]
    links_e1_to_e2[source_id] = first_hit['entryid']

# Create links from entry ids of e2 to e1
# Plain inversion of the mapping above: when several e1 entries point at the
# same e2 entry, the one inserted last wins.
links_e2_to_e1 = dict(zip(links_e1_to_e2.values(), links_e1_to_e2.keys()))

100%|██████████| 81369/81369 [20:36<00:00, 65.79it/s]


In [123]:
# Function to make linked json files
def write_linked_json(in_name: str, out_name: str, links_dict: dict[str, str]) -> None:
    """Copy a JSON list of entries to a new file, adding a link field to each.

    Args:
        in_name: Path of the JSON file to read; it must contain a list of
            objects that each have an 'entryid' key.
        out_name: Path of the JSON file to write.
        links_dict: Mapping from an entry's 'entryid' to the id it is linked
            to. Entries without a link get an empty string.

    Raises:
        KeyError: If an entry has no 'entryid' key.
    """
    with open(in_name, 'r', encoding='utf-8') as source:
        entries = json.load(source)

    # Every entry gets the field, so consumers never have to check for it.
    linked_entries = [
        {**entry, 'second_edition_key': links_dict.get(entry['entryid'], "")}
        for entry in entries
    ]

    with open(out_name, 'w', encoding='utf-8') as target:
        # ensure_ascii=False keeps non-ASCII characters readable in the output.
        json.dump(linked_entries, target, ensure_ascii=False, indent=4)

In [125]:
# Write the linked copy of each edition, e1 first, reporting each file once it
# is on disk. Both outputs store the link under the same field name, so in
# e2_linked.json that field holds an e1 entry id.
write_linked_json(
    in_name='e1.json',
    out_name='e1_linked.json',
    links_dict=links_e1_to_e2,
)
print("Finished writing e1_linked.json")
write_linked_json(
    in_name='e2.json',
    out_name='e2_linked.json',
    links_dict=links_e2_to_e1,
)
print("Finished writing e2_linked.json")

Finished writing e1_linked.json
Finished writing e2_linked.json


In [None]:

# Go through e1
#   for every article, search for c

# For each article in e2, compare its text against the e1 collection in Qdrant
# and get the closest matches.
# For the closest match, calculate the cosine similarity, compare headwords,
# compute the edit distance and other features, then apply a threshold function.
# If it is a match, update the other-edition key in both e1 and e2 so the two
# entries point at each other.