# Linking articles between editions

In [None]:
from neural_searcher import NeuralSearcher
from qdrant_client import QdrantClient
from tqdm.notebook import tqdm
import json
import random

# Names of the Qdrant collections that hold the two editions
e1, e2 = 'e1', 'e2'

# Threshold handed to the vector search when linking entries
MATCH_THRESHOLD = 0.0


In [None]:
# Connection settings for the Qdrant instance this notebook talks to
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333

# Open the client used by the cells below
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

In [None]:
# Number of points in the e1 collection; only used to size the progress bar
total_entries = client.count(e1).count

# Page through the whole e1 collection, keeping payloads and vectors
vectors = []
batch_size = 50  # This value has to be chosen carefully (Experience from testing)
offset = None  # None makes the first scroll call start at the beginning

with tqdm(total=total_entries, desc="Fetching entries") as pbar:
    while True:
        # scroll returns a (records, next_page_offset) pair
        records, offset = client.scroll(
            collection_name=e1,
            with_payload=True,
            with_vectors=True,
            limit=batch_size,
            offset=offset,
        )
        vectors.extend(records)
        pbar.update(len(records))  # Update progress bar
        # Per the Qdrant scroll API, the next-page offset is None once the
        # last page has been returned. Stopping on that (rather than on a
        # short page) matters when the collection size is an exact multiple
        # of batch_size: the final page is then full, and scrolling again
        # with offset=None would restart from the first point forever.
        # The empty-page check is a defensive guard against a stalled scroll.
        if offset is None or not records:
            break

In [None]:
# Searcher bound to the e2 collection
neural_searcher_e2 = NeuralSearcher(collection_name=e2)

# Link every fetched e1 entry to an e2 entry:
#   links_e1_to_e2 maps an e1 entry id to the matched e2 entry id
#   text_links collects (e1 text, e2 text) pairs, sampled later as labelled data
text_links = []
links_e1_to_e2 = {}
for record in tqdm(vectors, desc="Linking entries"):
    source_id, source_text = record.payload['entryid'], record.payload['text']
    # Search e2 with the vector already stored for this e1 entry
    candidates = neural_searcher_e2.vector_search(record.vector, threshold=MATCH_THRESHOLD)
    # TODO: run the candidates through a NN, perhaps
    if not candidates:
        continue
    # Only the first hit is used (presumably the best-scoring one -- confirm
    # against NeuralSearcher); limiting the search to one result would make
    # this a lot faster.
    top_hit = candidates[0]
    links_e1_to_e2[source_id] = top_hit['entryid']
    text_links.append((source_text, top_hit['text']))

# Reverse map from e2 entry ids to e1 entry ids. When several e1 entries
# link to the same e2 entry, the one processed last is kept.
links_e2_to_e1 = {}
for e1_entry_id, e2_entry_id in links_e1_to_e2.items():
    links_e2_to_e1[e2_entry_id] = e1_entry_id



### Make test data for deciding neural search threshold 

In [None]:
# Upper bound on the number of linked text pairs written out for labelling
data_limit = 200

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of links actually available.
samples = random.sample(text_links, min(data_limit, len(text_links)))

# Every pair starts out labelled as a valid match (1); the threshold test
# treats 0 as "not a match" (presumably set by hand after review -- confirm).
data = [
    {
        "e1_text": e1_sample_text,
        "e2_text": e2_sample_text,
        "valid_match": 1,
    }
    for e1_sample_text, e2_sample_text in samples
]

# ensure_ascii=False keeps non-ASCII characters readable in the file
with open('text_links.json', 'w', encoding='utf-8') as outfile:
    json.dump(data, outfile, ensure_ascii=False, indent=4)
    

### Test threshold against test data

In [None]:
# Load the labelled text pairs
with open('text_links.json', 'r', encoding='utf-8') as test_file:
    json_items = json.load(test_file)

# Threshold under evaluation and the counters reported at the end
test_threshold = 0.8
nr_matches = 0
nr_false_positives = 0
nr_false_negatives = 0
for item in tqdm(json_items, desc='Verifying matches'):
    e1_text = item['e1_text']
    e2_text = item['e2_text']
    # String search of the e1 text against e2 at the threshold under test
    # NOTE(review): any non-empty result counts as a match; the result is not
    # compared against e2_text -- confirm this is the intended check.
    match = neural_searcher_e2.string_search(e1_text, threshold=test_threshold)
    if match:
        nr_matches += 1
        if item['valid_match'] == 0:
            # Found a match for a pair labelled as not matching.
            # This used to reset the counter to 0 instead of incrementing it.
            nr_false_positives += 1
            print(f'Expected no match: \"{e1_text}\" : \"{e2_text}\"')
    elif item['valid_match'] == 1:
        # Found nothing for a pair labelled as matching
        nr_false_negatives += 1
        print(f'Expected match: \"{e1_text}\" : \"{e2_text}\"')

print(f"Made {nr_matches} of {len(json_items)} entries")
print(f"Number of false positives: {nr_false_positives}")
print(f"Number of false negatives: {nr_false_negatives}")

In [None]:
# Function to make linked json files
def write_linked_json(in_name: str, out_name: str, links_dict: dict[str, str]) -> None:
    """Copy a JSON list of entries, adding a link key to every entry.

    Args:
        in_name: Path of the JSON file to read; it must hold a list of
            objects that each have an 'entryid' key.
        out_name: Path of the JSON file to write.
        links_dict: Maps an entry id to the id of its linked entry.

    Each entry gets a 'second_edition_key' field holding the linked id,
    or an empty string when the entry id is not in ``links_dict``.
    """
    with open(in_name, 'r', encoding='utf-8') as source:
        entries = json.load(source)

    linked_entries = []
    for entry in entries:
        # Unlinked entries still get the key, with "" as the value
        entry['second_edition_key'] = links_dict.get(entry['entryid'], "")
        linked_entries.append(entry)

    # ensure_ascii=False writes non-ASCII characters as-is
    with open(out_name, 'w', encoding='utf-8') as target:
        json.dump(linked_entries, target, ensure_ascii=False, indent=4)

In [None]:
# Write the e1 entries, each annotated with the id of its linked e2 entry
write_linked_json(
    in_name='e1.json',
    out_name='e1_linked.json',
    links_dict=links_e1_to_e2,
)
print("Finished writing e1_linked.json")

# Write the e2 entries, each annotated with the id of its linked e1 entry
write_linked_json(
    in_name='e2.json',
    out_name='e2_linked.json',
    links_dict=links_e2_to_e1,
)
print("Finished writing e2_linked.json")

In [None]:

# Go through e1
#   for every article, search for its closest matches in e2

# For each article in e2, compare its text against the e1 collection in Qdrant
# and get the closest matches.
# For the closest match, compute features (cosine similarity, headword comparison,
# edit distance, and others) and feed them to a threshold function.
# If it is a match, set the other-edition key in both e1 and e2 so they point at each other.