# Linking articles between editions

In [None]:
import os
os.chdir('../../')
print(os.getcwd())

from utils.neural_searcher import NeuralSearcher
from utils.paths import *
from utils import json_helpers as jh
from qdrant_client import QdrantClient
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score
import random

# Edition identifiers. Further down they are passed to Qdrant as collection names.
e1, e2 = 'e1', 'e2'

# Names handed to the json helpers when reading/writing each edition's entries.
e1_json = f'{ENCYCLOPEDIAS_JSON_FOLDER}/{e1}'
e2_json = f'{ENCYCLOPEDIAS_JSON_FOLDER}/{e2}'

# Threshold passed to the e2 vector search when linking entries
# (presumably a minimum similarity score - confirm in NeuralSearcher).
MATCH_THRESHOLD = 0.92


In [None]:
# Connection settings for the locally running Qdrant instance
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333

# Raw client, used below to count and scroll through the e1 collection
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

# Searcher bound to the e2 collection; e1 entries are matched against it
neural_searcher_e2 = NeuralSearcher(collection_name=e2)

In [None]:
# Number of points in the e1 collection; only used to size the progress bar
total_entries = client.count(e1).count

# Fetch every e1 record (payload and vector) by paging through the collection
vectors = []
batch_size = 50  # This value has to be chosen carefully (Experience from testing)
offset = None  # None makes the first scroll call start at the beginning

with tqdm(total=total_entries, desc="Fetching entries") as pbar:
    while True:
        # scroll returns (records of this page, offset of the next page)
        records, offset = client.scroll(
            collection_name=e1,
            with_payload=True,
            with_vectors=True,
            limit=batch_size,
            offset=offset,
        )
        vectors.extend(records)
        pbar.update(len(records))  # Update progress bar
        # Qdrant reports the end of the collection with a next-page offset of
        # None. Checking the page size instead is wrong when the collection
        # size is an exact multiple of batch_size: the last page is full, the
        # offset is None, and the next call would restart from the first
        # point and loop forever while appending duplicates.
        if offset is None:
            break

In [None]:

# Link each e1 entry to its best e2 match (if the search returns one for
# MATCH_THRESHOLD) and collect the matched text pairs, which are later
# sampled to build the threshold test data.
text_links = []
links_e1_to_e2 = {}
for record in tqdm(vectors, desc="Linking entries"):
    payload = record.payload
    e1_id = payload['entryid']
    e1_text = payload['text']
    # search_limit=1: only the single closest e2 entry is requested
    hits = neural_searcher_e2.vector_search(record.vector, threshold=MATCH_THRESHOLD, search_limit=1)
    # TODO: possibly run the candidate matches through a neural network as well
    if not hits:
        continue
    best = hits[0]
    links_e1_to_e2[e1_id] = best['entryid']
    text_links.append((e1_text, best['text']))

# Reverse mapping, e2 entry id -> e1 entry id.
# NOTE: if several e1 entries link to the same e2 entry, only the one
# inserted last into links_e1_to_e2 survives here.
links_e2_to_e1 = {e2_id: e1_id for e1_id, e2_id in links_e1_to_e2.items()}



### Make test data for deciding neural search threshold 

In [None]:
# Upper bound on the number of linked text pairs written to the test file
data_limit = 200

# random.sample raises ValueError when asked for more items than the
# population holds, so never request more pairs than were actually linked.
sample_size = min(data_limit, len(text_links))
samples = random.sample(text_links, sample_size)

# Every pair is written with valid_match = 1.
# NOTE(review): presumably the file is then corrected by hand, since the
# evaluation cell also handles valid_match == 0 - confirm. Re-running this
# cell overwrites 'text_links'.
data = [
    {
        "e1_text": e1_text,
        "e2_text": e2_text,
        "valid_match": 1,
    }
    for e1_text, e2_text in samples
]

jh.write_items(data, 'text_links')
    

### Test threshold against test data

In [None]:
# Labelled pairs: each item has 'e1_text', 'e2_text' and 'valid_match' (0 or 1)
json_items = jh.read_items('text_links')

test_threshold = 0.92 # 0.92 feels best, 0.89 best for Macro-f1: 0.7926
y = []       # expected labels taken from the file
y_pred = []  # 1 if the string search returned something at test_threshold, else 0
nbr_matches = 0
for item in tqdm(json_items, desc='Verifying matches'):
    e1_text = item['e1_text']
    e2_text = item['e2_text']
    expected = item['valid_match']
    y.append(expected)
    # Search the e2 collection with the raw e1 text (string search)
    match = neural_searcher_e2.string_search(e1_text, threshold=test_threshold)
    if match:
        nbr_matches += 1
        y_pred.append(1)
        if expected == 0:
            print(f'Expected no match: \"{e1_text}\" : \"{e2_text}\"')
    else:
        y_pred.append(0)
        if expected == 1:
            print(f'Expected match: \"{e1_text}\" : \"{e2_text}\"')

print(f"Made {nbr_matches} of {len(json_items)} entries")
print()
# labels=[0, 1] pins both classes. Without it, classification_report raises
# ValueError whenever only one class occurs in y and y_pred (two target_names
# for one class), and confusion_matrix would shrink to 1x1.
print(classification_report(y, y_pred, labels=[0, 1], target_names=['Not Match', 'Match']))
print('Micro F1:', f1_score(y, y_pred, average='micro'))
print('Macro F1', f1_score(y, y_pred, average='macro'))
print('Accuracy', accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred, labels=[0, 1]))

In [None]:
def write_linked_json(in_name: str, out_name: str, links_dict: dict[str, str], other_edition_key: str) -> None:
    """Add cross-edition links to a set of entries and write them out.

    Reads the items stored under ``in_name``, sets ``other_edition_key`` on
    each item to the id that ``links_dict`` holds for the item's 'entryid'
    (an empty string when there is no link), and writes all items to
    ``out_name``.
    """
    records = jh.read_items(in_name)

    for record in records:
        linked_id = links_dict.get(record['entryid'], "")
        record[other_edition_key] = linked_id

    jh.write_items(records, out_name)

In [None]:
# Add the e2 link to every e1 entry; the e1 file is rewritten in place
write_linked_json(e1_json, e1_json, links_e1_to_e2, 'e2_key')
print(f"Finished writing {e1_json}.json")
# Add the e1 link to every e2 entry. The output must be the e2 file:
# writing to e1_json here would overwrite the e1 entries written just above
# with e2 data and leave the e2 file without links.
write_linked_json(e2_json, e2_json, links_e2_to_e1, 'e1_key')
print(f"Finished writing {e2_json}.json")

In [None]:

# Go through e1
#   for every article, search for c

# for each article in e2, compare text to qdrant e1, get closest matches
# For the closest match, calculate cosine similarity, compare headword, edit-distance, 
# other features, threshold function.
# If match, change in e1 and e2, other edition key to match