Because the master dataset (final_trainset.csv) was most likely script-generated, we wanted to verify the matches to rule out "hallucinations." This code looks up the raw text for a random sample of Child-Parent pairs (currently 100). It places the original news titles side-by-side with the CBS titles, so we can manually verify whether the "matches" are legitimate semantic links or just coincidence.

In [1]:
import pandas as pd
import random
import os
from tqdm import tqdm

def load_child_title(path):
    """Return the title stored in the first row of the child CSV at *path*.

    The 'title' column is used when present; otherwise the 'titel' column
    is read, since some child files use that column name instead.

    Raises:
        OSError: the file cannot be opened or read.
        ValueError: pandas cannot parse the file (this includes empty files).
        IndexError: the file has a header but no data rows.
        KeyError: neither 'title' nor 'titel' is a column of the file.
    """
    child = pd.read_csv(path)
    column = 'title' if 'title' in child.columns else 'titel'
    return child.iloc[0][column]


def build_verification_rows(matches, parents, target_count, data_dir='data'):
    """Pair child and parent titles for up to *target_count* matches.

    Rows of *matches* are visited in their given order. A pair is silently
    passed over when its child file ``<data_dir>/c_<child_id>.csv`` does not
    exist or its parent id is not in ``parents.index``. A pair whose child
    file exists but cannot be used (unreadable, empty, no title column) is
    reported on stdout and counted as skipped instead of being dropped
    without a trace.

    Args:
        matches: DataFrame with 'child_id' and 'parent_id' columns and,
            optionally, a 'content_similarity' column.
        parents: DataFrame indexed by parent id with a 'title' column.
        target_count: maximum number of pairs to collect.
        data_dir: directory that holds the ``c_<child_id>.csv`` files.

    Returns:
        A ``(rows, skipped)`` tuple: ``rows`` is a list of dicts with the
        keys 'child_id', 'parent_id', 'child_title', 'parent_title' and
        'csv_similarity'; ``skipped`` is the number of pairs dropped
        because of a read or lookup error.
    """
    rows = []
    skipped = 0

    for _, match in matches.iterrows():
        if len(rows) >= target_count:
            break

        cid = int(match['child_id'])
        pid = int(match['parent_id'])
        child_path = os.path.join(data_dir, f"c_{cid}.csv")

        # Cheap checks first, so a child file is only parsed when the pair
        # can actually be completed.
        if pid not in parents.index or not os.path.exists(child_path):
            continue

        try:
            child_title = load_child_title(child_path)
            parent_title = parents.loc[pid, 'title']
        except (OSError, ValueError, IndexError, KeyError) as err:
            # Best effort: one bad file must not abort the whole sample,
            # but it should be visible to whoever reads the report.
            skipped += 1
            print(f"Skipping child {cid} / parent {pid}: "
                  f"{type(err).__name__}: {err}")
            continue

        rows.append({
            'child_id': cid,
            'parent_id': pid,
            'child_title': child_title,
            'parent_title': parent_title,
            'csv_similarity': match.get('content_similarity', 'N/A')
        })

        # print every 10 to show progress (for higher numbers)
        if len(rows) % 10 == 0:
            print(f"Found {len(rows)}/{target_count}.")

    return rows, skipped


# The guard is true when this cell runs in a notebook or as a script, so the
# names below are still defined at module level; it only keeps the data
# loading from running when the helpers above are imported.
if __name__ == "__main__":
    # Loading data
    df_map = pd.read_csv('final_trainset.csv')

    # Remember which rows really have both ids before the NaNs are replaced:
    # after fillna(0) a missing id cannot be told apart from a real id 0.
    has_ids = df_map[['child_id', 'parent_id']].notna().all(axis=1)

    df_map['child_id'] = df_map['child_id'].fillna(0).astype(int)
    df_map['parent_id'] = df_map['parent_id'].fillna(0).astype(int)

    df_matches = df_map[df_map['match'] == 1]
    df_parents = (
        pd.read_csv('data/all_parents.csv')
        .drop_duplicates(subset='id')
        .set_index('id')
    )

    # search scope variable:
    target_count = 100

    # shuffling the data to get random samples; rows whose ids were missing
    # are left out so that the placeholder id 0 is never verified.
    random_matches = df_matches[has_ids.loc[df_matches.index]].sample(frac=1)

    # Puts each sampled match's child and parent title side by side so the
    # resulting csv can be verified by hand.
    results, skipped_count = build_verification_rows(
        random_matches, df_parents, target_count
    )
    found_count = len(results)

    if results:
        verification_df = pd.DataFrame(results)
        verification_df.to_csv('match_verification_report.csv', index=False)
        if found_count < target_count:
            print(f"Only found {found_count}/{target_count} verifiable matches "
                  f"({skipped_count} skipped because of errors).")
    else:
        print("No files found, path probably wrong.")

Found 10/100.
Found 20/100.
Found 30/100.
Found 40/100.
Found 50/100.
Found 60/100.
Found 70/100.
Found 80/100.
Found 90/100.
Found 100/100.
