In [1]:
!pip install -U sentence-transformers




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import ast
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Input locations (relative to the notebook's working directory)
file_path_facts = "fact_checks.csv"
file_path_posts = "posts.csv"

# Read both tables and replace missing cells right away: a missing claim or
# OCR result becomes the string "[]" (an empty list literal that is parsed
# later), and a missing post text becomes the empty string.
data_facts = pd.read_csv(file_path_facts).assign(
    claim=lambda frame: frame['claim'].fillna("[]")
)
data_posts = pd.read_csv(file_path_posts, sep=",").assign(
    ocr=lambda frame: frame['ocr'].fillna("[]"),
    text=lambda frame: frame['text'].fillna(""),
)
# Safely apply ast.literal_eval to convert string lists to actual lists
def safe_literal_eval(value):
    """Parse a stringified Python literal, falling back to an empty list.

    Args:
        value: Usually a string such as ``"[('a', 'b')]"``. A value that is
            already a ``list`` or ``tuple`` is returned unchanged, so applying
            this function again to an already-converted column (for example
            when the cell is re-run) does not wipe its contents.

    Returns:
        The evaluated literal, the input itself when it is already a
        list/tuple, or ``[]`` when the value cannot be parsed.
    """
    if isinstance(value, (list, tuple)):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # ast.literal_eval can raise any of these on malformed input
        # (e.g. TypeError for an unhashable dict key such as "{[1]: 2}").
        return []

# Convert the stringified literals in both columns into Python objects
data_posts['ocr'] = data_posts['ocr'].apply(safe_literal_eval)
data_facts['claim'] = data_facts['claim'].apply(safe_literal_eval)

# Extract OCR-based posts as (post_id, ocr_text): one pair per OCR item, with
# the item's first element used as the text. Empty items are skipped instead
# of raising IndexError on item[0].
posts_list = [
    (post_id, item[0])
    for post_id, ocr in zip(data_posts['post_id'], data_posts['ocr'])
    for item in ocr
    if item
]
# zip(*...) on an empty list cannot be unpacked into two names, hence the fallback
post_ids_ocr, post_texts_ocr = zip(*posts_list) if posts_list else ([], [])

# Extract fact-check data as (fact_check_id, claim_text), using the first
# element of each non-empty claim. Zipping the two columns avoids building a
# Series per row the way iterrows() does.
facts_list = [
    (fact_check_id, claim[0])
    for fact_check_id, claim in zip(data_facts['fact_check_id'], data_facts['claim'])
    if claim
]
fact_check_ids, fact_texts = zip(*facts_list) if facts_list else ([], [])

# Create mapping from fact_check_id to title
fact_id_to_title = pd.Series(data_facts.title.values, index=data_facts.fact_check_id).to_dict()

# Sentence-embedding model used for both post texts and fact-check claims
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Number of best-matching fact checks kept for every post text
TOP_K = 10


def _top_k_rows(post_ids, texts, similarity_matrix, fact_ids, titles, top_k=TOP_K):
    """Build one output record per (post text, top-k fact check) pair.

    Args:
        post_ids: Post id for each row of ``similarity_matrix``.
        texts: The text that was embedded for each row (stored as ``ocr_text``).
        similarity_matrix: 2-D array whose entry ``[i, j]`` is the similarity
            between text ``i`` and fact check ``j``.
        fact_ids: Fact-check id for each column of ``similarity_matrix``.
        titles: Mapping from fact-check id to title; unknown ids give "".
        top_k: How many fact checks to keep per text (must be >= 1).

    Returns:
        A list of dicts with the keys post_id, ocr_text, fact_check_id, title
        and similarity, ordered per text from most to least similar.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        # A slice of [-0:] would silently select every column.
        raise ValueError("top_k must be at least 1")

    records = []
    for post_id, text, similarities in zip(post_ids, texts, similarity_matrix):
        # argsort is ascending: take the last top_k indices, then reverse them
        for idx in similarities.argsort()[-top_k:][::-1]:
            fact_id = fact_ids[idx]
            records.append({
                'post_id': post_id,
                'ocr_text': text,
                'fact_check_id': fact_id,
                'title': titles.get(fact_id, ""),
                'similarity': similarities[idx]
            })
    return records


# Initialize list to store CSV rows
rows = []

# Process posts only if there are fact checks to compare against
if fact_texts:
    # Initialize sentence transformer model
    model = SentenceTransformer(MODEL_NAME)

    # Encode all fact claims once; both passes below compare against them
    fact_embeddings = model.encode(fact_texts)

    # Pass 1: every OCR text is matched on its own
    if post_texts_ocr:
        post_embeddings_ocr = model.encode(post_texts_ocr)
        ocr_similarities = cosine_similarity(post_embeddings_ocr, fact_embeddings)
        rows.extend(_top_k_rows(
            post_ids_ocr, post_texts_ocr, ocr_similarities,
            fact_check_ids, fact_id_to_title,
        ))

    # Pass 2: posts that produced no OCR text fall back to the 'text' column
    processed_post_ids = set(post_ids_ocr)
    all_post_ids = set(data_posts['post_id'])
    unprocessed_post_ids = list(all_post_ids - processed_post_ids)

    if unprocessed_post_ids:
        unprocessed_data = data_posts[data_posts['post_id'].isin(unprocessed_post_ids)]
        # NOTE(review): 'text' is embedded exactly as read from the CSV, whereas
        # 'ocr' and 'claim' are parsed first and only their first element is
        # used. Confirm 'text' is plain text and not a stringified literal too.
        post_texts = unprocessed_data['text'].tolist()

        post_embeddings_text = model.encode(post_texts)
        text_similarities = cosine_similarity(post_embeddings_text, fact_embeddings)
        rows.extend(_top_k_rows(
            unprocessed_data['post_id'], post_texts, text_similarities,
            fact_check_ids, fact_id_to_title,
        ))

# Persist the matches: ascending post_id, and within each post the most
# similar fact check first. Nothing is written when no rows were collected.
if not rows:
    print("No matches found. CSV file not created.")
else:
    df = pd.DataFrame(rows).sort_values(
        by=['post_id', 'similarity'],
        ascending=[True, False],
    )
    df.to_csv("post_fact_matches.csv", index=False)
    print("CSV file generated: post_fact_matches.csv")

  from .autonotebook import tqdm as notebook_tqdm


CSV file generated: post_fact_matches.csv


In [3]:
import json
import csv

# File paths
json_file = "monolingual_predictions.json"
csv_file = "post_fact_matches.csv"

# Load the JSON data (a mapping keyed by post id)
with open(json_file, "r", encoding="utf-8") as jf:
    monolingual_data = json.load(jf)

# Load the CSV data. newline="" is what the csv module asks for when opening
# files it parses, so newlines inside quoted fields are handled correctly.
with open(csv_file, "r", encoding="utf-8", newline="") as cf:
    csv_data = list(csv.DictReader(cf))

# Group fact_check_ids by post_id in a single pass over the CSV instead of
# rescanning every row for every post. Each inner dict acts as an
# order-preserving set: a post with several OCR texts has several row groups
# in the CSV, and the same fact_check_id may appear in more than one of them;
# only its first occurrence (CSV order) is kept.
# A distinct name is used so the fact_check_ids sequence built by the matching
# cell is not overwritten in the kernel namespace.
matches_by_post = {}
for row in csv_data:
    post_id = row["post_id"]
    # csv values and JSON keys are both strings, so they compare directly
    if post_id in monolingual_data:
        matches_by_post.setdefault(post_id, {})[int(row["fact_check_id"])] = None

# Populate the JSON data with fact_check_id lists (integers); posts without
# any CSV row get an empty list.
for post_id in monolingual_data:
    monolingual_data[post_id] = list(matches_by_post.get(post_id, ()))

# Write the updated JSON data back to the same file on a single line
with open(json_file, "w", encoding="utf-8") as output_file:
    json.dump(monolingual_data, output_file, ensure_ascii=False, separators=(", ", ": "))

print(f"Updated JSON written to {json_file}")


Updated JSON written to monolingual_predictions.json


In [None]:
import zipfile

# Archive to create and the files that go into it
output_zipfile = "predictions.zip"
files_to_zip = ["monolingual_predictions.json"]

# Write a deflate-compressed archive; each member is stored under the same
# path it was read from.
with zipfile.ZipFile(output_zipfile, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for member in files_to_zip:
        archive.write(member, arcname=member)

print(f"Created {output_zipfile} containing: {', '.join(files_to_zip)}")


Created predictions.zip containing: monolingual_predictions.json
