In [2]:
import json

In [10]:
# Read the per-entity score records from disk; each record carries a
# 'vector_index' field that later cells collect.
with open("score_by_entity.json", encoding="utf-8") as file:
    data = json.loads(file.read())

# Show which fields the first record exposes.
data[0].keys()

dict_keys(['id_interno', 'entidad', 'pais', 'score', 'vector_index', 'total_preguntas', 'data_hash_origen'])

In [11]:
# Map each entity name to its vector index. Records sharing an 'entidad'
# value collapse to a single entry (the last one wins).
ent_indices = dict(
    (record["entidad"], record["vector_index"]) for record in data
)
print(len(ent_indices))

# Vector indices to look up in the embeddings archive.
target_keys = [*ent_indices.values()]

6711


In [12]:
import numpy as np

# --- 1. Define File Paths and Target Keys ---
# The input archive must contain the arrays 'representations_normalized' and
# 'row_indices'. Adjust INPUT_NPZ_PATH if the file lives somewhere else.
INPUT_NPZ_PATH = 'state_b8d56cb4aa70_hs.npz'
OUTPUT_NPZ_PATH = 'filtered_vectors.npz'


# --- 2. Load the Original Data ---
try:
    # Bind the archive to its own name: reusing `data` here would overwrite the
    # JSON records loaded in an earlier cell and break re-running the cells
    # that read them.
    with np.load(INPUT_NPZ_PATH) as npz_archive:
        # Indexing the archive reads each array fully into memory, so both
        # remain usable after the file is closed.
        norm_reps = npz_archive['representations_normalized']
        row_indices = npz_archive['row_indices']

    # --- 3. Find the positions (indices) to filter ---

    # Convert the target_keys list to a NumPy array for efficient comparison
    target_keys_np = np.array(target_keys)

    # Boolean mask: True where a row_indices value is present in target_keys_np
    filter_mask = np.isin(row_indices, target_keys_np)

    # Count how many vectors we are keeping
    num_kept = np.sum(filter_mask)
    print(f"Number of vectors to be kept (matching keys): {num_kept}")

    # Surface requested keys that the archive does not contain; without this
    # they would be dropped from the output silently.
    num_missing = int(np.count_nonzero(~np.isin(target_keys_np, row_indices)))
    if num_missing:
        print(f"Warning: {num_missing} target key(s) were not found in 'row_indices'.")

    # --- 4. Filter the Arrays using the Mask ---

    # assumes row_indices is 1-D with one entry per row of norm_reps, so the
    # mask lines up with the first axis of both arrays — TODO confirm
    filtered_norm_reps = norm_reps[filter_mask]
    filtered_row_indices = row_indices[filter_mask]

    # --- 5. Save the Filtered Data to a New .npz File ---
    np.savez(
        OUTPUT_NPZ_PATH,
        representations_normalized=filtered_norm_reps,
        row_indices=filtered_row_indices
    )

    print("-" * 40)
    print(f"✅ Success! Filtered data saved to: '{OUTPUT_NPZ_PATH}'")

except FileNotFoundError:
    print(f"Error: The file '{INPUT_NPZ_PATH}' was not found.")
except Exception as e:
    # Report the failure, then re-raise so the traceback is kept and a
    # "Run All" does not continue past a half-finished cell.
    print(f"An unexpected error occurred: {e}")
    raise

Number of vectors to be kept (matching keys): 6711
----------------------------------------
✅ Success! Filtered data saved to: 'filtered_vectors.npz'
