## Review each transcription

In [122]:
import os
import json
from tqdm.auto import tqdm

base_path = '../output'
candidate_folders = ['bullrich', 'massa', 'milei', 'schiaretti', 'bregman']

# store all transcribed phrases for each candidate
transcriptions = {candidate: [] for candidate in candidate_folders}

# let's write a function that takes the 'segments' array and the candidate's name as inputs and returns the desired long text.

def generate_long_text(segments, candidate_name):
    long_text = []
    for segment in segments:
        speaker = "Host" if not segment['is_candidate'] else candidate_name
        text = f"{speaker}: {segment['text'].strip()}"
        long_text.append(text)
    return long_text

for candidate in candidate_folders:
    folder_path = os.path.join(base_path, candidate)

    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith('.json'):
            filepath = os.path.join(folder_path, filename)

            with open(filepath, 'r') as f:
                content = json.load(f)
                
                # Generate full text if segments are available
                segments = content.get('segments', []) if isinstance(content, dict) else []
                full_text = generate_long_text(segments, candidate)
                content['full_text'] = ' '.join(full_text)

                # Add full filepath
                content['filepath'] = filepath

                if isinstance(content, dict):
                    transcriptions[candidate].append(content)
                elif isinstance(content, list):
                    # Note: This assumes that the list contains dict items
                    for entry in content:
                        segments = entry.get('segments', [])
                        full_text = generate_long_text(segments, candidate)
                        entry['full_text'] = ' '.join(full_text)
                        entry['filepath'] = filepath
                    transcriptions[candidate].extend(content)

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/263 [00:00<?, ?it/s]

  0%|          | 0/1026 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/112 [00:00<?, ?it/s]

In [123]:
for each in candidate_folders:
    print(f"candidate: {each}, len: {len(transcriptions[each])}")

candidate: bullrich, len: 219
candidate: massa, len: 263
candidate: milei, len: 1025
candidate: schiaretti, len: 90
candidate: bregman, len: 111


In [66]:
candidate_name_test = "milei"
print(transcriptions[candidate_name_test][100]['full_text'][:1000])

Host: Buenas tardes a todos, buenas tardes a Vedra. Host: ¿Cómo están? Host: Gracias a todos por venir. Host: Esta es una de las excepciones de esta campaña, que es tener la oportunidad de escuchar a un candidato dando clases, enseñándonos sobre lo que es su materia de conocimiento. Host: Les quiero agradecer a todos que ustedes vengan acá a escucharnos, Javier, que estén presentes, que estén apoyando a la Libertad Avanza. Host: Este es un camino que comenzó mucho antes del 12 de septiembre con las militancias que llevamos cada uno en nuestros temas y que se va a coronar el 14 de noviembre con la ayuda de todos ustedes. Host: Por esa razón quiero pedirles que nos ayuden a fiscalizar, a cuidar los votos, Host: a que no nos roben, a que se escuche la voluntad ciudadana, a que se escuche nuestra voz y nuestras ideas. Host: No quiero que seamos la Argentina silenciada, no quiero que seamos la Argentina invisible, están nosotros que nos escuchen, que sepan lo que pensamos los argentinos, qu

## Now build a FAISS index, do a similarity search, and remove duplicates

In [67]:
# !conda install -c conda-forge faiss-gpu --y
# !pip install -U sentence-transformers

First, let's write a function to vectorize the text segments. We'll use sentence transformers for this example, but you can plug in whatever method you're comfortable with.

In [68]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def vectorize_segments(segments):
    texts = [segment['text'].strip() for segment in segments]
    return model.encode(texts)

Now, let's install and import FAISS, and create a function to build an index.

In [69]:
import faiss

# def build_faiss_index(vectors):
#     dimension = vectors.shape[1]
#     index = faiss.IndexFlatL2(dimension)
#     index.add(vectors)
#     return index

def build_faiss_index(vectors):
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    
    # Move to GPU
    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
    
    gpu_index.add(vectors)
    return gpu_index

Next, a function to find and remove duplicates:

In [98]:
import numpy as np

# def remove_duplicates(candidate_segments):
#     # Step 1: Vectorize all segments
#     vectors = vectorize_segments(candidate_segments)

#     # Step 2: Build FAISS index
#     index = build_faiss_index(np.array(vectors))

#     # Step 3: Query to find duplicates
#     unique_segments = []
#     threshold_distance = 0.1  # You can tune this

#     for i, vec in enumerate(vectors):
#         vec = np.expand_dims(vec, axis=0)
#         distances, indices = index.search(vec, 2)  # 2 because the query vector itself will always be returned
#         if distances[0][1] > threshold_distance:
#             unique_segments.append(candidate_segments[i])

#     return unique_segments

def vectorize_full_texts(full_texts):
    return model.encode(full_texts)

def remove_duplicates(candidate_entries):
    # Extract full_texts and vectorize them
    full_texts = [entry.get('full_text', '') for entry in candidate_entries]
    print(f"Debug: Number of full_texts: {len(full_texts)}")  # Debug

    vectors = vectorize_full_texts(full_texts)
    print(f"Debug: Number of vectors: {len(vectors)}")  # Debug
    vectors = np.array(vectors)

    # Build FAISS index
    index = build_faiss_index(vectors)

    # Query to find duplicates
    threshold_distance = 0.5  # Tune this
    D, I = index.search(vectors, 2)

    unique_entries = []
    added_indices = set()

    for i, (distances, indices) in enumerate(zip(D, I)):
        print(f"Debug: distances={distances}, indices={indices}")  # Debug

        if i not in added_indices:
            if distances[1] > threshold_distance:
                unique_entries.append(candidate_entries[i])
                added_indices.add(i)
            else:
                if indices[1] not in added_indices:
                    added_indices.add(indices[1])

    print(f"Debug: Number of unique_entries: {len(unique_entries)}")  # Debug
    return unique_entries

Finally, let's modify your main loop to remove duplicates:

In [100]:
for each in candidate_folders:
    print(f"candidate: {each}, len: {len(transcriptions_unique[each])}")

candidate: bullrich, len: 0
candidate: massa, len: 0
candidate: milei, len: 0
candidate: schiaretti, len: 0
candidate: bregman, len: 0


In [125]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from tqdm.auto import tqdm
import json
import os

# Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to vectorize full texts
def vectorize_full_texts(full_texts):
    return model.encode(full_texts)

# Function to build a FAISS index
def build_faiss_index(vectors):
    if len(vectors) == 0:
        print("Warning: Empty vectors array.")
        return None
    try:
        dimension = vectors.shape[1]
    except IndexError as e:
        print(f"Error: Unexpected vectors shape {vectors.shape}. Expected a 2D array.")
        return None
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index

# Function to remove duplicates
def remove_duplicates(transcriptions):
    unique_transcriptions = {}
    
    for candidate, entries in transcriptions.items():
        print(f"Processing {candidate}")
        
        # Debug: Check if entries are populated
        if len(entries) == 0:
            print(f"Warning: No entries for {candidate}.")
            continue

        # Vectorize full texts
        full_texts = [entry['full_text'] for entry in entries]

        # Debug: Print first few full_texts
        # print(f"Debug: First few full_texts for {candidate}: {full_texts[:3]}")
        
        vectors = vectorize_full_texts(full_texts)
        
        # Debug: Print first few vectors
        # print(f"Debug: First few vectors for {candidate}: {vectors[:3]}")
        
        # Build FAISS index
        index = build_faiss_index(np.array(vectors))

        # Skip if index is None (usually means empty vectors array)
        if index is None:
            print(f"Skipping {candidate} due to empty index.")
            continue
        
        # Search for duplicates
        D, I = index.search(vectors, 2)
        
        unique_entries = []
        added_indices = set()
        
        for i, (distances, indices) in enumerate(zip(D, I)):
            if i not in added_indices:
                unique_entries.append(entries[i])
                added_indices.add(i)
                added_indices.add(indices[1])
                
        unique_transcriptions[candidate] = unique_entries
    
    return unique_transcriptions

# Now remove duplicates
transcriptions_unique = remove_duplicates(transcriptions)

Processing bullrich
Processing massa
Processing milei
Processing schiaretti
Processing bregman


In [127]:
for each in candidate_folders:
    print(f"candidate: {each}, len: {len(transcriptions_unique[each])}")

candidate: bullrich, len: 159
candidate: massa, len: 177
candidate: milei, len: 752
candidate: schiaretti, len: 61
candidate: bregman, len: 79


### Original
- candidate: bullrich, len: 219
- candidate: massa, len: 254
- candidate: milei, len: 1022
- candidate: schiaretti, len: 87
- candidate: bregman, len: 111

In [130]:
import shutil
from pathlib import Path

def copy_unique_files(transcriptions_unique, output_folder="../output/unique"):
    """
    Copy unique transcription files to a new folder.

    Parameters:
    - transcriptions_unique: Dictionary containing unique transcriptions.
    - output_folder: The root folder where unique files will be stored.
    """
    for candidate, entries in transcriptions_unique.items():
        candidate_unique_folder = Path(output_folder) / candidate

        # Create candidate's unique folder if it doesn't exist
        candidate_unique_folder.mkdir(parents=True, exist_ok=True)

        for entry in entries:
            src_path = entry['filepath']
            file_name = Path(src_path).name
            dest_path = candidate_unique_folder / file_name

            # Copy the file
            shutil.copy(src_path, dest_path)

# Usage
# Assuming transcriptions_unique is the dictionary containing unique transcriptions for each candidate
copy_unique_files(transcriptions_unique)