In [9]:
import gzip
import requests
from tqdm.notebook import tqdm

# URL of the OpenSubtitles English file
url = "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/en.txt.gz"
output_txt_file = "OpenSubtitles_en.txt"
compressed_file = "en.txt.gz"

# Download the file if not already present
import os
if not os.path.exists(compressed_file):
    print("Downloading OpenSubtitles English file...")
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    with open(compressed_file, 'wb') as f, tqdm(
        desc="Downloading",
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in response.iter_content(chunk_size=1024):
            f.write(data)
            bar.update(len(data))
    print("Download complete!")

In [10]:
import gzip
from tqdm.notebook import tqdm

compressed_file = "en.txt.gz"
output_txt_file = "OpenSubtitles_en_1M.txt"
max_sentences = 1_000_000

unique_sentences = set()
count = 0

print("Processing file and collecting 1M unique sentences...")

with gzip.open(compressed_file, 'rt', encoding='utf-8', errors='ignore') as fin, \
     open(output_txt_file, 'w', encoding='utf-8') as fout:

    for line in tqdm(fin, desc="Reading lines"):
        line = line.strip()
        # just keep short sentences
        if len(line) > 28:
            continue
        if line and line not in unique_sentences:
            unique_sentences.add(line)
            fout.write(line + "\n")
            count += 1

            # Stop once we reach 20M unique sentences
            if count >= max_sentences:
                break

print(f"Finished! Collected {count} unique sentences into {output_txt_file}")


Processing file and collecting 1M unique sentences...


Reading lines: 0it [00:00, ?it/s]

Finished! Collected 1000000 unique sentences into OpenSubtitles_en_1M.txt


In [11]:
# Sort sentences
output_txt_file = "OpenSubtitles_en_1M_sorted.txt"
sorted_sentences = sorted(unique_sentences)
# Save to file
with open(output_txt_file, 'w', encoding='utf-8') as fout:
    for sentence in tqdm(sorted_sentences, desc="Writing sorted sentences"):
        fout.write(sentence + "\n")


Writing sorted sentences:   0%|          | 0/1000000 [00:00<?, ?it/s]

In [12]:
import numpy as np
import torch
import threading
import time
from queue import Queue
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# Parameters
input_txt_file = "OpenSubtitles_en_1M_sorted.txt"
output_memmap_file = "OpenSubtitles_en_1M_emb_64.dat"
max_sentences = 1_000_000
embedding_dim = 64
num_threads = 128
model_batch_size = 1024
thread_batch_size = model_batch_size * 16

# Step 1: Create memmap
embeddings_memmap = np.memmap(output_memmap_file, dtype='float32', mode='w+', shape=(max_sentences, embedding_dim))

# Step 2: Load model once (shared)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
model = SentenceTransformer('minishlab/potion-base-2M', device=device)

# Step 4: Thread worker
def worker(thread_id, queue):
    while True:
        item = queue.get()
        if item is None:
            break
        batch_sentences, batch_indices = item

        # Compute embeddings
        embeddings_64 = model.encode(batch_sentences, convert_to_numpy=True, batch_size = model_batch_size, show_progress_bar=False)

        # Save to memmap
        embeddings_memmap[batch_indices, :] = embeddings_64
        embeddings_memmap.flush()
        queue.task_done()

# Step 5: Create queue and threads
queue = Queue(maxsize=num_threads*2)
threads = []
for i in range(num_threads):
    t = threading.Thread(target=worker, args=(i, queue))
    t.start()
    threads.append(t)

# Step 6: Read sentences and enqueue in batches
sentences_batch = []
batch_indices = []

with open(input_txt_file, 'r', encoding='utf-8') as fin:
    for idx, line in enumerate(tqdm(fin, total=max_sentences, desc="Reading sentences")):
        if idx >= max_sentences:
            break
        sentence = line.strip()
        if sentence:
            sentences_batch.append(sentence)
            batch_indices.append(idx)

        if len(sentences_batch) >= thread_batch_size:
            queue.put((sentences_batch, batch_indices))
            sentences_batch = []
            batch_indices = []

# Enqueue remaining sentences
if sentences_batch:
    queue.put((sentences_batch, batch_indices))

# Step 7: Stop threads
for _ in threads:
    queue.put(None)
# queue.join()
time.sleep(60)
for t in threads:
    t.join()

# Step 8: Flush memmap to disk
embeddings_memmap.flush()
print(f"Finished! Saved {max_sentences} embeddings of dim {embedding_dim} to {output_memmap_file}")


Using device: cpu


Reading sentences:   0%|          | 0/1000000 [00:00<?, ?it/s]

Finished! Saved 1000000 embeddings of dim 64 to OpenSubtitles_en_1M_emb_64.dat


In [13]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!cp "/content/OpenSubtitles_en_20M_sorted.txt" "/content/drive/MyDrive/OpenSubtitles_en_20M_sorted.txt"
!cp "/content/OpenSubtitles_en_20M_emb_64.dat" "/content/drive/MyDrive/OpenSubtitles_en_20M_emb_64.dat"