<a href="https://colab.research.google.com/github/shannn1/goodRAG/blob/main/create_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## embeddings

In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import torch
import os
from torch.cuda.amp import autocast
from huggingface_hub import login
from datasets import DatasetDict

# Natural Questions corpus; keep only the fields that go into the knowledge base.
_KB_COLUMNS = ["id", "title", "document"]
dataset = load_dataset("lighteval/natural_questions_clean")
train_data, test_data = (
    dataset[split].select_columns(_KB_COLUMNS) for split in ("train", "validation")
)

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The context encoder and its tokenizer are loaded from the same checkpoint.
_DPR_CTX_CHECKPOINT = "facebook/dpr-ctx_encoder-single-nq-base"
ctx_encoder = DPRContextEncoder.from_pretrained(_DPR_CTX_CHECKPOINT)
ctx_encoder = ctx_encoder.to(device)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(_DPR_CTX_CHECKPOINT)

def compute_embeddings(documents, ctx_encoder, ctx_tokenizer):
    """Encode a batch of (title, document) pairs with the context encoder.

    Args:
        documents: Batch mapping with "title" and "document" entries holding
            equally long sequences of text (the form ``Dataset.map`` hands to
            its function when called with ``batched=True``).
        ctx_encoder: Encoder module; its ``pooler_output`` is used as the
            embedding of each pair.
        ctx_tokenizer: Tokenizer matching ``ctx_encoder``.

    Returns:
        A NumPy array with one embedding row per input pair.
    """
    # Cheap pre-trim before tokenization.
    # NOTE(review): this slices characters, whereas model_max_length is a
    # token limit, so text that would still fit in the token window may be
    # dropped. Kept as-is to leave the produced embeddings unchanged; confirm
    # whether that is intended.
    max_length = ctx_tokenizer.model_max_length
    titles = [title[:max_length] for title in documents["title"]]
    bodies = [body[:max_length] for body in documents["document"]]

    inputs = ctx_tokenizer(
        titles,
        bodies,
        truncation=True,
        max_length=512,
        padding="longest",
        return_tensors="pt",
    )

    # Send the inputs to wherever the encoder that was passed in actually
    # lives, rather than to a module-level global that may not match it.
    target_device = next(ctx_encoder.parameters()).device
    on_cuda = target_device.type == "cuda"
    input_ids = inputs["input_ids"].to(target_device)

    # Mixed precision is only enabled on CUDA; with enabled=False the context
    # manager is a no-op, so CPU runs stay in full precision without warnings.
    with torch.no_grad(), torch.autocast(device_type="cuda", enabled=on_cuda):
        embeddings = ctx_encoder(input_ids, return_dict=True).pooler_output

    result = embeddings.detach().cpu().numpy()

    # Drop device tensors before the next batch to keep peak GPU memory low.
    del input_ids, inputs, embeddings
    if on_cuda:
        torch.cuda.empty_cache()
    return result


def process_and_save(data, output_dir, batch_size, split_name):
    """Embed ``data`` shard by shard and save each shard under ``output_dir``.

    The data is cut into consecutive shards of ``batch_size`` rows. Each shard
    gets an "embeddings" column from ``compute_embeddings`` and is written to
    ``<output_dir>/<split_name>_batch_<shard_index>.dataset``.

    On a CUDA out-of-memory error the number of rows encoded per forward pass
    is halved and the shard is retried. Shard boundaries never change, so
    every row is processed exactly once regardless of how many retries occur.

    Args:
        data: Dataset supporting ``len()``, ``select()`` and ``map()``.
        output_dir: Directory that receives the shard folders (created if
            missing).
        batch_size: Rows per shard and initial rows per encoder call.
        split_name: Prefix used in shard file names and progress messages.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        RuntimeError: On non-OOM runtime errors, or when an OOM still occurs
            while encoding one row at a time.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    os.makedirs(output_dir, exist_ok=True)

    # Two separate sizes: the shard size fixes the row ranges and file names,
    # while the encode size may shrink after an OOM. Reusing one variable for
    # both made later shards overlap earlier rows and skip the tail of `data`.
    shard_size = batch_size
    encode_batch_size = batch_size
    total_rows = len(data)
    num_shards = (total_rows + shard_size - 1) // shard_size

    for shard_index in range(num_shards):
        start = shard_index * shard_size
        end = min(start + shard_size, total_rows)
        batch = data.select(range(start, end))
        shard_path = os.path.join(output_dir, f"{split_name}_batch_{shard_index}.dataset")

        while True:
            try:
                print(f"Processing shard {shard_index + 1}/{num_shards} for {split_name}.")
                # memory_allocated reports memory currently in use, not what is left.
                print(f"Allocated GPU memory: {torch.cuda.memory_allocated(device) / 1e6:.2f} MB")

                batch_with_embeddings = batch.map(
                    lambda b: {"embeddings": compute_embeddings(b, ctx_encoder, ctx_tokenizer).tolist()},
                    batched=True,
                    batch_size=encode_batch_size,
                )
                batch_with_embeddings.save_to_disk(shard_path)
                del batch_with_embeddings
                break
            except RuntimeError as e:
                if "CUDA out of memory" not in str(e):
                    raise
                if encode_batch_size == 1:
                    # Nothing left to shrink; retrying would loop forever.
                    raise
                encode_batch_size = max(encode_batch_size // 2, 1)
                # Release cached blocks left behind by the failed attempt.
                torch.cuda.empty_cache()
                print(f"Reduced batch size to {encode_batch_size} due to OOM.")
            except Exception as e:
                print(f"Unexpected error while processing batch: {e}")
                raise


def upload_batches_to_hub(batches, dataset_name, token):
    """Publish ``batches`` as the "train" split of ``dataset_name`` on the Hub.

    If the dataset already exists on the Hub, its current "train" split is
    kept and the new batches are appended after it; otherwise a new dataset is
    created. All rows are merged locally and pushed in a single upload, so the
    Hub ends up with every batch rather than only the last one pushed.

    Args:
        batches: Iterable of datasets to upload, in the order they should
            appear.
        dataset_name: Hub repository id to push to.
        token: Hugging Face access token passed to ``login``.

    Raises:
        Exception: Any error raised while merging or pushing is reported and
            re-raised unchanged.
    """
    login(token=token)

    try:
        # load_dataset is the Hub loader; DatasetDict has no load_from_hub.
        existing_dataset = load_dataset(dataset_name)
        print(f"Loaded existing dataset '{dataset_name}' from Hugging Face Hub.")
    except FileNotFoundError:
        existing_dataset = None
        print(f"Dataset '{dataset_name}' not found on Hugging Face Hub. Creating a new one.")

    new_parts = list(batches)
    if not new_parts:
        print(f"No batches to upload to '{dataset_name}'.")
        return

    if existing_dataset is None:
        parts = new_parts
    else:
        parts = [existing_dataset["train"], *new_parts]

    try:
        # One merge and one push: re-pushing the growing dataset once per
        # batch re-uploads earlier rows every time.
        combined_dataset = DatasetDict({"train": concatenate_datasets(parts)})
        combined_dataset.push_to_hub(dataset_name)
    except Exception as e:
        print(f"Error while uploading {len(new_parts)} batch(es) to Hugging Face Hub: {e}")
        raise

    if existing_dataset is None:
        print(f"Uploaded {len(new_parts)} batch(es) to Hugging Face Hub as '{dataset_name}'.")
    else:
        print(f"Appended {len(new_parts)} batch(es) to '{dataset_name}' on Hugging Face Hub.")

train_batch_size = 256
test_batch_size = 256

process_and_save(train_data, output_dir="./train_batches", batch_size=train_batch_size, split_name="train")
process_and_save(test_data, output_dir="./test_batches", batch_size=test_batch_size, split_name="test")


def _load_shards(directory):
    """Load every ``*.dataset`` shard in ``directory`` in shard-index order.

    ``os.listdir`` returns names in arbitrary order, and a plain string sort
    would place ``..._batch_10`` before ``..._batch_2``. Sorting on the
    numeric suffix keeps rows in the order they were processed.
    """

    def shard_order(filename):
        stem = os.path.splitext(filename)[0]
        suffix = stem.rsplit("_", 1)[-1]
        # Names without a numeric suffix sort first, ordered by name.
        return (int(suffix) if suffix.isdecimal() else -1, filename)

    shard_names = sorted(
        (name for name in os.listdir(directory) if name.endswith(".dataset")),
        key=shard_order,
    )
    return [Dataset.load_from_disk(os.path.join(directory, name)) for name in shard_names]


train_batches = _load_shards("./train_batches")
test_batches = _load_shards("./test_batches")
final_dataset = concatenate_datasets(train_batches + test_batches)

final_dataset.save_to_disk("./final_knowledge_base")
print("Final knowledge base saved to './final_knowledge_base'.")

# Prefer a token from the HF_TOKEN environment variable so that a real
# credential never has to be pasted into the notebook; the placeholder is
# kept as the fallback.
upload_batches_to_hub(
    train_batches + test_batches,
    "knowledge_base_genai",
    token=os.environ.get("HF_TOKEN", "change to your token"),
)


## Tips for using AWS

In [None]:
# upload a file to the home directory on the instance
# (the ":~/" after the host is required: without a ":" scp treats the last
# argument as a local file name and nothing is sent to the instance)
scp -i "genai.pem" faiss_create.py ubuntu@ec2-35-153-208-211.compute-1.amazonaws.com:~/

In [None]:
# open an interactive SFTP session on the instance, authenticating with the genai.pem key file
sftp -i genai.pem ubuntu@ec2-35-153-208-211.compute-1.amazonaws.com

In [None]:
# sftp command (not a shell command): run it inside an SFTP session to upload the file
put faiss_create.py

In [None]:
# connect to the instance over SSH, authenticating with the genai.pem key file
ssh -i genai.pem ubuntu@ec2-35-153-208-211.compute-1.amazonaws.com

In [None]:
# activate the Python environment under ~/env, then run a script with it
source ~/env/bin/activate
python ~/try.py

In [None]:
# delete a folder and everything inside it (replace the placeholder path first; this cannot be undone)
rm -rf /path/to/your/folder