In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import scipy.sparse as sp
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import json
import random
from scipy.sparse import vstack, csr_matrix
#nltk.download('stopwords')
#nltk.download('wordnet')
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm
import torch
from transformers import AutoModel,AutoTokenizer
from sentence_transformers.quantization import quantize_embeddings
import os

InteractiveShell.ast_node_interactivity = "all"

#!jupyter labextension install @jupyter-widgets/jupyterlab-manager
#!jupyter nbextension enable --py widgetsnbextension

# Load stopwords
# NOTE: the nltk.download('stopwords') / nltk.download('wordnet') calls next to the
# imports are commented out, so both corpora presumably must already be present
# locally for these two objects to work.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a raw string for bag-of-words style processing.

    Steps, in order: lowercase, strip ASCII punctuation, delete digit runs,
    split on whitespace, drop English stopwords, lemmatize what is left.

    Args:
        text (str): Raw input text.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = re.sub(r'\d+', '', text.lower().translate(punctuation_table))
    # Stopwords are filtered on the surface form, before lemmatization.
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in cleaned.split()
        if token not in stop_words
    )
    return ' '.join(lemmas)

In [2]:
import torch
print(torch.__version__)          # PyTorch build string (recorded run: 2.5.1+cu124)
print(torch.version.cuda)         # CUDA version of the PyTorch build (recorded run: 12.4)
print(torch.cuda.is_available())  # True when CUDA can be used (recorded run: True)

2.5.1+cu124
12.4
True


In [3]:
# Select the CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression so the notebook displays the selected device

device(type='cuda')

In [4]:

class CORPS:
    # Notebook-wide cache for the minicorpus used by cde_embeddings: it is None
    # until cde_embeddings fills it, and is reused as-is once it is set.
    minicorpus_docs = None


def cde_embeddings(corpus, tokenizer_model="bert-base-uncased", model_name="jxm/cde-small-v2"):
    """
    Embed a corpus with a dataset-conditioned SentenceTransformer model.

    A minicorpus is drawn from ``corpus`` once, cached on
    ``CORPS.minicorpus_docs`` and encoded into ``dataset_embeddings``; every
    document is then encoded conditioned on those dataset embeddings.

    Args:
        corpus (list): List of text documents.
        tokenizer_model (str): Unused; kept so existing callers keep working.
        model_name (str): SentenceTransformer model name to load.

    Returns:
        scipy.sparse matrix: One row per document, in corpus order. Empty
            documents become an all-zero row. Documents with more than 512
            whitespace-separated words are split into 500-word chunks whose
            embeddings are averaged.

    Raises:
        AssertionError: If a previously cached minicorpus does not have the
            size the model requires.
    """
    model = SentenceTransformer(model_name, trust_remote_code=True)
    minicorpus_size = model[0].config.transductive_corpus_size

    if CORPS.minicorpus_docs is None:
        if len(corpus) >= minicorpus_size:
            CORPS.minicorpus_docs = random.sample(corpus, k=minicorpus_size)
        else:
            # random.sample raises when k > len(corpus); oversample with
            # replacement so small corpora still yield a full minicorpus.
            CORPS.minicorpus_docs = random.choices(corpus, k=minicorpus_size)
    # The model must be given exactly this many minicorpus documents.
    assert len(CORPS.minicorpus_docs) == minicorpus_size

    dataset_embeddings = model.encode(
        CORPS.minicorpus_docs,
        prompt_name="document",
        convert_to_tensor=True
    )

    def _encode(text):
        """Encode one text against the minicorpus and return a CPU numpy vector."""
        tensor = model.encode(
            text,
            prompt_name="document",
            dataset_embeddings=dataset_embeddings,
            convert_to_tensor=True,
        )
        # Converting here keeps every entry of doc_embeddings a numpy array, so
        # zero vectors, chunk means and plain embeddings can be stacked together.
        return tensor.detach().cpu().numpy()

    embedding_dim = model.get_sentence_embedding_dimension()
    doc_embeddings = []
    for doc in tqdm(corpus, total=len(corpus)):
        # Empty documents get a zero vector instead of a model call.
        if not doc:
            # float32 presumably matches the model output dtype — TODO confirm.
            doc_embeddings.append(np.zeros(embedding_dim, dtype=np.float32))
            continue

        words = doc.split()
        if len(words) > 512:
            # Word count is used as a rough proxy for the token limit: split
            # into 500-word chunks and mean-pool the chunk embeddings.
            chunks = [' '.join(words[i:i + 500]) for i in range(0, len(words), 500)]
            chunk_embeddings = [_encode(chunk) for chunk in chunks]
            doc_embeddings.append(np.mean(chunk_embeddings, axis=0))
        else:
            doc_embeddings.append(_encode(doc))

    sparse_embeddings = [csr_matrix(embedding.reshape(1, -1)) for embedding in doc_embeddings]
    return vstack(sparse_embeddings)



## Generate JINA Embedding

In [6]:
def generate_jina_embedding(model,corpus,max_token=8192,truncate_dim=1024):
    """Encode ``corpus`` with ``model`` and return the embeddings as a sparse matrix.

    Args:
        model: Object exposing ``encode(corpus, convert_to_tensor=...,
            show_progress_bar=..., truncate_dim=...)`` that returns a tensor.
        corpus (list): Texts to embed.
        max_token (int): Accepted but not used by this function.
        truncate_dim (int): Forwarded to ``model.encode``.

    Returns:
        scipy.sparse matrix: One row per encoded text, in corpus order.
    """
    encoded = model.encode(
        corpus,
        convert_to_tensor=True,
        show_progress_bar=True,
        truncate_dim=truncate_dim,
    )
    # Print the per-document embedding shape as a quick sanity check.
    print(encoded[0].shape)

    rows = []
    for vector in encoded:
        rows.append(csr_matrix(vector.cpu().numpy()))
    return vstack(rows)
# Retrieve float32 embeddings for the truncation sizes [64, 128, 256, 512]

def get_all(model=None):
    """Embed train.csv and test.csv at several truncation sizes and save them.

    For each CSV, rows whose ``text`` is missing or empty are dropped, the
    remaining texts are embedded with ``generate_jina_embedding`` at 64, 128,
    256 and 512 dimensions, and each result is written to
    ``<dim>/<name>_embeddings.npz`` as CSR components plus the labels.

    Args:
        model: Encoder passed to ``generate_jina_embedding``. Defaults to the
            notebook-level ``jina_model`` so existing ``get_all()`` calls keep
            working.
    """
    encoder = jina_model if model is None else model
    # `files` is never imported in this notebook (presumably google.colab's
    # download helper); look it up instead of raising NameError when absent.
    download_helper = globals().get("files")

    for path in ["train.csv", "test.csv"]:
        df = pd.read_csv(path)
        # Keep only rows with non-missing, non-empty text.
        df = df[df["text"].notna() & (df["text"] != "")]
        texts = df["text"].tolist()
        y = df["label"].tolist()

        for truncate_dim in [64, 128, 256, 512]:
            X = generate_jina_embedding(encoder, texts, truncate_dim=truncate_dim)

            save_path = f"{truncate_dim}/" + path.replace(".csv", "_embeddings.npz")
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            np.savez(save_path,
                     data=X.data,
                     indices=X.indices,
                     indptr=X.indptr,
                     shape=X.shape,
                     labels=y)
            if download_helper is not None:
                download_helper.download(save_path)

In [7]:
import torch

# List every CUDA device PyTorch can see.
device_count = torch.cuda.device_count()
if device_count > 0:
    for i in range(device_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU found, using CPU.")

# torch.cuda.is_available() reports CUDA support, not CPU availability, so the
# label says CUDA (the original printed this value as "CPU available").
print("CUDA available:", torch.cuda.is_available())

GPU 0: NVIDIA GeForce RTX 2080 Ti
CPU available: True


In [8]:
def binary_quantize(embeddings):
    """Binary (1-bit) quantization by sign.

    Args:
        embeddings (np.ndarray): Float embeddings.

    Returns:
        np.ndarray: uint8 array of the same shape, 1 where a component is > 0
            and 0 otherwise.
    """
    return np.where(embeddings > 0, 1, 0).astype(np.uint8)


def int8_quantize(embeddings, calibration_samples=1000):
    """Symmetric int8 quantization with a scale calibrated on the first rows.

    Args:
        embeddings (np.ndarray): Float embeddings.
        calibration_samples (int): Number of leading rows used to compute the
            scale.

    Returns:
        tuple: ``(quantized, scale)`` where ``quantized`` is an int8 array in
            [-127, 127] and ``scale`` maps it back via ``quantized * scale``.
    """
    embeddings = np.asarray(embeddings)
    max_abs = np.max(np.abs(embeddings[:calibration_samples]))
    # An all-zero calibration slice would give scale 0 and a division by zero.
    scale = max_abs / 127 if max_abs > 0 else 1.0
    # Round to nearest (a bare cast truncates toward zero) and clip, because
    # rows outside the calibration slice may exceed its range and would
    # otherwise overflow the int8 cast.
    quantized = np.clip(np.rint(embeddings / scale), -127, 127).astype(np.int8)
    return quantized, scale


def binary_cosine(a, b):
    """Cosine distance (1 - cosine similarity) between two 1-D binary vectors.

    Args:
        a, b (array-like): 0/1 vectors of equal length.

    Returns:
        float: ``1 - |a AND b| / sqrt(|a| * |b|)``. Returns 1.0 when either
            vector has no set bits, since the similarity is undefined there.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Widen before the dot product: on uint8 inputs np.dot accumulates in uint8
    # and wraps around once more than 255 bits overlap.
    intersection = np.dot(a.astype(np.int64), b.astype(np.int64))
    norm_a = np.count_nonzero(a)
    norm_b = np.count_nonzero(b)
    if norm_a == 0 or norm_b == 0:
        return 1.0
    return 1 - intersection / np.sqrt(norm_a * norm_b)


def int8_dequantize(quantized, scale):
    """Map int8 codes back to float32 by multiplying with ``scale``."""
    return quantized.astype(np.float32) * scale


In [9]:
def test_jina():
    """Smoke-test Jina v3 embeddings at 64 dimensions and the manual quantizers.

    Loads ``jinaai/jina-embeddings-v3``, prints the cosine similarity of two
    sample sentences (once with the classification task/prompt, once with the
    default encode call), then L2-normalizes the embeddings and runs both the
    binary and the int8 quantizer on them. Returns nothing.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    jina_sentence_model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True,device=device,truncate_dim=64)

    def cos_sim(a, b):
        return (a @ b.T) / (norm(a)*norm(b))

    task = "classification"
    texts = ["What is the weather like in Berlin today?","What is the weather like in Paris today?"]
    embeddings = jina_sentence_model.encode(
        texts,
        task=task,
        prompt_name=task,
        truncate_dim=64
    )
    print("Similarity: ",cos_sim(embeddings[0], embeddings[1]))

    with torch.no_grad():
        # Two texts at truncate_dim=64, so the expected shape is (2, 64).
        embeddings = jina_sentence_model.encode(texts, convert_to_tensor=True, truncate_dim=64)

    # Convert to a float32 numpy array for processing.
    embeddings = embeddings.cpu().type(torch.float32).numpy()

    print( "Normal",
        cos_sim(embeddings[0], embeddings[1])
    )

    # Normalize embeddings before quantization.
    normalized_embeddings = embeddings/np.linalg.norm(embeddings, axis=1, keepdims=True)

    # Manual quantization. Both calls must come after normalization: the
    # original ran int8_quantize first, on a name that was not yet assigned.
    binary_embeddings = binary_quantize(normalized_embeddings)
    int8_embeddings, scale_factor = int8_quantize(normalized_embeddings)

#test_jina()

In [10]:
import numpy as np
task = "classification"

def binary_quantize(embeddings):
    """Sign-binarize embeddings: uint8 output, 1 where a component is > 0, else 0."""
    positive_mask = np.asarray(embeddings > 0)
    return positive_mask.astype(np.uint8)

def int8_quantize(embeddings, calibration_samples=1000):
    """Symmetric int8 quantization with a scale calibrated on the first rows.

    Args:
        embeddings (np.ndarray): Float embeddings.
        calibration_samples (int): Number of leading rows used to compute the
            scale.

    Returns:
        tuple: ``(quantized, scale)`` where ``quantized`` is an int8 array in
            [-127, 127] and ``scale`` maps it back via ``quantized * scale``.
    """
    embeddings = np.asarray(embeddings)
    max_abs = np.max(np.abs(embeddings[:calibration_samples]))
    # An all-zero calibration slice would give scale 0 and a division by zero.
    scale = max_abs / 127 if max_abs > 0 else 1.0
    # Round to nearest (a bare cast truncates toward zero) and clip, because
    # rows outside the calibration slice may exceed its range and would
    # otherwise overflow the int8 cast.
    quantized = np.clip(np.rint(embeddings / scale), -127, 127).astype(np.int8)
    return quantized, scale

def binary_cosine(a, b):
    """Cosine distance (1 - cosine similarity) between two 1-D binary vectors.

    Args:
        a, b (array-like): 0/1 vectors of equal length.

    Returns:
        float: ``1 - |a AND b| / sqrt(|a| * |b|)``. Returns 1.0 when either
            vector has no set bits, since the similarity is undefined there.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Widen before the dot product: on uint8 inputs np.dot accumulates in uint8
    # and wraps around once more than 255 bits overlap.
    intersection = np.dot(a.astype(np.int64), b.astype(np.int64))
    norm_a = np.count_nonzero(a)
    norm_b = np.count_nonzero(b)
    if norm_a == 0 or norm_b == 0:
        return 1.0
    return 1 - intersection / np.sqrt(norm_a * norm_b)


def generate_quntized_jina_embedding(embedding_model,corpus,max_token=8192,truncate_dim=1024):
    """Encode a corpus and return float, binary and int8 versions of the embeddings.

    Args:
        embedding_model: Model exposing a SentenceTransformer-style ``encode``.
        corpus (list): Texts to embed.
        max_token (int): Accepted but not used by this function.
        truncate_dim (int): Forwarded to ``encode``.

    Returns:
        tuple: Three CSR matrices with one row per text —
            ``(float32 embeddings as returned by the model,
            binary_quantize of the L2-normalized embeddings,
            int8_quantize of the L2-normalized embeddings)``.
            The int8 scale factor is not returned.
    """
    # One local name for both task and prompt, so the call no longer depends on
    # a notebook-level `task` variable being defined when it runs.
    jina_task = "classification"
    with torch.no_grad():
        doc_embeddings = embedding_model.encode(corpus,
                                                convert_to_tensor=True,
                                                truncate_dim=truncate_dim,
                                                batch_size=4,
                                                task=jina_task,
                                                prompt_name=jina_task,
                                                show_progress_bar=True
                                                )

    doc_embeddings = doc_embeddings.cpu().type(torch.float32).numpy()

    norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    # A zero-norm row would become NaN after division and then quantize to
    # garbage; dividing by 1 leaves it as zeros instead.
    norms[norms == 0] = 1.0
    normalized_embeddings = doc_embeddings / norms

    binary_embeddings = binary_quantize(normalized_embeddings)
    scalar_embeddings, _scale_factor = int8_quantize(normalized_embeddings)

    # Convert each 2-D array in one step instead of stacking per-row matrices.
    return csr_matrix(doc_embeddings), csr_matrix(binary_embeddings), csr_matrix(scalar_embeddings)


In [11]:
# Generate and save float32, binary and int8 embeddings for each truncation size (default: 64-1024)
import os
def create_embeddings_from_qmodel(paths=["test.csv","train.csv"],embeds=[64,128,256,512,768,1024],save_base_path="./"):
    """Embed each CSV at each truncation size and save float, binary and int8 versions.

    For every file in ``paths`` (columns ``text`` and ``label`` are read), rows
    with missing or empty text are dropped. For every size in ``embeds`` a
    ``jinaai/jina-embeddings-v3`` model is loaded, the texts are embedded via
    ``generate_quntized_jina_embedding`` and three ``.npz`` files are written
    under ``<save_base_path>/<dim>/``:

        <name><dim>_embeddings.npz                  (float embeddings)
        binary/<name>_binary_<dim>_embeddings.npz   (binary embeddings)
        scalar/<name>_scalar_<dim>_embeddings.npz   (int8 embeddings)

    Each file stores the CSR components (data, indices, indptr, shape) and the
    labels.

    Args:
        paths (list): CSV files to process.
        embeds (list): Truncation dimensions to generate.
        save_base_path (str): Output root directory, with or without a trailing
            separator.
    """
    import gc

    def _save_sparse_npz(target_path, matrix, labels):
        """Write the CSR components of ``matrix`` plus ``labels``, creating the directory."""
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        np.savez(target_path,
                 data=matrix.data,
                 indices=matrix.indices,
                 indptr=matrix.indptr,
                 shape=matrix.shape,
                 labels=labels)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for path in paths:
        df = pd.read_csv(path)
        # Keep only rows with non-missing, non-empty text.
        df = df[df["text"].notna() & (df["text"] != "")]
        texts = df["text"].tolist()
        y = df["label"].tolist()
        csv_name = os.path.basename(path)

        embedding_model = None
        for truncate_dim in embeds:
            print("truncate_dim: ",truncate_dim)
            print(device)

            # Release the previous model before loading the next one: `del`
            # alone may leave cyclic references and cached CUDA blocks alive.
            del embedding_model
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # NOTE(review): the model is reloaded for every size even though
            # truncate_dim is also passed at encode time — confirm whether one
            # load per file would be enough.
            embedding_model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True,device=device,truncate_dim=truncate_dim).to(device)
            matryoshka_x, binary_x, scalar_X = generate_quntized_jina_embedding(embedding_model, texts, truncate_dim=truncate_dim)
            print("binary shape",binary_x.shape)
            print("scalar shape",scalar_X.shape)

            # os.path.join keeps the layout correct when save_base_path has no
            # trailing slash. The float file name has no underscore before the
            # dimension; it is kept as-is so existing loaders still find it.
            dim_dir = os.path.join(save_base_path, str(truncate_dim))
            matryoshka_x_path = os.path.join(dim_dir, csv_name.replace(".csv", f"{truncate_dim}_embeddings.npz"))
            binary_save_path = os.path.join(dim_dir, "binary", csv_name.replace(".csv", f"_binary_{truncate_dim}_embeddings.npz"))
            scalar_save_path = os.path.join(dim_dir, "scalar", csv_name.replace(".csv", f"_scalar_{truncate_dim}_embeddings.npz"))

            _save_sparse_npz(matryoshka_x_path, matryoshka_x, y)
            _save_sparse_npz(binary_save_path, binary_x, y)
            _save_sparse_npz(scalar_save_path, scalar_X, y)


## 20news JINA Embedding Generation

In [19]:
import torch
torch.cuda.empty_cache()
base_path = "/home/jovyan/master-thesis/data/processed/20news/"
save_path= "/home/jovyan/master-thesis/data/embeddings/jina_20news/"

# os.listdir returns entries in arbitrary order, so picking files by position is
# not reproducible. Name the two splits explicitly; these are the same files, in
# the same order, that the recorded run of this cell printed.
pathes = [base_path + name for name in ("20_newsgroups_train.csv", "20_newsgroups_test.csv")]
print(pathes)


['/home/jovyan/master-thesis/data/processed/20news/20_newsgroups_train.csv', '/home/jovyan/master-thesis/data/processed/20news/20_newsgroups_test.csv']


In [20]:
# Embed the files currently listed in `pathes` at every truncation size and
# write the results under `save_path`.
# NOTE(review): the recorded run of this cell stopped with a CUDA OutOfMemoryError.
create_embeddings_from_qmodel(paths=pathes,embeds=[64,128,256,512,768,1024],save_base_path=save_path) # remaining experiment

truncate_dim:  64
cuda


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Batches:   0%|          | 0/1886 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.01 GiB. GPU 0 has a total capacity of 10.74 GiB of which 3.32 GiB is free. Process 2327763 has 7.42 GiB memory in use. Of the allocated memory 6.69 GiB is allocated by PyTorch, and 559.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## AGNews JINA Embedding Generation

In [12]:
import torch
torch.cuda.empty_cache()
base_path = "/home/jovyan/master-thesis/data/processed/agnews/"
save_path= "/home/jovyan/master-thesis/data/embeddings/jina_agnews/"

# os.listdir returns entries in arbitrary order, so slicing the listing is not
# reproducible. Name the file explicitly; it is the one the recorded run of this
# cell printed.
pathes = [base_path + "train.csv"]
print(pathes)


['/home/jovyan/master-thesis/data/processed/agnews/train.csv']


In [13]:
# Embed the files currently listed in `pathes` at the 512, 768 and 1024
# truncation sizes and write the results under `save_path`.
# NOTE(review): the recorded run of this cell ended with a KeyboardInterrupt.
create_embeddings_from_qmodel(paths=pathes,embeds=[512,768,1024],save_base_path=save_path) # remaining experiment

truncate_dim:  512
cuda


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Batches:   0%|          | 0/30000 [00:00<?, ?it/s]

KeyboardInterrupt: 

## PubmedRCT20k JINA Embedding Generation

In [12]:
import torch
torch.cuda.empty_cache()
base_path = "/home/jovyan/master-thesis/data/processed/pubmed20k/"
save_path= "/home/jovyan/master-thesis/data/embeddings/jina_pubmed20k/"

# os.listdir returns entries in arbitrary order, so slicing the listing is not
# reproducible. Name the file explicitly; it is the one the recorded run of this
# cell printed.
pathes = [base_path + "train.csv"]
print(pathes)


['/home/jovyan/master-thesis/data/processed/pubmed20k/train.csv']


In [13]:
# Embed the files currently listed in `pathes` at the 128, 256, 512 and 768
# truncation sizes and write the results under `save_path`.
# NOTE(review): the recorded run of this cell ended with a KeyboardInterrupt.
create_embeddings_from_qmodel(paths=pathes,embeds=[128,256,512,768],save_base_path=save_path) # remaining experiment

truncate_dim:  128
cuda


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Batches:   0%|          | 0/45010 [00:00<?, ?it/s]

KeyboardInterrupt: 

## ECHR JINA Embedding Generation

In [12]:
import torch
torch.cuda.empty_cache()
base_path = "/home/jovyan/master-thesis/data/processed/echr/"
save_path= "/home/jovyan/master-thesis/data/embeddings/jina_echr/"

# os.listdir returns entries in arbitrary order, so relying on the listing is not
# reproducible. Name the two splits explicitly; these are the same files, in the
# same order, that the recorded run of this cell printed.
pathes = [base_path + name for name in ("test.csv", "train.csv")]
print(pathes)


['/home/jovyan/master-thesis/data/processed/echr/test.csv', '/home/jovyan/master-thesis/data/processed/echr/train.csv']


In [13]:
# Embed the files currently listed in `pathes` at every truncation size and
# write the results under `save_path`.
# NOTE(review): the recorded run of this cell stopped with a CUDA OutOfMemoryError.
create_embeddings_from_qmodel(paths=pathes,embeds=[64,128,256,512,768,1024],save_base_path=save_path) # remaining experiment

truncate_dim:  64
cuda


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Batches:   0%|          | 0/1380 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 10.74 GiB of which 1.84 GiB is free. Process 1304538 has 7.10 GiB memory in use. Process 2355779 has 1.80 GiB memory in use. Of the allocated memory 1.22 GiB is allocated by PyTorch, and 409.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Patent JINA Embedding Generation

In [41]:
import torch
torch.cuda.empty_cache()
base_path = "/home/jovyan/master-thesis/data/processed/patent/"
# The original pointed at ".../jina_echr/" (copied from the ECHR cell), which
# would have written the patent embeddings into the ECHR output directory.
# "jina_patent" follows the jina_<dataset> naming of the other cells.
save_path= "/home/jovyan/master-thesis/data/embeddings/jina_patent/"

# os.listdir returns entries in arbitrary order, so slicing the listing is not
# reproducible. Name the three splits explicitly; these are the same files, in
# the same order, that the recorded run of this cell printed.
patent_pathes = [
    base_path + name
    for name in ("patent_validation.csv", "patent_test.csv", "patent_train.csv")
]
print(patent_pathes)


['/home/jovyan/master-thesis/data/processed/patent/patent_validation.csv', '/home/jovyan/master-thesis/data/processed/patent/patent_test.csv', '/home/jovyan/master-thesis/data/processed/patent/patent_train.csv']


In [None]:
# Embed the files listed in `patent_pathes` at every truncation size and write
# the results under `save_path`.
# NOTE(review): this cell has no execution count or output recorded.
create_embeddings_from_qmodel(paths=patent_pathes,embeds=[64,128,256,512,768,1024],save_base_path=save_path) # remaining experiment