In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# Load pre-trained model
model = SentenceTransformer("Salesforce/SFR-Embedding-2_R")

# Load your dataset
train_data = pd.read_csv("../2_preprocessing/train_case_sensitive.csv")
val_data = pd.read_csv("../2_preprocessing/validation_case_sensitive.csv")
test_data = pd.read_csv("../2_preprocessing/test_case_sensitive.csv")

# Encode scripts into embeddings
print("Generating embeddings for training data...")
train_embeddings = model.encode(train_data["script"].tolist(), batch_size=16, show_progress_bar=True)
print("Generating embeddings for validation data...")
val_embeddings = model.encode(val_data["script"].tolist(), batch_size=16, show_progress_bar=True)
print("Generating embeddings for test data...")
test_embeddings = model.encode(test_data["script"].tolist(), batch_size=16, show_progress_bar=True)

# Prepare labels
train_labels = train_data["passed_bechdel"]
val_labels = val_data["passed_bechdel"]
test_labels = test_data["passed_bechdel"]

# Train a simple classifier (Logistic Regression)
classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier.fit(train_embeddings, train_labels)

# Evaluate on validation data
val_predictions = classifier.predict(val_embeddings)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Test the model
test_predictions = classifier.predict(test_embeddings)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 21.95 GiB of which 213.00 MiB is free. Process 883256 has 1.86 GiB memory in use. Process 884163 has 19.87 GiB memory in use. Of the allocated memory 1.68 GiB is allocated by PyTorch, and 5.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Error message pasted below just in case

OutOfMemoryError                          Traceback (most recent call last)
Cell In[1], line 7
      4 import pandas as pd
      6 # Load pre-trained model
----> 7 model = SentenceTransformer("Salesforce/SFR-Embedding-2_R")
      9 # Load your dataset
     10 train_data = pd.read_csv("train_case_sensitive.csv")

File ~/.local/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py:347, in SentenceTransformer.__init__(self, model_name_or_path, modules, device, prompts, default_prompt_name, similarity_fn_name, cache_folder, trust_remote_code, revision, local_files_only, token, use_auth_token, truncate_dim, model_kwargs, tokenizer_kwargs, config_kwargs, model_card_data, backend)
    344 except StopIteration:
    345     pass
--> 347 self.to(device)
    348 self.is_hpu_graph_enabled = False
    350 if self.default_prompt_name is not None and self.default_prompt_name not in self.prompts:

File ~/.local/lib/python3.12/site-packages/torch/nn/modules/module.py:1340, in Module.to(self, *args, **kwargs)
   1337         else:
   1338             raise
-> 1340 return self._apply(convert)

File ~/.local/lib/python3.12/site-packages/torch/nn/modules/module.py:900, in Module._apply(self, fn, recurse)
    898 if recurse:
    899     for module in self.children():
--> 900         module._apply(fn)
...
   1330     )
   1331 except NotImplementedError as e:
   1332     if str(e) == "Cannot copy out of meta tensor; no data!":

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 21.95 GiB of which 56.12 MiB is free. Process 880225 has 21.89 GiB memory in use. Of the allocated memory 21.71 GiB is allocated by PyTorch, and 1.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# code after memory optimizations suggested by ChatGPT (not working for me still)

import torch

# Clear GPU cache
torch.cuda.empty_cache()

# Set memory configuration for better management
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# Load pre-trained model
model = SentenceTransformer("Salesforce/SFR-Embedding-2_R", device="cuda")  # Ensure model is on GPU

# Load your dataset
train_data = pd.read_csv("../2_preprocessing/train_case_sensitive.csv")
val_data = pd.read_csv("../2_preprocessing/validation_case_sensitive.csv")
test_data = pd.read_csv("../2_preprocessing/test_case_sensitive.csv")

# Optimize batch size to reduce memory usage
BATCH_SIZE = 8  # Reduce batch size to fit in memory

# Generate embeddings with a reduced batch size
print("Generating embeddings for training data...")
train_embeddings = model.encode(
    train_data["script"].tolist(),
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    device="cuda",  # Ensure embeddings are computed on GPU
    dtype=torch.float16  # Use mixed precision
)

print("Generating embeddings for validation data...")
val_embeddings = model.encode(
    val_data["script"].tolist(),
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    device="cuda",
    dtype=torch.float16
)

print("Generating embeddings for test data...")
test_embeddings = model.encode(
    test_data["script"].tolist(),
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    device="cuda",
    dtype=torch.float16
)

# Prepare labels
train_labels = train_data["passed_bechdel"]
val_labels = val_data["passed_bechdel"]
test_labels = test_data["passed_bechdel"]

# Train a simple classifier (Logistic Regression)
classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier.fit(train_embeddings, train_labels)

# Evaluate on validation data
val_predictions = classifier.predict(val_embeddings)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Test the model
test_predictions = classifier.predict(test_embeddings)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
