In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Embedding model: fetched by name (remote model code is trusted) and moved to the GPU.
model = SentenceTransformer(
    model_name_or_path="dunzhang/stella_en_1.5B_v5",
    trust_remote_code=True,
).cuda()

# Case-sensitive train / validation / test splits written by the preprocessing step.
train_data = pd.read_csv(filepath_or_buffer="../2_preprocessing/train_case_sensitive.csv")
val_data = pd.read_csv(filepath_or_buffer="../2_preprocessing/validation_case_sensitive.csv")
test_data = pd.read_csv(filepath_or_buffer="../2_preprocessing/test_case_sensitive.csv")

# Name of the model prompt used when encoding (sentence-to-passage query prompt).
query_prompt_name = "s2p_query"

# Generate embeddings function
def generate_embeddings(data, column_name, prompt_name=None, batch_size=8):
    """
    Encode one column of a dataset with the notebook-level ``model``.

    Args:
        data: Table-like object; ``data[column_name].tolist()`` must yield the
            values to encode.
        column_name: Name of the column holding the values to embed.
        prompt_name: Optional prompt name forwarded to ``model.encode``
            (``None`` is passed through unchanged).
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for the column values.
    """
    texts = data[column_name].tolist()
    # Encoding always targets the GPU and shows a progress bar.
    encode_options = {
        "prompt_name": prompt_name,
        "batch_size": batch_size,
        "show_progress_bar": True,
        "device": "cuda",
    }
    return model.encode(texts, **encode_options)

# Embed the "script" column of every split with the same query prompt.
print("Generating embeddings for training data...")
train_embeddings = generate_embeddings(
    data=train_data, column_name="script", prompt_name=query_prompt_name
)

print("Generating embeddings for validation data...")
val_embeddings = generate_embeddings(
    data=val_data, column_name="script", prompt_name=query_prompt_name
)

print("Generating embeddings for test data...")
test_embeddings = generate_embeddings(
    data=test_data, column_name="script", prompt_name=query_prompt_name
)

# Target column for each split, in the same order as the embeddings above.
train_labels, val_labels, test_labels = [
    split["passed_bechdel"] for split in (train_data, val_data, test_data)
]

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Train classifiers
# Candidate models keyed by display name. RandomForestClassifier and SVC must be
# imported (sklearn.ensemble / sklearn.svm) alongside LogisticRegression, otherwise
# building this dict raises NameError on a fresh kernel.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42)
}

# Choose the classifier you want to use
classifier_name = 'Random Forest'  # Change this to 'Logistic Regression' or 'SVM' as needed
if classifier_name not in classifiers:
    # The name is edited by hand, so report the valid options instead of a bare KeyError.
    raise KeyError(
        f"Unknown classifier {classifier_name!r}; choose one of {sorted(classifiers)}"
    )
classifier = classifiers[classifier_name]

# Train the chosen classifier
print(f"Training {classifier_name}...")
classifier.fit(train_embeddings, train_labels)

# Evaluate on validation data
val_predictions = classifier.predict(val_embeddings)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy ({classifier_name}): {val_accuracy:.4f}")

In [None]:

# Separate test evaluation function
def evaluate_test_accuracy():
    """
    Score the trained classifier on the held-out test split and print the result.

    Reads the notebook-level ``classifier``, ``classifier_name``,
    ``test_embeddings`` and ``test_labels``; returns nothing. Kept as a
    separate function so the test set is only scored when called explicitly.
    """
    test_accuracy = accuracy_score(test_labels, classifier.predict(test_embeddings))
    print(f"Test Accuracy ({classifier_name}): {test_accuracy:.4f}")

# To evaluate the test accuracy after making tweaks, simply call:
# evaluate_test_accuracy()