In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# Instantiate the embedding model, then move it to the GPU.
# trust_remote_code=True is passed so the model repository's own Python files
# are used when loading (the download log lists modeling_qwen.py and
# tokenization_qwen.py).
model = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True)
model = model.cuda()

# Read the three preprocessed, case-sensitive splits in train / validation / test order
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../2_preprocessing/train_case_sensitive.csv",
        "../2_preprocessing/validation_case_sensitive.csv",
        "../2_preprocessing/test_case_sensitive.csv",
    )
)

# Prompt name used for sentence-to-passage tasks when encoding
query_prompt_name = "s2p_query"

# Generate embeddings function
def generate_embeddings(data, column_name, prompt_name=None, batch_size=8, device="cuda"):
    """
    Generate embeddings for one column of a dataset.

    The work is delegated to the ``encode`` method of the module-level
    ``model`` object, which must be defined before this function is called.

    Args:
        data: Table-like object supporting ``data[column_name].tolist()``
            (the notebook passes pandas DataFrames).
        column_name: Name of the column whose values are encoded.
        prompt_name: Prompt name forwarded to ``model.encode``; ``None``
            (the default) forwards no prompt name.
        batch_size: Batch size forwarded to ``model.encode``.
        device: Device forwarded to ``model.encode``. Defaults to ``"cuda"``,
            the value that used to be hard-coded, so existing calls behave
            exactly as before; pass e.g. ``"cpu"`` to encode without a GPU.

    Returns:
        Whatever ``model.encode`` returns for the list of column values.
    """
    # Convert the column to a plain Python list before handing it to the model
    texts = data[column_name].tolist()
    return model.encode(
        texts,
        prompt_name=prompt_name,  # Apply prompt for queries (if specified)
        batch_size=batch_size,
        show_progress_bar=True,
        device=device,
    )

# Embed the "script" column of every split, printing a status line before each run
def _announce_and_embed(message, frame):
    """Print ``message``, then embed ``frame``'s "script" column with the query prompt."""
    print(message)
    return generate_embeddings(frame, "script", prompt_name=query_prompt_name)


train_embeddings = _announce_and_embed("Generating embeddings for training data...", train_data)

val_embeddings = _announce_and_embed("Generating embeddings for validation data...", val_data)

test_embeddings = _announce_and_embed("Generating embeddings for test data...", test_data)

# Pull the label column out of each split, in train / validation / test order
train_labels, val_labels, test_labels = (
    split["passed_bechdel"] for split in (train_data, val_data, test_data)
)

modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/169k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

modeling_qwen.py:   0%|          | 0.00/65.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_1.5B_v5:
- modeling_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

tokenization_qwen.py:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_1.5B_v5:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Generating embeddings for training data...


Batches:   0%|          | 0/178 [00:00<?, ?it/s]

Generating embeddings for validation data...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Generating embeddings for test data...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

## Save embeddings

In [2]:
import numpy as np

# Write each split's embeddings to its own .npy file (paths are relative)
embeddings_by_file = {
    "stella_train_embeddings.npy": train_embeddings,
    "stella_val_embeddings.npy": val_embeddings,
    "stella_test_embeddings.npy": test_embeddings,
}
for npy_name, embedding_matrix in embeddings_by_file.items():
    np.save(npy_name, embedding_matrix)

## Load embeddings

In [3]:
import numpy as np
import pandas as pd

# Load embeddings
train_embeddings = np.load("stella_train_embeddings.npy")
val_embeddings = np.load("stella_val_embeddings.npy")
test_embeddings = np.load("stella_test_embeddings.npy")

# Reload the labels next to the cached embeddings so this cell is a complete
# resume point: on a fresh kernel the training cell needs train_labels and
# val_labels, which are otherwise only defined by the embedding cell.
# Only the label column is read from each CSV.
LABEL_COLUMN = "passed_bechdel"
train_labels = pd.read_csv(
    "../2_preprocessing/train_case_sensitive.csv", usecols=[LABEL_COLUMN]
)[LABEL_COLUMN]
val_labels = pd.read_csv(
    "../2_preprocessing/validation_case_sensitive.csv", usecols=[LABEL_COLUMN]
)[LABEL_COLUMN]
test_labels = pd.read_csv(
    "../2_preprocessing/test_case_sensitive.csv", usecols=[LABEL_COLUMN]
)[LABEL_COLUMN]

# Fail early with a clear message if a cached .npy file no longer matches its CSV
for split_name, split_embeddings, split_labels in (
    ("train", train_embeddings, train_labels),
    ("validation", val_embeddings, val_labels),
    ("test", test_embeddings, test_labels),
):
    if len(split_embeddings) != len(split_labels):
        raise ValueError(
            f"{split_name}: {len(split_embeddings)} cached embeddings but "
            f"{len(split_labels)} labels; regenerate the embeddings."
        )

## Train classifiers

In [4]:
# Import necessary classifiers and metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Candidate classifiers, registered in the order they are trained below.
# Each one is given random_state=42.
classifiers = {}
classifiers['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
classifiers['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
classifiers['SVM'] = SVC(kernel='linear', random_state=42)

# Fit every classifier on the training embeddings and score it on the validation split
for classifier_name in classifiers:
    classifier = classifiers[classifier_name]

    print(f"Training {classifier_name}...")
    # fit() hands back the fitted estimator, so the prediction call is chained onto it
    val_predictions = classifier.fit(train_embeddings, train_labels).predict(val_embeddings)

    # Accuracy and F1 of the validation predictions
    val_accuracy, val_f1 = (
        accuracy_score(val_labels, val_predictions),
        f1_score(val_labels, val_predictions),
    )

    # Report both metrics to four decimal places
    print(f"Validation Accuracy ({classifier_name}): {val_accuracy:.4f}")
    print(f"Validation F1 Score ({classifier_name}): {val_f1:.4f}")


Training Logistic Regression...
Validation Accuracy (Logistic Regression): 0.6045
Validation F1 Score (Logistic Regression): 0.5930
Training Random Forest...
Validation Accuracy (Random Forest): 0.7401
Validation F1 Score (Random Forest): 0.7386
Training SVM...
Validation Accuracy (SVM): 0.5650
Validation F1 Score (SVM): 0.5746
