In [5]:
#Part2_O1_MNRL_Prefix5_Multi_GPU_Final/step_10000

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Pick the device the bi-encoder runs on: first GPU if present, otherwise CPU
if torch.cuda.is_available():
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"
else:
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"

# Set max sequence length to 256 (increased from 128)
max_seq_length = 256

# Bi encoder model path
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/BI ENCODER TRAINING /ToSubmit_Part2_O1_MNRL_Prefix5_Multi_GPU_Final_Rv1/step_10000/model"

# Load the bi-encoder model directly on the selected device
model = SentenceTransformer(bi_encoder_model, device=device)
model.max_seq_length = max_seq_length  # Longer inputs are truncated by the model

# Load and prepare legacy attributes from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Handle NaN values by replacing with empty strings
legacy_df['label'] = legacy_df['label'].fillna('')
legacy_df['definition'] = legacy_df['definition'].fillna('')

# Create properly formatted passages for the legacy attributes
# Format: "passage: [label]. [definition]" to avoid nested prefixes
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode with normalization; encode() does the batching itself, so the batch
# size is passed through instead of slicing the list by hand
batch_size = 128
legacy_embeddings = model.encode(
    legacy_attributes_with_prefix,
    batch_size=batch_size,
    normalize_embeddings=True,
)

# Create FAISS index
dimension = legacy_embeddings.shape[1]

# Inner product on normalized vectors equals cosine similarity
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Load sentences from 'golden_dataset_sentences.csv'
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/BI ENCODER TRAINING /golden_dataset_sentences.csv')

# Rename 'Job_ID' to 'job_id' for consistency
sentences_df.rename(columns={'Job_ID': 'job_id'}, inplace=True)

# Format queries with proper prefix; missing sentences become empty strings
# (same treatment as the legacy columns) rather than the literal text "nan"
sentences_with_prefix = [
    f"query: {sentence}" for sentence in sentences_df['Sentences'].fillna('')
]

# Encode sentences with the same normalization as the passages
sentence_embeddings = model.encode(
    sentences_with_prefix,
    batch_size=batch_size,
    normalize_embeddings=True,
)

# Load ground truth data
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Clean and filter ground truth labels
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
# Rows without a display name are dropped: a NaN cannot be sorted together
# with strings and has no .lower() for normalize_attribute
ground_truth = (
    positive_labels.dropna(subset=['display_name'])
    .groupby('job_id')['display_name']
    .unique()
    .apply(sorted)
    .to_dict()
)

def normalize_attribute(attr):
    """Return `attr` lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    Reads the module-level `index`, `sentence_embeddings`, `sentences_df`,
    `legacy_df` and `ground_truth` objects.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        `ground_truth`; (0.0, 0.0, 0.0) when `ground_truth` is empty.
    """
    # Only the neighbour indices matter here; the similarity scores are ignored
    _, I_faiss = index.search(sentence_embeddings, K)

    # Normalize each legacy label once up front instead of doing a pandas
    # lookup plus normalization for every retrieved hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # A job's prediction set is the union over all of its sentences
    job_predictions = {}
    top_hits = I_faiss[:, :max(top_n_results, 0)]
    for job_id, hits in zip(sentences_df['job_id'], top_hits):
        predicted = job_predictions.setdefault(job_id, set())
        # Negative indices mark missing neighbours and are skipped
        predicted.update(normalized_labels[idx] for idx in hits if idx >= 0)

    # Per-job precision / recall / F1, only for jobs that have ground truth
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    Every retrieved hit whose score is strictly greater than `threshold`
    counts as a prediction for the sentence's job. Reads the module-level
    `index`, `sentence_embeddings`, `sentences_df`, `legacy_df` and
    `ground_truth` objects.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity score for a hit to count

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        `ground_truth`; (0.0, 0.0, 0.0) when `ground_truth` is empty.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Normalize each legacy label once up front instead of doing a pandas
    # lookup plus normalization for every retrieved hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # A job's prediction set is the union over all of its sentences
    job_predictions = {}
    for job_id, hits, scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        # Keep valid (non-negative) indices whose similarity beats the threshold
        keep = (hits >= 0) & (scores > threshold)
        predicted.update(normalized_labels[idx] for idx in hits[keep])

    # Per-job precision / recall / F1, only for jobs that have ground truth
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# A single retrieval depth (K=2000) is used for both evaluation approaches
K = 2000
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5, 0.4]  # Similarity cut-offs, strictest first
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200, 300, 400, 500, 1000, 1500, 1600]  # Rank cut-offs, smallest first

# Approach 1: a hit is a prediction when its similarity exceeds the threshold
print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
print(f"\n=== Testing with K={K} ===")
for threshold in thresholds:
    avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
    print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Approach 2: the first Top_N hits of every sentence are predictions
print("=== EVALUATION APPROACH 2: RANKING-BASED ===")
print(f"=== Testing with K={K} ===")
for top_n in top_n_values:
    if top_n > K:  # Only K hits are retrieved, so a larger Top_N is skipped
        continue
    avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
    print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

Using GPU for SentenceTransformer: NVIDIA A10G


FileNotFoundError: Path /home/jovyan/butterfly/src/notebooks/BI ENCODER TRAINING /ToSubmit_Part2_O1_MNRL_Prefix5_Multi_GPU_Final_Rv1/step_10000/model not found

In [4]:
#Part2_O1_MNRL_Prefix5_Multi_GPU_Final/step_10000

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Pick the device the bi-encoder runs on: first GPU if present, otherwise CPU
if torch.cuda.is_available():
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"
else:
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"

# Set max sequence length to 256 (increased from 128)
max_seq_length = 256

# Bi encoder model path
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/BI ENCODER TRAINING /ToSubmit_Part2_O1_MNRL_Prefix5_Multi_GPU_Final_Rv1/step_10000/model"

# Load the bi-encoder model directly on the selected device
model = SentenceTransformer(bi_encoder_model, device=device)
model.max_seq_length = max_seq_length  # Longer inputs are truncated by the model

# Load and prepare legacy attributes from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Handle NaN values by replacing with empty strings
legacy_df['label'] = legacy_df['label'].fillna('')
legacy_df['definition'] = legacy_df['definition'].fillna('')

# Create properly formatted passages for the legacy attributes
# Format: "passage: [label]. [definition]" to avoid nested prefixes
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode with normalization; encode() does the batching itself, so the batch
# size is passed through instead of slicing the list by hand
batch_size = 128
legacy_embeddings = model.encode(
    legacy_attributes_with_prefix,
    batch_size=batch_size,
    normalize_embeddings=True,
)

# Create FAISS index
dimension = legacy_embeddings.shape[1]

# Inner product on normalized vectors equals cosine similarity
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Load sentences from 'golden_dataset_sentences.csv'
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/BI ENCODER TRAINING /golden_dataset_sentences.csv')

# Rename 'Job_ID' to 'job_id' for consistency
sentences_df.rename(columns={'Job_ID': 'job_id'}, inplace=True)

# Format queries with proper prefix; missing sentences become empty strings
# (same treatment as the legacy columns) rather than the literal text "nan"
sentences_with_prefix = [
    f"query: {sentence}" for sentence in sentences_df['Sentences'].fillna('')
]

# Encode sentences with the same normalization as the passages
sentence_embeddings = model.encode(
    sentences_with_prefix,
    batch_size=batch_size,
    normalize_embeddings=True,
)

# Load ground truth data
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Clean and filter ground truth labels
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
# Rows without a display name are dropped: a NaN cannot be sorted together
# with strings and has no .lower() for normalize_attribute
ground_truth = (
    positive_labels.dropna(subset=['display_name'])
    .groupby('job_id')['display_name']
    .unique()
    .apply(sorted)
    .to_dict()
)

def normalize_attribute(attr):
    """Return `attr` lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    Reads the module-level `index`, `sentence_embeddings`, `sentences_df`,
    `legacy_df` and `ground_truth` objects.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        `ground_truth`; (0.0, 0.0, 0.0) when `ground_truth` is empty.
    """
    # Only the neighbour indices matter here; the similarity scores are ignored
    _, I_faiss = index.search(sentence_embeddings, K)

    # Normalize each legacy label once up front instead of doing a pandas
    # lookup plus normalization for every retrieved hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # A job's prediction set is the union over all of its sentences
    job_predictions = {}
    top_hits = I_faiss[:, :max(top_n_results, 0)]
    for job_id, hits in zip(sentences_df['job_id'], top_hits):
        predicted = job_predictions.setdefault(job_id, set())
        # Negative indices mark missing neighbours and are skipped
        predicted.update(normalized_labels[idx] for idx in hits if idx >= 0)

    # Per-job precision / recall / F1, only for jobs that have ground truth
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    Every retrieved hit whose score is strictly greater than `threshold`
    counts as a prediction for the sentence's job. Reads the module-level
    `index`, `sentence_embeddings`, `sentences_df`, `legacy_df` and
    `ground_truth` objects.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity score for a hit to count

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        `ground_truth`; (0.0, 0.0, 0.0) when `ground_truth` is empty.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Normalize each legacy label once up front instead of doing a pandas
    # lookup plus normalization for every retrieved hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # A job's prediction set is the union over all of its sentences
    job_predictions = {}
    for job_id, hits, scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        # Keep valid (non-negative) indices whose similarity beats the threshold
        keep = (hits >= 0) & (scores > threshold)
        predicted.update(normalized_labels[idx] for idx in hits[keep])

    # Per-job precision / recall / F1, only for jobs that have ground truth
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# A single retrieval depth (K=2000) is used for both evaluation approaches
K = 2000
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5, 0.4]  # Similarity cut-offs, strictest first
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200, 300, 400, 500, 1000, 1500, 1600]  # Rank cut-offs, smallest first

# Approach 1: a hit is a prediction when its similarity exceeds the threshold
print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
print(f"\n=== Testing with K={K} ===")
for threshold in thresholds:
    avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
    print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Approach 2: the first Top_N hits of every sentence are predictions
print("=== EVALUATION APPROACH 2: RANKING-BASED ===")
print(f"=== Testing with K={K} ===")
for top_n in top_n_values:
    if top_n > K:  # Only K hits are retrieved, so a larger Top_N is skipped
        continue
    avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
    print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

Using GPU for SentenceTransformer: NVIDIA A10G


FileNotFoundError: Path /home/jovyan/butterfly/src/notebooks/BI ENCODER TRAINING /ToSubmit_Part2_O1_MNRL_Prefix5_Multi_GPU_Final_Rv1/step_10000/model not found

In [42]:
#Part3_e5_mnrl_DDP_new_v2_Prefix4_Multi_GPU/epoch_1

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Pick the device the bi-encoder runs on: first GPU if present, otherwise CPU
if torch.cuda.is_available():
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"
else:
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"

# Set max sequence length to 256 (increased from 128)
max_seq_length = 256

# Bi encoder model path
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/BI ENCODER TRAINING /Part3_e5_mnrl_DDP_new_v2_Prefix4_Multi_GPU/epoch_1"

# Load the bi-encoder model directly on the selected device
model = SentenceTransformer(bi_encoder_model, device=device)
model.max_seq_length = max_seq_length  # Longer inputs are truncated by the model

# Load and prepare legacy attributes from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Handle NaN values by replacing with empty strings
legacy_df['label'] = legacy_df['label'].fillna('')
legacy_df['definition'] = legacy_df['definition'].fillna('')

# Create properly formatted passages for the legacy attributes
# Format: "passage: [label]. [definition]" to avoid nested prefixes
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode with normalization; encode() does the batching itself, so the batch
# size is passed through instead of slicing the list by hand
batch_size = 128
legacy_embeddings = model.encode(
    legacy_attributes_with_prefix,
    batch_size=batch_size,
    normalize_embeddings=True,
)

# Create FAISS index
dimension = legacy_embeddings.shape[1]

# Inner product on normalized vectors equals cosine similarity
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Load sentences from 'golden_dataset_sentences.csv'
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')

# Rename 'Job_ID' to 'job_id' for consistency
sentences_df.rename(columns={'Job_ID': 'job_id'}, inplace=True)

# Format queries with proper prefix; missing sentences become empty strings
# (same treatment as the legacy columns) rather than the literal text "nan"
sentences_with_prefix = [
    f"query: {sentence}" for sentence in sentences_df['Sentences'].fillna('')
]

# Encode sentences with the same normalization as the passages
sentence_embeddings = model.encode(
    sentences_with_prefix,
    batch_size=batch_size,
    normalize_embeddings=True,
)

# Load ground truth data
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Clean and filter ground truth labels
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
# Rows without a display name are dropped: a NaN cannot be sorted together
# with strings and has no .lower() for normalize_attribute
ground_truth = (
    positive_labels.dropna(subset=['display_name'])
    .groupby('job_id')['display_name']
    .unique()
    .apply(sorted)
    .to_dict()
)

def normalize_attribute(attr):
    """Return `attr` lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    Reads the module-level `index`, `sentence_embeddings`, `sentences_df`,
    `legacy_df` and `ground_truth` objects.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        `ground_truth`; (0.0, 0.0, 0.0) when `ground_truth` is empty.
    """
    # Only the neighbour indices matter here; the similarity scores are ignored
    _, I_faiss = index.search(sentence_embeddings, K)

    # Normalize each legacy label once up front instead of doing a pandas
    # lookup plus normalization for every retrieved hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # A job's prediction set is the union over all of its sentences
    job_predictions = {}
    top_hits = I_faiss[:, :max(top_n_results, 0)]
    for job_id, hits in zip(sentences_df['job_id'], top_hits):
        predicted = job_predictions.setdefault(job_id, set())
        # Negative indices mark missing neighbours and are skipped
        predicted.update(normalized_labels[idx] for idx in hits if idx >= 0)

    # Per-job precision / recall / F1, only for jobs that have ground truth
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    Every retrieved hit whose score is strictly greater than `threshold`
    counts as a prediction for the sentence's job. Reads the module-level
    `index`, `sentence_embeddings`, `sentences_df`, `legacy_df` and
    `ground_truth` objects.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity score for a hit to count

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        `ground_truth`; (0.0, 0.0, 0.0) when `ground_truth` is empty.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Normalize each legacy label once up front instead of doing a pandas
    # lookup plus normalization for every retrieved hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # A job's prediction set is the union over all of its sentences
    job_predictions = {}
    for job_id, hits, scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        # Keep valid (non-negative) indices whose similarity beats the threshold
        keep = (hits >= 0) & (scores > threshold)
        predicted.update(normalized_labels[idx] for idx in hits[keep])

    # Per-job precision / recall / F1, only for jobs that have ground truth
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# A single retrieval depth (K=2000) is used for both evaluation approaches
K = 2000
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5, 0.4]  # Similarity cut-offs, strictest first
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200, 300, 400, 500, 1000, 1500, 1600]  # Rank cut-offs, smallest first

# Approach 1: a hit is a prediction when its similarity exceeds the threshold
print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
print(f"\n=== Testing with K={K} ===")
for threshold in thresholds:
    avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
    print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Approach 2: the first Top_N hits of every sentence are predictions
print("=== EVALUATION APPROACH 2: RANKING-BASED ===")
print(f"=== Testing with K={K} ===")
for top_n in top_n_values:
    if top_n > K:  # Only K hits are retrieved, so a larger Top_N is skipped
        continue
    avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
    print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

Using GPU for SentenceTransformer: NVIDIA A10G


FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'

In [28]:
#Part2_O1_MNRL_Prefix5_Multi_GPU_Final/step_10000

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Pick the device the bi-encoder runs on: first GPU if present, otherwise CPU
if torch.cuda.is_available():
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"
else:
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"

# Set max sequence length to 256 (increased from 128)
max_seq_length = 256

# Bi encoder model path
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/Part2_O1_MNRL_Prefix5_Multi_GPU_Final/step_10000/model"
# Other checkpoint noted here (was a bare, non-parsable line):
# butterfly/src/notebooks/20250324_155949_Rv4_bi_encoder_finetuned

# Load the bi-encoder model directly on the selected device
model = SentenceTransformer(bi_encoder_model, device=device)
model.max_seq_length = max_seq_length  # Longer inputs are truncated by the model

# Load and prepare legacy attributes from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Handle NaN values by replacing with empty strings
legacy_df['label'] = legacy_df['label'].fillna('')
legacy_df['definition'] = legacy_df['definition'].fillna('')

# Create properly formatted passages for the legacy attributes
# Format: "passage: [label]. [definition]" to avoid nested prefixes
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode with normalization; encode() does the batching itself, so the batch
# size is passed through instead of slicing the list by hand
batch_size = 128
legacy_embeddings = model.encode(
    legacy_attributes_with_prefix,
    batch_size=batch_size,
    normalize_embeddings=True,
)

# Create FAISS index
dimension = legacy_embeddings.shape[1]

# Inner product on normalized vectors equals cosine similarity
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Load sentences from 'golden_dataset_sentences.csv'
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')

# Rename 'Job_ID' to 'job_id' for consistency
sentences_df.rename(columns={'Job_ID': 'job_id'}, inplace=True)

# Format queries with proper prefix; missing sentences become empty strings
# (same treatment as the legacy columns) rather than the literal text "nan"
sentences_with_prefix = [
    f"query: {sentence}" for sentence in sentences_df['Sentences'].fillna('')
]

# Encode sentences with the same normalization as the passages
sentence_embeddings = model.encode(
    sentences_with_prefix,
    batch_size=batch_size,
    normalize_embeddings=True,
)

# Load ground truth data
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Clean and filter ground truth labels
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
# Rows without a display name are dropped: a NaN cannot be sorted together
# with strings and has no .lower() for normalize_attribute
ground_truth = (
    positive_labels.dropna(subset=['display_name'])
    .groupby('job_id')['display_name']
    .unique()
    .apply(sorted)
    .to_dict()
)

def normalize_attribute(attr):
    """Return `attr` lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    Reads the module-level `index`, `sentence_embeddings`, `sentences_df`,
    `legacy_df` and `ground_truth` objects.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        `ground_truth`; (0.0, 0.0, 0.0) when `ground_truth` is empty.
    """
    # Only the neighbour indices matter here; the similarity scores are ignored
    _, I_faiss = index.search(sentence_embeddings, K)

    # Normalize each legacy label once up front instead of doing a pandas
    # lookup plus normalization for every retrieved hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # A job's prediction set is the union over all of its sentences
    job_predictions = {}
    top_hits = I_faiss[:, :max(top_n_results, 0)]
    for job_id, hits in zip(sentences_df['job_id'], top_hits):
        predicted = job_predictions.setdefault(job_id, set())
        # Negative indices mark missing neighbours and are skipped
        predicted.update(normalized_labels[idx] for idx in hits if idx >= 0)

    # Per-job precision / recall / F1, only for jobs that have ground truth
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    Every retrieved hit whose score is strictly greater than `threshold`
    counts as a prediction for the sentence's job. Reads the module-level
    `index`, `sentence_embeddings`, `sentences_df`, `legacy_df` and
    `ground_truth` objects.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity score for a hit to count

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        `ground_truth`; (0.0, 0.0, 0.0) when `ground_truth` is empty.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Normalize each legacy label once up front instead of doing a pandas
    # lookup plus normalization for every retrieved hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # A job's prediction set is the union over all of its sentences
    job_predictions = {}
    for job_id, hits, scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        # Keep valid (non-negative) indices whose similarity beats the threshold
        keep = (hits >= 0) & (scores > threshold)
        predicted.update(normalized_labels[idx] for idx in hits[keep])

    # Per-job precision / recall / F1, only for jobs that have ground truth
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# A single retrieval depth (K=2000) is used for both evaluation approaches
K = 2000
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5, 0.4]  # Similarity cut-offs, strictest first
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200, 300, 400, 500, 1000, 1500, 1600]  # Rank cut-offs, smallest first

# Approach 1: a hit is a prediction when its similarity exceeds the threshold
print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
print(f"\n=== Testing with K={K} ===")
for threshold in thresholds:
    avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
    print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Approach 2: the first Top_N hits of every sentence are predictions
print("=== EVALUATION APPROACH 2: RANKING-BASED ===")
print(f"=== Testing with K={K} ===")
for top_n in top_n_values:
    if top_n > K:  # Only K hits are retrieved, so a larger Top_N is skipped
        continue
    avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
    print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

Using GPU for SentenceTransformer: NVIDIA A10G

=== EVALUATION APPROACH 1: THRESHOLD-BASED ===

=== Testing with K=2000 ===
K=2000, T=0.9: Precision=0.0076, Recall=0.0001, F1=0.0002
K=2000, T=0.85: Precision=0.1706, Recall=0.0045, F1=0.0086
K=2000, T=0.8: Precision=0.3336, Recall=0.0159, F1=0.0293
K=2000, T=0.7: Precision=0.3074, Recall=0.0678, F1=0.0985
K=2000, T=0.5: Precision=0.1045, Recall=0.3858, F1=0.1495
K=2000, T=0.4: Precision=0.0492, Recall=0.6053, F1=0.0872
=== EVALUATION APPROACH 2: RANKING-BASED ===
=== Testing with K=2000 ===
K=2000, Top_N=1: Precision=0.2734, Recall=0.1444, F1=0.1763
K=2000, Top_N=3: Precision=0.1886, Recall=0.2704, F1=0.2073
K=2000, Top_N=5: Precision=0.1496, Recall=0.3383, F1=0.1950
K=2000, Top_N=10: Precision=0.1049, Recall=0.4355, F1=0.1605
K=2000, Top_N=15: Precision=0.0839, Recall=0.4999, F1=0.1377
K=2000, Top_N=20: Precision=0.0713, Recall=0.5438, F1=0.1214
K=2000, Top_N=30: Precision=0.0555, Recall=0.6004, F1=0.0987
K=2000, Top_N=50: Precision=0.

In [41]:
#20250324_155949_Rv4_bi_encoder_finetuned

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Choose the device that will host the SentenceTransformer model
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("No GPUs available for SentenceTransformer, using CPU")
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")

# Maximum sequence length assigned to the model below (256, previously 128)
max_seq_length = 256

# Bi-encoder checkpoint evaluated by this cell
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/20250324_155949_Rv4_bi_encoder_finetuned"

# Load the bi-encoder, set its sequence length and move it to the GPU if one was found
model = SentenceTransformer(bi_encoder_model)
model.max_seq_length = max_seq_length
if device != "cpu":
    model = model.to(device)

# Legacy attributes come from the tab-separated 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing labels/definitions become empty strings before they are formatted
for column in ('label', 'definition'):
    legacy_df[column] = legacy_df[column].fillna('')

# One passage per attribute, formatted as "passage: [label]. [definition]"
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode the passages batch by batch with normalized embeddings
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(legacy_attributes_with_prefix[start:start + batch_size], normalize_embeddings=True)
    for start in range(0, len(legacy_attributes_with_prefix), batch_size)
])

# Flat inner-product FAISS index; with normalized vectors the inner product is cosine similarity
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Golden-set sentences, with 'Job_ID' renamed to 'job_id' for consistency
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv').rename(
    columns={'Job_ID': 'job_id'}
)

# Each sentence is encoded as a prefixed query
sentences_with_prefix = [f"query: {sentence}" for sentence in sentences_df['Sentences']]

# Same batched, normalized encoding as for the passages
sentence_embeddings = np.vstack([
    model.encode(sentences_with_prefix[start:start + batch_size], normalize_embeddings=True)
    for start in range(0, len(sentences_with_prefix), batch_size)
])

# Ground truth: per job_id, the sorted unique lower-cased names of rows labelled POSITIVE
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = names_per_job.apply(lambda names: sorted(names)).to_dict()

def normalize_attribute(attr):
    """Return `attr` lower-cased, with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Ranking-based evaluation: predictions are the top-N neighbours, whatever their score
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    Every sentence contributes the labels of its first ``top_n_results`` FAISS
    hits to the prediction set of its job. Precision, recall and F1 are then
    computed for each job in ``ground_truth`` and averaged over those jobs.

    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions
            (effectively capped at K)

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``.
    """
    # Get search results; only the neighbour ids are needed here
    _, I_faiss = index.search(sentence_embeddings, K)

    # Keep just the columns that are read. max() makes a negative cut-off select
    # nothing instead of slicing from the end.
    top_hits = I_faiss[:, :max(top_n_results, 0)]

    # Normalize each distinct retrieved label once, instead of one pandas lookup
    # per (sentence, hit) pair.
    label_values = legacy_df['label'].to_numpy()
    normalized_by_idx = {
        idx: normalize_attribute(label_values[idx])
        for idx in np.unique(top_hits[top_hits >= 0]).tolist()
    }

    # Union the predicted labels of all sentences that share a job_id
    job_predictions = {}
    for job_id, hits in zip(sentences_df['job_id'], top_hits):
        predicted = job_predictions.setdefault(job_id, set())
        # Negative ids are skipped (presumably FAISS padding for missing neighbours)
        predicted.update(normalized_by_idx[idx] for idx in hits.tolist() if idx >= 0)

    # Calculate metrics per ground-truth job
    precisions = []
    recalls = []
    f1s = []

    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        # Jobs without any sentence have no predictions and score 0
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Threshold-based evaluation, kept for comparison with the ranking-based one
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    Every sentence contributes the labels of those of its K FAISS hits whose
    similarity is strictly greater than ``threshold`` to the prediction set of
    its job. Precision, recall and F1 are then computed for each job in
    ``ground_truth`` and averaged over those jobs.

    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Similarity a hit must exceed to count as a prediction

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # A hit counts when its id is non-negative and it scores strictly above the
    # cut-off. Scores are widened to float64 so the threshold is not rounded to
    # float32 before the comparison.
    keep = (I_faiss >= 0) & (D_faiss.astype(np.float64) > threshold)

    # Normalize each distinct accepted label once, instead of one pandas lookup
    # per accepted hit.
    label_values = legacy_df['label'].to_numpy()
    normalized_by_idx = {
        idx: normalize_attribute(label_values[idx])
        for idx in np.unique(I_faiss[keep]).tolist()
    }

    # Union the predicted labels of all sentences that share a job_id
    job_predictions = {}
    for job_id, hits, accepted in zip(sentences_df['job_id'], I_faiss, keep):
        predicted = job_predictions.setdefault(job_id, set())
        predicted.update(normalized_by_idx[idx] for idx in hits[accepted].tolist())

    # Calculate metrics per ground-truth job
    precisions = []
    recalls = []
    f1s = []

    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        # Jobs without any sentence have no predictions and score 0
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Evaluation sweep: K=2000 neighbours are retrieved per query for both approaches
K = 2000
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5, 0.4]  # Similarity cut-offs for approach 1
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200, 300, 400, 500, 1000, 1500, 1600]  # Rank cut-offs for approach 2

print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
print(f"\n=== Testing with K={K} ===")
for threshold in thresholds:
    avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
    print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

print("=== EVALUATION APPROACH 2: RANKING-BASED ===")
print(f"=== Testing with K={K} ===")
for top_n in top_n_values:
    if top_n > K:
        # A rank cut-off deeper than the K retrieved results is skipped
        continue
    avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
    print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

You try to use a model that was created with version 3.4.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Using GPU for SentenceTransformer: NVIDIA A10G

=== EVALUATION APPROACH 1: THRESHOLD-BASED ===

=== Testing with K=2000 ===
K=2000, T=0.9: Precision=0.0000, Recall=0.0000, F1=0.0000
K=2000, T=0.85: Precision=0.0051, Recall=0.0001, F1=0.0002
K=2000, T=0.8: Precision=0.0396, Recall=0.0006, F1=0.0011
K=2000, T=0.7: Precision=0.2022, Recall=0.0070, F1=0.0132
K=2000, T=0.5: Precision=0.2248, Recall=0.1884, F1=0.1721
K=2000, T=0.4: Precision=0.0870, Recall=0.4179, F1=0.1306
=== EVALUATION APPROACH 2: RANKING-BASED ===
=== Testing with K=2000 ===
K=2000, Top_N=1: Precision=0.2525, Recall=0.1334, F1=0.1634
K=2000, Top_N=3: Precision=0.1750, Recall=0.2379, F1=0.1868
K=2000, Top_N=5: Precision=0.1393, Recall=0.3010, F1=0.1780
K=2000, Top_N=10: Precision=0.0997, Recall=0.3942, F1=0.1500
K=2000, Top_N=15: Precision=0.0800, Recall=0.4516, F1=0.1293
K=2000, Top_N=20: Precision=0.0676, Recall=0.4934, F1=0.1139
K=2000, Top_N=30: Precision=0.0529, Recall=0.5472, F1=0.0933
K=2000, Top_N=50: Precision=0.

In [35]:
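#Part2_O1_MNRL_Prefix5_Multi_GPU_Final/final_model  (NOTE(review): this title previously said step_20000, but the cell loads .../final_model - confirm which checkpoint was intended)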

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Choose the device that will host the SentenceTransformer model
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("No GPUs available for SentenceTransformer, using CPU")
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")

# Maximum sequence length assigned to the model below (256, previously 128)
max_seq_length = 256

# Bi-encoder checkpoint evaluated by this cell
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/Part2_O1_MNRL_Prefix5_Multi_GPU_Final/final_model"

# Load the bi-encoder, set its sequence length and move it to the GPU if one was found
model = SentenceTransformer(bi_encoder_model)
model.max_seq_length = max_seq_length
if device != "cpu":
    model = model.to(device)

# Legacy attributes come from the tab-separated 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing labels/definitions become empty strings before they are formatted
for column in ('label', 'definition'):
    legacy_df[column] = legacy_df[column].fillna('')

# One passage per attribute, formatted as "passage: [label]. [definition]"
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode the passages batch by batch with normalized embeddings
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(legacy_attributes_with_prefix[start:start + batch_size], normalize_embeddings=True)
    for start in range(0, len(legacy_attributes_with_prefix), batch_size)
])

# Flat inner-product FAISS index; with normalized vectors the inner product is cosine similarity
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Golden-set sentences, with 'Job_ID' renamed to 'job_id' for consistency
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv').rename(
    columns={'Job_ID': 'job_id'}
)

# Each sentence is encoded as a prefixed query
sentences_with_prefix = [f"query: {sentence}" for sentence in sentences_df['Sentences']]

# Same batched, normalized encoding as for the passages
sentence_embeddings = np.vstack([
    model.encode(sentences_with_prefix[start:start + batch_size], normalize_embeddings=True)
    for start in range(0, len(sentences_with_prefix), batch_size)
])

# Ground truth: per job_id, the sorted unique lower-cased names of rows labelled POSITIVE
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = names_per_job.apply(lambda names: sorted(names)).to_dict()

def normalize_attribute(attr):
    """Return `attr` lower-cased, with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Ranking-based evaluation: predictions are the top-N neighbours, whatever their score
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    Every sentence contributes the labels of its first ``top_n_results`` FAISS
    hits to the prediction set of its job. Precision, recall and F1 are then
    computed for each job in ``ground_truth`` and averaged over those jobs.

    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions
            (effectively capped at K)

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``.
    """
    # Get search results; only the neighbour ids are needed here
    _, I_faiss = index.search(sentence_embeddings, K)

    # Keep just the columns that are read. max() makes a negative cut-off select
    # nothing instead of slicing from the end.
    top_hits = I_faiss[:, :max(top_n_results, 0)]

    # Normalize each distinct retrieved label once, instead of one pandas lookup
    # per (sentence, hit) pair.
    label_values = legacy_df['label'].to_numpy()
    normalized_by_idx = {
        idx: normalize_attribute(label_values[idx])
        for idx in np.unique(top_hits[top_hits >= 0]).tolist()
    }

    # Union the predicted labels of all sentences that share a job_id
    job_predictions = {}
    for job_id, hits in zip(sentences_df['job_id'], top_hits):
        predicted = job_predictions.setdefault(job_id, set())
        # Negative ids are skipped (presumably FAISS padding for missing neighbours)
        predicted.update(normalized_by_idx[idx] for idx in hits.tolist() if idx >= 0)

    # Calculate metrics per ground-truth job
    precisions = []
    recalls = []
    f1s = []

    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        # Jobs without any sentence have no predictions and score 0
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Threshold-based evaluation, kept for comparison with the ranking-based one
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    Every sentence contributes the labels of those of its K FAISS hits whose
    similarity is strictly greater than ``threshold`` to the prediction set of
    its job. Precision, recall and F1 are then computed for each job in
    ``ground_truth`` and averaged over those jobs.

    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Similarity a hit must exceed to count as a prediction

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # A hit counts when its id is non-negative and it scores strictly above the
    # cut-off. Scores are widened to float64 so the threshold is not rounded to
    # float32 before the comparison.
    keep = (I_faiss >= 0) & (D_faiss.astype(np.float64) > threshold)

    # Normalize each distinct accepted label once, instead of one pandas lookup
    # per accepted hit.
    label_values = legacy_df['label'].to_numpy()
    normalized_by_idx = {
        idx: normalize_attribute(label_values[idx])
        for idx in np.unique(I_faiss[keep]).tolist()
    }

    # Union the predicted labels of all sentences that share a job_id
    job_predictions = {}
    for job_id, hits, accepted in zip(sentences_df['job_id'], I_faiss, keep):
        predicted = job_predictions.setdefault(job_id, set())
        predicted.update(normalized_by_idx[idx] for idx in hits[accepted].tolist())

    # Calculate metrics per ground-truth job
    precisions = []
    recalls = []
    f1s = []

    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        # Jobs without any sentence have no predictions and score 0
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Evaluation sweep: K=2000 neighbours are retrieved per query for both approaches
K = 2000
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5, 0.4]  # Similarity cut-offs for approach 1
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200, 300, 400, 500, 1000, 1500, 1600]  # Rank cut-offs for approach 2

print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
print(f"\n=== Testing with K={K} ===")
for threshold in thresholds:
    avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
    print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

print("=== EVALUATION APPROACH 2: RANKING-BASED ===")
print(f"=== Testing with K={K} ===")
for top_n in top_n_values:
    if top_n > K:
        # A rank cut-off deeper than the K retrieved results is skipped
        continue
    avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
    print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

Using GPU for SentenceTransformer: NVIDIA A10G

=== EVALUATION APPROACH 1: THRESHOLD-BASED ===

=== Testing with K=2000 ===
K=2000, T=0.9: Precision=0.0000, Recall=0.0000, F1=0.0000
K=2000, T=0.85: Precision=0.0480, Recall=0.0014, F1=0.0027
K=2000, T=0.8: Precision=0.2034, Recall=0.0054, F1=0.0103
K=2000, T=0.7: Precision=0.4041, Recall=0.0456, F1=0.0770
K=2000, T=0.5: Precision=0.1385, Recall=0.3239, F1=0.1761
K=2000, T=0.4: Precision=0.0625, Recall=0.5165, F1=0.1071
=== EVALUATION APPROACH 2: RANKING-BASED ===
=== Testing with K=2000 ===
K=2000, Top_N=1: Precision=0.2515, Recall=0.1320, F1=0.1619
K=2000, Top_N=3: Precision=0.1742, Recall=0.2407, F1=0.1879
K=2000, Top_N=5: Precision=0.1387, Recall=0.3046, F1=0.1789
K=2000, Top_N=10: Precision=0.0967, Recall=0.4007, F1=0.1488
K=2000, Top_N=15: Precision=0.0775, Recall=0.4535, F1=0.1270
K=2000, Top_N=20: Precision=0.0661, Recall=0.4946, F1=0.1125
K=2000, Top_N=30: Precision=0.0519, Recall=0.5521, F1=0.0924
K=2000, Top_N=50: Precision=0.

KeyboardInterrupt: 

In [36]:
#LOOP THROUGH ALL


import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Choose the device that will host each SentenceTransformer model
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("No GPUs available for SentenceTransformer, using CPU")
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")

# Maximum sequence length assigned to every loaded model (256, previously 128)
max_seq_length = 256

# Directory whose checkpoint sub-folders are evaluated
base_dir = "/home/jovyan/butterfly/src/notebooks/Part2_O1_MNRL_Prefix5_Multi_GPU_Final"

# The datasets do not change between checkpoints, so they are read once
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv').rename(
    columns={'Job_ID': 'job_id'}
)
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Missing labels/definitions become empty strings before they are formatted
for column in ('label', 'definition'):
    legacy_df[column] = legacy_df[column].fillna('')

# Ground truth: per job_id, the sorted unique lower-cased names of rows labelled POSITIVE
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = names_per_job.apply(lambda names: sorted(names)).to_dict()

def normalize_attribute(attr):
    """Return `attr` lower-cased, with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

def evaluate_model(model_path, K=2000, top_n=1600):
    """Evaluate a single model and return its recall score.

    Loads the SentenceTransformer at ``model_path``, embeds every
    "passage: <label>. <definition>" row of ``legacy_df`` into a FAISS
    inner-product index, embeds every "query: <sentence>" of ``sentences_df``
    and treats the labels of each sentence's first ``top_n`` hits as the
    predictions of that sentence's job.

    Reads the module-level ``legacy_df``, ``sentences_df``, ``ground_truth``,
    ``max_seq_length`` and ``device``.

    Args:
        model_path: Path passed to ``SentenceTransformer``.
        K: Number of neighbours retrieved per sentence.
        top_n: Number of leading neighbours counted as predictions
            (effectively capped at K).

    Returns:
        Recall averaged over the jobs in ``ground_truth``, or None when the
        evaluation raised; the error is printed rather than propagated.
    """
    # Bound up front so the cleanup in `finally` works wherever a failure happens
    model = legacy_embeddings = sentence_embeddings = index = None
    try:
        # Load the bi-encoder model
        model = SentenceTransformer(model_path)
        model.max_seq_length = max_seq_length
        if torch.cuda.is_available():
            model = model.to(device)

        # Create formatted passages for legacy attributes
        legacy_attributes_with_prefix = [
            f"passage: {label}. {definition}"
            for label, definition in zip(legacy_df['label'], legacy_df['definition'])
        ]

        # Encode with normalization, batch for large datasets
        batch_size = 128
        legacy_embeddings = np.vstack([
            model.encode(legacy_attributes_with_prefix[start:start + batch_size], normalize_embeddings=True)
            for start in range(0, len(legacy_attributes_with_prefix), batch_size)
        ])

        # Create FAISS index
        index = faiss.IndexFlatIP(legacy_embeddings.shape[1])
        index.add(legacy_embeddings)

        # Format queries with proper prefix and encode them in batches
        sentences_with_prefix = [f"query: {sentence}" for sentence in sentences_df['Sentences']]
        sentence_embeddings = np.vstack([
            model.encode(sentences_with_prefix[start:start + batch_size], normalize_embeddings=True)
            for start in range(0, len(sentences_with_prefix), batch_size)
        ])

        # Get search results; only the neighbour ids are needed
        _, I_faiss = index.search(sentence_embeddings, K)

        # Keep just the columns that are read. max() makes a negative cut-off
        # select nothing instead of slicing from the end.
        top_hits = I_faiss[:, :max(top_n, 0)]

        # Normalize each distinct retrieved label once, instead of one pandas
        # lookup per (sentence, hit) pair.
        label_values = legacy_df['label'].to_numpy()
        normalized_by_idx = {
            idx: normalize_attribute(label_values[idx])
            for idx in np.unique(top_hits[top_hits >= 0]).tolist()
        }

        # Union the predicted labels of all sentences that share a job_id
        job_predictions = {}
        for job_id, hits in zip(sentences_df['job_id'], top_hits):
            predicted = job_predictions.setdefault(job_id, set())
            # Negative ids are skipped (presumably FAISS padding for missing neighbours)
            predicted.update(normalized_by_idx[idx] for idx in hits.tolist() if idx >= 0)

        # Calculate recall per ground-truth job
        recalls = []
        for job_id, attrs in ground_truth.items():
            gt = {normalize_attribute(attr) for attr in attrs}
            pred = job_predictions.get(job_id, set())
            tp = len(gt & pred)
            fn = len(gt) - tp
            recalls.append(tp / (tp + fn) if tp + fn > 0 else 0)

        return np.mean(recalls)

    except Exception as e:
        # Best effort: a broken checkpoint must not abort the sweep over the others
        print(f"Error evaluating model at {model_path}: {str(e)}")
        return None

    finally:
        # Clean up on success and on failure, before the next checkpoint is loaded
        del model, legacy_embeddings, sentence_embeddings, index
        try:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as cleanup_error:
            # A failed cleanup must not replace the result computed above
            print(f"Error releasing resources for {model_path}: {cleanup_error}")

# One dict per successfully evaluated checkpoint is collected here
model_results = []

# Checkpoint folders are the entries of base_dir whose name contains 'epoch' or 'step'
for folder_name in os.listdir(base_dir):
    if not ('epoch' in folder_name or 'step' in folder_name):
        continue
    model_path = os.path.join(base_dir, folder_name, 'model')
    # isdir() is False for a missing path, so it covers the existence check as well
    if not os.path.isdir(model_path):
        continue
    print(f"\nEvaluating model: {folder_name}")
    recall = evaluate_model(model_path)
    if recall is None:
        # evaluate_model already reported the failure
        continue
    model_results.append({'folder': folder_name, 'path': model_path, 'recall': recall})
    print(f"Recall for {folder_name}: {recall:.4f}")

# The final model sits directly under base_dir, without a 'model' sub-folder
final_model_path = os.path.join(base_dir, 'final_model')
if os.path.isdir(final_model_path):
    print("\nEvaluating model: final_model")
    recall = evaluate_model(final_model_path)
    if recall is not None:
        model_results.append({'folder': 'final_model', 'path': final_model_path, 'recall': recall})
        print(f"Recall for final_model: {recall:.4f}")

# Best recall first
model_results.sort(key=lambda entry: entry['recall'], reverse=True)

# Ranked summary of every evaluated checkpoint
banner = "=" * 50
print("\n" + banner)
print("MODEL EVALUATION SUMMARY (K=2000, Top_N=1600)")
print(banner)
for rank, result in enumerate(model_results, start=1):
    print(f"{rank}. {result['folder']}: Recall = {result['recall']:.4f}")
    print(f"   Path: {result['path']}")
    print("-" * 50)

if not model_results:
    print("\nNo models found or all evaluations failed.")
else:
    best_model = model_results[0]
    print(f"\nBEST MODEL: {best_model['folder']}")
    print(f"Recall: {best_model['recall']:.4f}")
    print(f"Path: {best_model['path']}")

Using GPU for SentenceTransformer: NVIDIA A10G

Evaluating model: epoch_1
Recall for epoch_1: 0.9233

Evaluating model: epoch_4
Recall for epoch_4: 0.9046

Evaluating model: step_70000
Recall for step_70000: 0.9041

Evaluating model: step_20000
Recall for step_20000: 0.9195

Evaluating model: step_50000
Recall for step_50000: 0.9103

Evaluating model: epoch_5
Recall for epoch_5: 0.9038

Evaluating model: step_30000
Recall for step_30000: 0.9175

Evaluating model: step_80000
Recall for step_80000: 0.9026

Evaluating model: epoch_3
Recall for epoch_3: 0.9047

Evaluating model: step_10000
Recall for step_10000: 0.9284

Evaluating model: step_40000
Recall for step_40000: 0.9063

Evaluating model: step_60000
Recall for step_60000: 0.9036

Evaluating model: epoch_2
Recall for epoch_2: 0.9108

Evaluating model: final_model
Recall for final_model: 0.9038

MODEL EVALUATION SUMMARY (K=2000, Top_N=1600)
1. step_10000: Recall = 0.9284
   Path: /home/jovyan/butterfly/src/notebooks/Part2_O1_MNRL_Pre

In [39]:
#LOOP THROUGH ALL with Prefix (Part2_O1_MNRL_Prefix5_Multi_GPU_Final)


import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Choose the device that will host each SentenceTransformer model
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("No GPUs available for SentenceTransformer, using CPU")
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")

# Maximum sequence length assigned to every loaded model (256, previously 128)
max_seq_length = 256

# Directory whose checkpoint sub-folders are evaluated
base_dir = "/home/jovyan/butterfly/src/notebooks/Part2_O1_MNRL_Prefix5_Multi_GPU_Final"

# The datasets do not change between checkpoints, so they are read once
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv').rename(
    columns={'Job_ID': 'job_id'}
)
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Missing labels/definitions become empty strings before they are formatted
for column in ('label', 'definition'):
    legacy_df[column] = legacy_df[column].fillna('')

# Ground truth: per job_id, the sorted unique lower-cased names of rows labelled POSITIVE
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = names_per_job.apply(lambda names: sorted(names)).to_dict()

def normalize_attribute(attr):
    """Return `attr` lower-cased, with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

def evaluate_model(model_path, K=2000, top_n=1600):
    """Evaluate a single model and return its recall score.

    Loads the SentenceTransformer at ``model_path``, embeds every
    "passage: <label>. <definition>" row of ``legacy_df`` into a FAISS
    inner-product index, embeds every "query: <sentence>" of ``sentences_df``
    and treats the labels of each sentence's first ``top_n`` hits as the
    predictions of that sentence's job.

    Reads the module-level ``legacy_df``, ``sentences_df``, ``ground_truth``,
    ``max_seq_length`` and ``device``.

    Args:
        model_path: Path passed to ``SentenceTransformer``.
        K: Number of neighbours retrieved per sentence.
        top_n: Number of leading neighbours counted as predictions
            (effectively capped at K).

    Returns:
        Recall averaged over the jobs in ``ground_truth``, or None when the
        evaluation raised; the error is printed rather than propagated.
    """
    # Bound up front so the cleanup in `finally` works wherever a failure happens
    model = legacy_embeddings = sentence_embeddings = index = None
    try:
        # Load the bi-encoder model
        model = SentenceTransformer(model_path)
        model.max_seq_length = max_seq_length
        if torch.cuda.is_available():
            model = model.to(device)

        # Create formatted passages for legacy attributes
        legacy_attributes_with_prefix = [
            f"passage: {label}. {definition}"
            for label, definition in zip(legacy_df['label'], legacy_df['definition'])
        ]

        # Encode with normalization, batch for large datasets
        batch_size = 128
        legacy_embeddings = np.vstack([
            model.encode(legacy_attributes_with_prefix[start:start + batch_size], normalize_embeddings=True)
            for start in range(0, len(legacy_attributes_with_prefix), batch_size)
        ])

        # Create FAISS index
        index = faiss.IndexFlatIP(legacy_embeddings.shape[1])
        index.add(legacy_embeddings)

        # Format queries with proper prefix and encode them in batches
        sentences_with_prefix = [f"query: {sentence}" for sentence in sentences_df['Sentences']]
        sentence_embeddings = np.vstack([
            model.encode(sentences_with_prefix[start:start + batch_size], normalize_embeddings=True)
            for start in range(0, len(sentences_with_prefix), batch_size)
        ])

        # Get search results; only the neighbour ids are needed
        _, I_faiss = index.search(sentence_embeddings, K)

        # Keep just the columns that are read. max() makes a negative cut-off
        # select nothing instead of slicing from the end.
        top_hits = I_faiss[:, :max(top_n, 0)]

        # Normalize each distinct retrieved label once, instead of one pandas
        # lookup per (sentence, hit) pair.
        label_values = legacy_df['label'].to_numpy()
        normalized_by_idx = {
            idx: normalize_attribute(label_values[idx])
            for idx in np.unique(top_hits[top_hits >= 0]).tolist()
        }

        # Union the predicted labels of all sentences that share a job_id
        job_predictions = {}
        for job_id, hits in zip(sentences_df['job_id'], top_hits):
            predicted = job_predictions.setdefault(job_id, set())
            # Negative ids are skipped (presumably FAISS padding for missing neighbours)
            predicted.update(normalized_by_idx[idx] for idx in hits.tolist() if idx >= 0)

        # Calculate recall per ground-truth job
        recalls = []
        for job_id, attrs in ground_truth.items():
            gt = {normalize_attribute(attr) for attr in attrs}
            pred = job_predictions.get(job_id, set())
            tp = len(gt & pred)
            fn = len(gt) - tp
            recalls.append(tp / (tp + fn) if tp + fn > 0 else 0)

        return np.mean(recalls)

    except Exception as e:
        # Best effort: a broken checkpoint must not abort the sweep over the others
        print(f"Error evaluating model at {model_path}: {str(e)}")
        return None

    finally:
        # Clean up on success and on failure, before the next checkpoint is loaded
        del model, legacy_embeddings, sentence_embeddings, index
        try:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as cleanup_error:
            # A failed cleanup must not replace the result computed above
            print(f"Error releasing resources for {model_path}: {cleanup_error}")

# One dict per successfully evaluated checkpoint is collected here
model_results = []

# Checkpoint folders are the entries of base_dir whose name contains 'epoch' or 'step'
for folder_name in os.listdir(base_dir):
    if not ('epoch' in folder_name or 'step' in folder_name):
        continue
    model_path = os.path.join(base_dir, folder_name, 'model')
    # isdir() is False for a missing path, so it covers the existence check as well
    if not os.path.isdir(model_path):
        continue
    print(f"\nEvaluating model: {folder_name}")
    recall = evaluate_model(model_path)
    if recall is None:
        # evaluate_model already reported the failure
        continue
    model_results.append({'folder': folder_name, 'path': model_path, 'recall': recall})
    print(f"Recall for {folder_name}: {recall:.4f}")

# The final model sits directly under base_dir, without a 'model' sub-folder
final_model_path = os.path.join(base_dir, 'final_model')
if os.path.isdir(final_model_path):
    print("\nEvaluating model: final_model")
    recall = evaluate_model(final_model_path)
    if recall is not None:
        model_results.append({'folder': 'final_model', 'path': final_model_path, 'recall': recall})
        print(f"Recall for final_model: {recall:.4f}")

# Best recall first
model_results.sort(key=lambda entry: entry['recall'], reverse=True)

# Ranked summary of every evaluated checkpoint
banner = "=" * 50
print("\n" + banner)
print("MODEL EVALUATION SUMMARY (K=2000, Top_N=1600)")
print(banner)
for rank, result in enumerate(model_results, start=1):
    print(f"{rank}. {result['folder']}: Recall = {result['recall']:.4f}")
    print(f"   Path: {result['path']}")
    print("-" * 50)

if not model_results:
    print("\nNo models found or all evaluations failed.")
else:
    best_model = model_results[0]
    print(f"\nBEST MODEL: {best_model['folder']}")
    print(f"Recall: {best_model['recall']:.4f}")
    print(f"Path: {best_model['path']}")

Using GPU for SentenceTransformer: NVIDIA A10G

Evaluating model: epoch_1
Recall for epoch_1: 0.9233

Evaluating model: epoch_4
Recall for epoch_4: 0.9046

Evaluating model: step_70000
Recall for step_70000: 0.9041

Evaluating model: step_20000
Recall for step_20000: 0.9195

Evaluating model: step_50000
Recall for step_50000: 0.9103

Evaluating model: epoch_5
Recall for epoch_5: 0.9038

Evaluating model: step_30000
Recall for step_30000: 0.9175

Evaluating model: step_80000
Recall for step_80000: 0.9026

Evaluating model: epoch_3
Recall for epoch_3: 0.9047

Evaluating model: step_10000
Recall for step_10000: 0.9284

Evaluating model: step_40000
Recall for step_40000: 0.9063

Evaluating model: step_60000
Recall for step_60000: 0.9036

Evaluating model: epoch_2
Recall for epoch_2: 0.9108

Evaluating model: final_model
Recall for final_model: 0.9038

MODEL EVALUATION SUMMARY (K=2000, Top_N=1600)
1. step_10000: Recall = 0.9284
   Path: /home/jovyan/butterfly/src/notebooks/Part2_O1_MNRL_Pre

In [40]:
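#LOOP THROUGH ALL with Prefix (Part2_O1_MNRL_Prefix5_Multi_GPU_Final) -- NOTE(review): base_dir below points at runs/e5_improved; confirm which run this cell is meant to evaluate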


import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Decide which device SentenceTransformer models will be moved to.
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Value assigned to every model's max_seq_length (256; an earlier setting was 128).
max_seq_length = 256

# Directory whose checkpoint sub-folders are evaluated.
base_dir = "/home/jovyan/butterfly/src/notebooks/runs/e5_improved"

# The datasets are identical for every checkpoint, so read them a single time.
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')
sentences_df = pd.read_csv(
    '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'
).rename(columns={'Job_ID': 'job_id'})
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Missing labels/definitions become empty strings so they can be formatted.
for column in ('label', 'definition'):
    legacy_df[column] = legacy_df[column].fillna('')

# Ground truth: per job_id, the sorted unique display names of POSITIVE rows.
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
unique_names = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

def evaluate_model(model_path, K=2000, top_n=1600):
    """Evaluate a single model and return its recall score.

    Loads the SentenceTransformer stored at ``model_path``, embeds every row of
    ``legacy_df`` as ``"passage: <label>. <definition>"`` and every entry of
    ``sentences_df['Sentences']`` as ``"query: <sentence>"``, retrieves the
    ``K`` best passages per sentence from a ``faiss.IndexFlatIP`` index and
    pools the labels of the first ``top_n`` hits per ``job_id``.

    Args:
        model_path: Directory passed to ``SentenceTransformer``.
        K: Number of hits requested from FAISS for each sentence.
        top_n: Number of leading hits per sentence counted as predictions.

    Returns:
        The mean per-job recall over the jobs in ``ground_truth``, or ``None``
        when any step raised (the error is printed).
    """
    # Bound up front so the ``finally`` clause can always release them.
    model = index = legacy_embeddings = sentence_embeddings = I_faiss = None
    try:
        # Load the bi-encoder model
        model = SentenceTransformer(model_path)
        model.max_seq_length = max_seq_length
        if torch.cuda.is_available():
            model = model.to(device)

        # Create formatted passages for legacy attributes
        legacy_attributes_with_prefix = [
            f"passage: {label}. {definition}"
            for label, definition in zip(legacy_df['label'], legacy_df['definition'])
        ]

        # Encode with normalization, batch for large datasets
        batch_size = 128
        legacy_embeddings = np.vstack([
            model.encode(
                legacy_attributes_with_prefix[start:start + batch_size],
                normalize_embeddings=True,
            )
            for start in range(0, len(legacy_attributes_with_prefix), batch_size)
        ])

        # Inner-product index over the normalized passage embeddings
        index = faiss.IndexFlatIP(legacy_embeddings.shape[1])
        index.add(legacy_embeddings)

        # Format queries with proper prefix and encode them in batches
        sentences_with_prefix = [f"query: {sentence}" for sentence in sentences_df['Sentences']]
        sentence_embeddings = np.vstack([
            model.encode(
                sentences_with_prefix[start:start + batch_size],
                normalize_embeddings=True,
            )
            for start in range(0, len(sentences_with_prefix), batch_size)
        ])

        # Only the hit positions are needed; the scores are ignored here.
        _, I_faiss = index.search(sentence_embeddings, K)

        # Normalize every label once instead of doing a pandas scalar lookup
        # plus normalization for each individual hit.
        normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

        # Pool the labels of the top-N hits of every sentence per job
        job_predictions = {}
        for job_id, hit_ids in zip(sentences_df['job_id'], I_faiss):
            predicted = job_predictions.setdefault(job_id, set())
            for idx_pos in hit_ids[:max(top_n, 0)]:
                # Negative positions are skipped, as in the original check.
                if idx_pos >= 0:
                    predicted.add(normalized_labels[idx_pos])

        # Per-job recall = |gt ∩ pred| / |gt|
        recalls = []
        for job_id, attrs in ground_truth.items():
            gt = {normalize_attribute(attr) for attr in attrs}
            pred = job_predictions.get(job_id, set())
            recalls.append(len(gt & pred) / len(gt) if gt else 0)

        return np.mean(recalls)

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # continue with the remaining checkpoints.
        print(f"Error evaluating model at {model_path}: {str(e)}")
        return None

    finally:
        # Runs on success and on failure, so a checkpoint that errors out
        # does not skip the cleanup.
        model = index = legacy_embeddings = sentence_embeddings = I_faiss = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Retrieval depth and cut-off used for every checkpoint. The summary header
# below is built from these values so it cannot drift from what was run.
EVAL_K = 2000
EVAL_TOP_N = 1600

# One dict per successfully evaluated checkpoint: folder, path, recall.
model_results = []


def _evaluate_and_record(folder, path):
    """Evaluate the checkpoint at *path* and append its recall to model_results."""
    print(f"\nEvaluating model: {folder}")
    recall = evaluate_model(path, K=EVAL_K, top_n=EVAL_TOP_N)
    if recall is None:
        # evaluate_model already printed the error; skip this checkpoint.
        return
    model_results.append({'folder': folder, 'path': path, 'recall': recall})
    print(f"Recall for {folder}: {recall:.4f}")


# Look for all directories containing 'epoch' or 'step' in the base directory.
# os.listdir returns entries in arbitrary order; sorting makes the evaluation
# order (and the order of equal-recall entries) reproducible.
for folder_name in sorted(os.listdir(base_dir)):
    if 'epoch' in folder_name or 'step' in folder_name:
        model_path = os.path.join(base_dir, folder_name, 'model')
        if os.path.isdir(model_path):
            _evaluate_and_record(folder_name, model_path)

# Also check the final_model if it exists
final_model_path = os.path.join(base_dir, 'final_model')
if os.path.isdir(final_model_path):
    _evaluate_and_record('final_model', final_model_path)

# Sort results by recall (descending)
model_results = sorted(model_results, key=lambda x: x['recall'], reverse=True)

# Display results summary
print("\n" + "="*50)
print(f"MODEL EVALUATION SUMMARY (K={EVAL_K}, Top_N={EVAL_TOP_N})")
print("="*50)
for rank, result in enumerate(model_results, start=1):
    print(f"{rank}. {result['folder']}: Recall = {result['recall']:.4f}")
    print(f"   Path: {result['path']}")
    print("-"*50)

if model_results:
    best_model = model_results[0]
    print(f"\nBEST MODEL: {best_model['folder']}")
    print(f"Recall: {best_model['recall']:.4f}")
    print(f"Path: {best_model['path']}")
else:
    print("\nNo models found or all evaluations failed.")

Using GPU for SentenceTransformer: NVIDIA A10G

Evaluating model: epoch_1
Recall for epoch_1: 0.9251

Evaluating model: epoch_4
Recall for epoch_4: 0.9079

Evaluating model: step_20000
Recall for step_20000: 0.9215

Evaluating model: step_50000
Recall for step_50000: 0.9122

Evaluating model: step_30000
Recall for step_30000: 0.9197

Evaluating model: epoch_3
Recall for epoch_3: 0.9083

Evaluating model: step_10000
Recall for step_10000: 0.9290

Evaluating model: step_40000
Recall for step_40000: 0.9074

Evaluating model: step_60000
Recall for step_60000: 0.9077

Evaluating model: epoch_2
Recall for epoch_2: 0.9148

MODEL EVALUATION SUMMARY (K=2000, Top_N=1600)
1. step_10000: Recall = 0.9290
   Path: /home/jovyan/butterfly/src/notebooks/runs/e5_improved/step_10000/model
--------------------------------------------------
2. epoch_1: Recall = 0.9251
   Path: /home/jovyan/butterfly/src/notebooks/runs/e5_improved/epoch_1/model
--------------------------------------------------
3. step_2000

In [37]:
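#Part3_e5_mnrl_DDP_new_v2_Prefix4_Multi_GPU/epoch_1 -- NOTE(review): bi_encoder_model below loads Part2_O1_MNRL_Prefix5_Multi_GPU_Final/step_10000/model; confirm which checkpoint these results belong to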

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Select the device used by the SentenceTransformer model.
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Value assigned to the model's max_seq_length (256; an earlier setting was 128).
max_seq_length = 256

# Checkpoint directory of the bi-encoder under evaluation.
# NOTE(review): the cell header names a different run/checkpoint - confirm
# that this path is the intended one.
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/Part2_O1_MNRL_Prefix5_Multi_GPU_Final/step_10000/model"

# Instantiate the bi-encoder and move it to the GPU when one was selected.
model = SentenceTransformer(bi_encoder_model)
model.max_seq_length = max_seq_length
if device != "cpu":
    model = model.to(device)

# Legacy attributes come from 'defs.tsv'; missing cells become empty strings.
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')
for column in ('label', 'definition'):
    legacy_df[column] = legacy_df[column].fillna('')

# One "passage: [label]. [definition]" string per legacy attribute.
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode the passages in slices of batch_size, normalizing each embedding.
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes_with_prefix[start:start + batch_size],
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes_with_prefix), batch_size)
])

# Flat inner-product FAISS index (cosine similarity for normalized vectors).
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Query sentences from 'golden_dataset_sentences.csv', with 'Job_ID' exposed as 'job_id'.
sentences_df = pd.read_csv(
    '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'
).rename(columns={'Job_ID': 'job_id'})

# Every sentence gets the "query: " prefix before encoding.
sentences_with_prefix = [f"query: {text}" for text in sentences_df['Sentences']]
sentence_embeddings = np.vstack([
    model.encode(
        sentences_with_prefix[start:start + batch_size],
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_with_prefix), batch_size)
])

# Ground truth: per job_id, the sorted unique display names of POSITIVE rows.
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
unique_names = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score

    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``. Predictions of all sentences sharing a
    ``job_id`` are pooled into one set before scoring.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``: the per-job metrics
        averaged over the jobs in ``ground_truth``.
    """
    # Only the hit positions matter for ranking; scores are not used.
    _, I_faiss = index.search(sentence_embeddings, K)

    # Normalize every label once instead of doing a pandas scalar lookup plus
    # normalization for each individual hit.
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # Pool the labels of the top-N hits of every sentence per job
    job_predictions = {}
    for job_id, hit_ids in zip(sentences_df['job_id'], I_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos in hit_ids[:max(top_n_results, 0)]:
            # Negative positions are skipped, as in the original check.
            if idx_pos >= 0:
                predicted.add(normalized_labels[idx_pos])

    # Per-job precision / recall / F1
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity threshold.

    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``. A retrieved label counts as predicted
    for a job when its score is strictly greater than ``threshold``.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity score for a hit to count

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``: the per-job metrics
        averaged over the jobs in ``ground_truth``.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Normalize every label once instead of doing a pandas scalar lookup plus
    # normalization for each individual hit.
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # Pool the labels of all above-threshold hits of every sentence per job
    job_predictions = {}
    for job_id, hit_ids, hit_scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos, similarity in zip(hit_ids, hit_scores):
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(normalized_labels[idx_pos])

    # Per-job precision / recall / F1
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Sweep both evaluation approaches at a single retrieval depth (K=2000).
K = 2000
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5, 0.4]  # similarity cut-offs to try
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200, 300, 400, 500, 1000, 1500, 1600]  # rank cut-offs to try


def _format_metrics(precision, recall, f1):
    """Render the three averaged metrics the way every result line prints them."""
    return f"Precision={precision:.4f}, Recall={recall:.4f}, F1={f1:.4f}"


print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
print(f"\n=== Testing with K={K} ===")
for threshold in thresholds:
    avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
    print(f"K={K}, T={threshold}: " + _format_metrics(avg_precision, avg_recall, avg_f1))

print("=== EVALUATION APPROACH 2: RANKING-BASED ===")
print(f"=== Testing with K={K} ===")
for top_n in top_n_values:
    # A cut-off deeper than the retrieval depth cannot be evaluated.
    if top_n > K:
        continue
    avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
    print(f"K={K}, Top_N={top_n}: " + _format_metrics(avg_precision, avg_recall, avg_f1))

Using GPU for SentenceTransformer: NVIDIA A10G

=== EVALUATION APPROACH 1: THRESHOLD-BASED ===

=== Testing with K=2000 ===
K=2000, T=0.9: Precision=0.0051, Recall=0.0001, F1=0.0002
K=2000, T=0.85: Precision=0.0896, Recall=0.0022, F1=0.0042
K=2000, T=0.8: Precision=0.2866, Recall=0.0103, F1=0.0196
K=2000, T=0.7: Precision=0.3202, Recall=0.0586, F1=0.0893
K=2000, T=0.5: Precision=0.1057, Recall=0.3947, F1=0.1511
K=2000, T=0.4: Precision=0.0476, Recall=0.6169, F1=0.0845
=== EVALUATION APPROACH 2: RANKING-BASED ===
=== Testing with K=2000 ===
K=2000, Top_N=1: Precision=0.2720, Recall=0.1460, F1=0.1777
K=2000, Top_N=3: Precision=0.1915, Recall=0.2740, F1=0.2106
K=2000, Top_N=5: Precision=0.1507, Recall=0.3419, F1=0.1968
K=2000, Top_N=10: Precision=0.1059, Recall=0.4444, F1=0.1627
K=2000, Top_N=15: Precision=0.0845, Recall=0.5039, F1=0.1387
K=2000, Top_N=20: Precision=0.0714, Recall=0.5456, F1=0.1217
K=2000, Top_N=30: Precision=0.0559, Recall=0.6054, F1=0.0994
K=2000, Top_N=50: Precision=0.

In [4]:
#PROPER - e5_mnrl_DDP_new/epoch_2
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Select the device used by the SentenceTransformer model.
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Value assigned to the model's max_seq_length (256; an earlier setting was 128).
max_seq_length = 256

# Checkpoint directory of the bi-encoder under evaluation.
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_new/epoch_2"

# Instantiate the bi-encoder and move it to the GPU when one was selected.
model = SentenceTransformer(bi_encoder_model)
model.max_seq_length = max_seq_length
if device != "cpu":
    model = model.to(device)

# Legacy attributes come from 'defs.tsv'; missing cells become empty strings.
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')
for column in ('label', 'definition'):
    legacy_df[column] = legacy_df[column].fillna('')

# One "passage: [label]. [definition]" string per legacy attribute.
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode the passages in slices of batch_size, normalizing each embedding.
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes_with_prefix[start:start + batch_size],
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes_with_prefix), batch_size)
])

# Flat inner-product FAISS index (cosine similarity for normalized vectors).
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Query sentences from 'golden_dataset_sentences.csv', with 'Job_ID' exposed as 'job_id'.
sentences_df = pd.read_csv(
    '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'
).rename(columns={'Job_ID': 'job_id'})

# Every sentence gets the "query: " prefix before encoding.
sentences_with_prefix = [f"query: {text}" for text in sentences_df['Sentences']]
sentence_embeddings = np.vstack([
    model.encode(
        sentences_with_prefix[start:start + batch_size],
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_with_prefix), batch_size)
])

# Ground truth: per job_id, the sorted unique display names of POSITIVE rows.
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
unique_names = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score

    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``. Predictions of all sentences sharing a
    ``job_id`` are pooled into one set before scoring.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``: the per-job metrics
        averaged over the jobs in ``ground_truth``.
    """
    # Only the hit positions matter for ranking; scores are not used.
    _, I_faiss = index.search(sentence_embeddings, K)

    # Normalize every label once instead of doing a pandas scalar lookup plus
    # normalization for each individual hit.
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # Pool the labels of the top-N hits of every sentence per job
    job_predictions = {}
    for job_id, hit_ids in zip(sentences_df['job_id'], I_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos in hit_ids[:max(top_n_results, 0)]:
            # Negative positions are skipped, as in the original check.
            if idx_pos >= 0:
                predicted.add(normalized_labels[idx_pos])

    # Per-job precision / recall / F1
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity threshold.

    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``. A retrieved label counts as predicted
    for a job when its score is strictly greater than ``threshold``.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity score for a hit to count

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``: the per-job metrics
        averaged over the jobs in ``ground_truth``.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Normalize every label once instead of doing a pandas scalar lookup plus
    # normalization for each individual hit.
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label']]

    # Pool the labels of all above-threshold hits of every sentence per job
    job_predictions = {}
    for job_id, hit_ids, hit_scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos, similarity in zip(hit_ids, hit_scores):
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(normalized_labels[idx_pos])

    # Per-job precision / recall / F1
    precisions = []
    recalls = []
    f1s = []
    for job_id, attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in attrs}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Grid of settings swept by both evaluation approaches.
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]  # similarity cut-offs to try
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200]  # rank cut-offs to try

# Approach 1: keep every hit whose score exceeds the threshold.
print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
threshold_results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
        threshold_results.append(
            dict(K=K, Threshold=threshold, Precision=avg_precision, Recall=avg_recall, F1=avg_f1)
        )
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Approach 2: keep the first top_n hits regardless of score.
print("\n=== EVALUATION APPROACH 2: RANKING-BASED ===")
ranking_results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for top_n in top_n_values:
        # A cut-off deeper than the retrieval depth cannot be evaluated.
        if top_n > K:
            continue
        avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
        ranking_results.append(
            dict(K=K, Top_N=top_n, Precision=avg_precision, Recall=avg_recall, F1=avg_f1)
        )
        print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Pick the best row of each result table, once by recall and once by F1.
best_threshold_recall = max(threshold_results, key=lambda entry: entry['Recall'])
best_ranking_recall = max(ranking_results, key=lambda entry: entry['Recall'])
best_threshold_f1 = max(threshold_results, key=lambda entry: entry['F1'])
best_ranking_f1 = max(ranking_results, key=lambda entry: entry['F1'])

print("\n=== BEST CONFIGURATIONS ===")
# (heading, best row, label printed for the tuning knob, key holding the knob)
best_reports = (
    ("\nBest Threshold Configuration for Recall:", best_threshold_recall, "T", "Threshold"),
    ("\nBest Ranking Configuration for Recall:", best_ranking_recall, "Top_N", "Top_N"),
    ("\nBest Threshold Configuration for F1:", best_threshold_f1, "T", "Threshold"),
    ("\nBest Ranking Configuration for F1:", best_ranking_f1, "Top_N", "Top_N"),
)
for heading, best, knob_name, knob_key in best_reports:
    print(heading)
    print(f"K={best['K']}, {knob_name}={best[knob_key]}")
    print(f"Precision={best['Precision']:.4f}, Recall={best['Recall']:.4f}, F1={best['F1']:.4f}")

# Head-to-head verdict; a tie is reported in favour of the threshold method.
print("\n=== COMPARISON OF METHODS ===")
ranking_wins_recall = best_ranking_recall['Recall'] > best_threshold_recall['Recall']
print(
    "Ranking-based method achieved better recall"
    if ranking_wins_recall
    else "Threshold-based method achieved better recall"
)

ranking_wins_f1 = best_ranking_f1['F1'] > best_threshold_f1['F1']
print(
    "Ranking-based method achieved better F1 score"
    if ranking_wins_f1
    else "Threshold-based method achieved better F1 score"
)

You try to use a model that was created with version 4.1.0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Using GPU for SentenceTransformer: NVIDIA A10G

=== EVALUATION APPROACH 1: THRESHOLD-BASED ===

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.2606, Recall=0.0227, F1=0.0385
K=30, T=0.85: Precision=0.2331, Recall=0.0519, F1=0.0748
K=30, T=0.8: Precision=0.1923, Recall=0.0945, F1=0.1098
K=30, T=0.7: Precision=0.1204, Recall=0.2140, F1=0.1389
K=30, T=0.5: Precision=0.0525, Recall=0.4529, F1=0.0908

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.2604, Recall=0.0227, F1=0.0385
K=50, T=0.85: Precision=0.2310, Recall=0.0522, F1=0.0744
K=50, T=0.8: Precision=0.1888, Recall=0.0961, F1=0.1088
K=50, T=0.7: Precision=0.1154, Recall=0.2202, F1=0.1354
K=50, T=0.5: Precision=0.0424, Recall=0.4978, F1=0.0758

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.2604, Recall=0.0227, F1=0.0385
K=80, T=0.85: Precision=0.2309, Recall=0.0522, F1=0.0743
K=80, T=0.8: Precision=0.1874, Recall=0.0964, F1=0.1080
K=80, T=0.7: Precision=0.1123, Recall=0.2228, F1=0.1326
K=80, T=0.5: Precision=0.0357, Recall

In [5]:
#PROPER - e5_mnrl_DDP_new/epoch_3
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Select the device used by the SentenceTransformer model.
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Value assigned to the model's max_seq_length (256; an earlier setting was 128).
max_seq_length = 256

# Checkpoint directory of the bi-encoder under evaluation.
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_new/epoch_3"

# Instantiate the bi-encoder and move it to the GPU when one was selected.
model = SentenceTransformer(bi_encoder_model)
model.max_seq_length = max_seq_length
if device != "cpu":
    model = model.to(device)

# Legacy attributes come from 'defs.tsv'; missing cells become empty strings.
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')
for column in ('label', 'definition'):
    legacy_df[column] = legacy_df[column].fillna('')

# One "passage: [label]. [definition]" string per legacy attribute.
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode the passages in slices of batch_size, normalizing each embedding.
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes_with_prefix[start:start + batch_size],
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes_with_prefix), batch_size)
])

# Flat inner-product FAISS index (cosine similarity for normalized vectors).
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Query sentences from 'golden_dataset_sentences.csv', with 'Job_ID' exposed as 'job_id'.
sentences_df = pd.read_csv(
    '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'
).rename(columns={'Job_ID': 'job_id'})

# Every sentence gets the "query: " prefix before encoding.
sentences_with_prefix = [f"query: {text}" for text in sentences_df['Sentences']]
sentence_embeddings = np.vstack([
    model.encode(
        sentences_with_prefix[start:start + batch_size],
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_with_prefix), batch_size)
])

# Ground truth: per job_id, the sorted unique display names of POSITIVE rows.
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
unique_names = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    The labels retrieved for every sentence of a job are pooled into one
    prediction set per job and compared with that job's ground-truth labels.
    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        ``ground_truth``; (0.0, 0.0, 0.0) when ``ground_truth`` is empty.
    """
    # Only the neighbour ids matter here; the similarity scores are not used.
    _, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out once instead of a pandas .iloc lookup per hit.
    legacy_labels = legacy_df['label'].tolist()
    top_n = max(top_n_results, 0)  # a negative N selects nothing

    # Pool the predicted labels of all sentences that share a job id.
    job_predictions = {}
    for job_id, neighbours in zip(sentences_df['job_id'], I_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos in neighbours[:top_n]:
            if idx_pos >= 0:  # skip negative ids (FAISS fills missing neighbours with -1)
                predicted.add(normalize_attribute(legacy_labels[idx_pos]))

    # Per-job precision / recall / F1 on label sets
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN with a RuntimeWarning; report zeros instead.
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    For every sentence the K nearest legacy attributes are retrieved; a label is
    predicted when its similarity is strictly greater than ``threshold``.
    Predictions are pooled per job id and compared with that job's ground truth.
    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity for a hit to count as a prediction

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        ``ground_truth``; (0.0, 0.0, 0.0) when ``ground_truth`` is empty.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out once instead of a pandas .iloc lookup per hit.
    legacy_labels = legacy_df['label'].tolist()

    # Pool the predicted labels of all sentences that share a job id.
    job_predictions = {}
    for job_id, neighbours, scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos, similarity in zip(neighbours, scores):
            # skip negative ids (FAISS fills missing neighbours with -1)
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(normalize_attribute(legacy_labels[idx_pos]))

    # Per-job precision / recall / F1 on label sets
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN with a RuntimeWarning; report zeros instead.
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Parameter grids swept by both evaluation approaches
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]  # Keeping your original thresholds
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100, 150, 200]  # Top N results to consider

# --- Approach 1: every (K, threshold) combination ---
print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
threshold_results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
        threshold_results.append(
            {'K': K, 'Threshold': threshold, 'Precision': avg_precision, 'Recall': avg_recall, 'F1': avg_f1}
        )
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# --- Approach 2: every (K, top_n) combination with top_n <= K ---
print("\n=== EVALUATION APPROACH 2: RANKING-BASED ===")
ranking_results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for top_n in top_n_values:
        if top_n > K:  # Can't have top_n greater than K
            continue
        avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
        ranking_results.append(
            {'K': K, 'Top_N': top_n, 'Precision': avg_precision, 'Recall': avg_recall, 'F1': avg_f1}
        )
        print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Pick the best row per approach, once by recall and once by F1
# (max() keeps the first row on ties).
best_threshold_recall, best_threshold_f1 = (
    max(threshold_results, key=lambda row, metric=metric: row[metric]) for metric in ('Recall', 'F1')
)
best_ranking_recall, best_ranking_f1 = (
    max(ranking_results, key=lambda row, metric=metric: row[metric]) for metric in ('Recall', 'F1')
)

print("\n=== BEST CONFIGURATIONS ===")
print("\nBest Threshold Configuration for Recall:")
print(f"K={best_threshold_recall['K']}, T={best_threshold_recall['Threshold']}")
print(f"Precision={best_threshold_recall['Precision']:.4f}, Recall={best_threshold_recall['Recall']:.4f}, F1={best_threshold_recall['F1']:.4f}")

print("\nBest Ranking Configuration for Recall:")
print(f"K={best_ranking_recall['K']}, Top_N={best_ranking_recall['Top_N']}")
print(f"Precision={best_ranking_recall['Precision']:.4f}, Recall={best_ranking_recall['Recall']:.4f}, F1={best_ranking_recall['F1']:.4f}")

print("\nBest Threshold Configuration for F1:")
print(f"K={best_threshold_f1['K']}, T={best_threshold_f1['Threshold']}")
print(f"Precision={best_threshold_f1['Precision']:.4f}, Recall={best_threshold_f1['Recall']:.4f}, F1={best_threshold_f1['F1']:.4f}")

print("\nBest Ranking Configuration for F1:")
print(f"K={best_ranking_f1['K']}, Top_N={best_ranking_f1['Top_N']}")
print(f"Precision={best_ranking_f1['Precision']:.4f}, Recall={best_ranking_f1['Recall']:.4f}, F1={best_ranking_f1['F1']:.4f}")

# Head-to-head comparison; the ranking method is named only when strictly better.
print("\n=== COMPARISON OF METHODS ===")
print(
    "Ranking-based method achieved better recall"
    if best_ranking_recall['Recall'] > best_threshold_recall['Recall']
    else "Threshold-based method achieved better recall"
)
print(
    "Ranking-based method achieved better F1 score"
    if best_ranking_f1['F1'] > best_threshold_f1['F1']
    else "Threshold-based method achieved better F1 score"
)

You try to use a model that was created with version 4.1.0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Using GPU for SentenceTransformer: NVIDIA A10G

=== EVALUATION APPROACH 1: THRESHOLD-BASED ===

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.2353, Recall=0.0642, F1=0.0875
K=30, T=0.85: Precision=0.1652, Recall=0.1324, F1=0.1260
K=30, T=0.8: Precision=0.1106, Recall=0.2052, F1=0.1287
K=30, T=0.7: Precision=0.0596, Recall=0.3560, F1=0.0981
K=30, T=0.5: Precision=0.0441, Recall=0.4521, F1=0.0780

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.2335, Recall=0.0643, F1=0.0870
K=50, T=0.85: Precision=0.1622, Recall=0.1341, F1=0.1248
K=50, T=0.8: Precision=0.1063, Recall=0.2116, F1=0.1256
K=50, T=0.7: Precision=0.0524, Recall=0.3860, F1=0.0890
K=50, T=0.5: Precision=0.0340, Recall=0.5185, F1=0.0624

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.2333, Recall=0.0644, F1=0.0870
K=80, T=0.85: Precision=0.1612, Recall=0.1344, F1=0.1243
K=80, T=0.8: Precision=0.1037, Recall=0.2144, F1=0.1233
K=80, T=0.7: Precision=0.0477, Recall=0.4055, F1=0.0825
K=80, T=0.5: Precision=0.0263, Recall

In [2]:
#PROPER - e5_mnrl_DDP_with_prefixes_4gpu/epoch_1
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Check for GPU availability for the SentenceTransformer model
if torch.cuda.is_available():
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"
else:
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"

# Set max sequence length to 256 (increased from 128)
max_seq_length = 256

# Bi encoder model path
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_with_prefixes_4gpu/epoch_1"

# Load the bi-encoder model
model = SentenceTransformer(bi_encoder_model)
model.max_seq_length = max_seq_length  # Set max sequence length
if torch.cuda.is_available():
    model = model.to(device)

# Load and prepare legacy attributes from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Handle NaN values by replacing with empty strings
legacy_df['label'] = legacy_df['label'].fillna('')
legacy_df['definition'] = legacy_df['definition'].fillna('')

# Create properly formatted passages for the legacy attributes
# Format: "passage: [label]. [definition]" to avoid nested prefixes
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode with normalization, batch for large datasets
batch_size = 128


def _encode_in_batches(texts):
    """Encode *texts* in slices of ``batch_size`` and stack the results row-wise.

    Uses the module-level ``model``; every slice is encoded with
    ``normalize_embeddings=True``.
    """
    # Ensure normalization is consistent with training
    return np.vstack([
        model.encode(texts[start:start + batch_size], normalize_embeddings=True)
        for start in range(0, len(texts), batch_size)
    ])


legacy_embeddings = _encode_in_batches(legacy_attributes_with_prefix)

# Create FAISS index
dimension = legacy_embeddings.shape[1]

# Use simple FAISS index optimized for inner product (cosine similarity with normalized vectors)
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Load sentences from 'golden_dataset_sentences.csv'
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')

# Rename 'Job_ID' to 'job_id' for consistency
sentences_df.rename(columns={'Job_ID': 'job_id'}, inplace=True)

# Format queries with proper prefix. A missing sentence becomes an empty query
# instead of being formatted as the literal text "nan".
sentences_with_prefix = [f"query: {sentence}" for sentence in sentences_df['Sentences'].fillna('')]

# Encode sentences in batches (one embedding row per row of sentences_df)
sentence_embeddings = _encode_in_batches(sentences_with_prefix)

# Load ground truth data
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Clean and filter ground truth labels
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
# Keep only POSITIVE rows that actually carry a display name: a NaN name cannot
# be sorted together with strings and has no .lower() for normalize_attribute().
positive_labels = labels_df[(labels_df['label'] == 'POSITIVE') & labels_df['display_name'].notna()]
# job_id -> sorted list of unique positive display names
ground_truth = positive_labels.groupby('job_id')['display_name'].unique().apply(lambda names: sorted(names)).to_dict()

def normalize_attribute(attr):
    """Return *attr* lower-cased with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    The labels retrieved for every sentence of a job are pooled into one
    prediction set per job and compared with that job's ground-truth labels.
    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        ``ground_truth``; (0.0, 0.0, 0.0) when ``ground_truth`` is empty.
    """
    # Only the neighbour ids matter here; the similarity scores are not used.
    _, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out once instead of a pandas .iloc lookup per hit.
    legacy_labels = legacy_df['label'].tolist()
    top_n = max(top_n_results, 0)  # a negative N selects nothing

    # Pool the predicted labels of all sentences that share a job id.
    job_predictions = {}
    for job_id, neighbours in zip(sentences_df['job_id'], I_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos in neighbours[:top_n]:
            if idx_pos >= 0:  # skip negative ids (FAISS fills missing neighbours with -1)
                predicted.add(normalize_attribute(legacy_labels[idx_pos]))

    # Per-job precision / recall / F1 on label sets
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN with a RuntimeWarning; report zeros instead.
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    For every sentence the K nearest legacy attributes are retrieved; a label is
    predicted when its similarity is strictly greater than ``threshold``.
    Predictions are pooled per job id and compared with that job's ground truth.
    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity for a hit to count as a prediction

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        ``ground_truth``; (0.0, 0.0, 0.0) when ``ground_truth`` is empty.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out once instead of a pandas .iloc lookup per hit.
    legacy_labels = legacy_df['label'].tolist()

    # Pool the predicted labels of all sentences that share a job id.
    job_predictions = {}
    for job_id, neighbours, scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos, similarity in zip(neighbours, scores):
            # skip negative ids (FAISS fills missing neighbours with -1)
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(normalize_attribute(legacy_labels[idx_pos]))

    # Per-job precision / recall / F1 on label sets
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN with a RuntimeWarning; report zeros instead.
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Parameter grids swept by both evaluation approaches
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]  # Keeping your original thresholds
top_n_values = [1, 3, 5, 10, 15, 20, 30]  # Top N results to consider

# --- Approach 1: every (K, threshold) combination ---
print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
threshold_results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
        threshold_results.append(
            {'K': K, 'Threshold': threshold, 'Precision': avg_precision, 'Recall': avg_recall, 'F1': avg_f1}
        )
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# --- Approach 2: every (K, top_n) combination with top_n <= K ---
print("\n=== EVALUATION APPROACH 2: RANKING-BASED ===")
ranking_results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for top_n in top_n_values:
        if top_n > K:  # Can't have top_n greater than K
            continue
        avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
        ranking_results.append(
            {'K': K, 'Top_N': top_n, 'Precision': avg_precision, 'Recall': avg_recall, 'F1': avg_f1}
        )
        print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Pick the best row per approach, once by recall and once by F1
# (max() keeps the first row on ties).
best_threshold_recall, best_threshold_f1 = (
    max(threshold_results, key=lambda row, metric=metric: row[metric]) for metric in ('Recall', 'F1')
)
best_ranking_recall, best_ranking_f1 = (
    max(ranking_results, key=lambda row, metric=metric: row[metric]) for metric in ('Recall', 'F1')
)

print("\n=== BEST CONFIGURATIONS ===")
print("\nBest Threshold Configuration for Recall:")
print(f"K={best_threshold_recall['K']}, T={best_threshold_recall['Threshold']}")
print(f"Precision={best_threshold_recall['Precision']:.4f}, Recall={best_threshold_recall['Recall']:.4f}, F1={best_threshold_recall['F1']:.4f}")

print("\nBest Ranking Configuration for Recall:")
print(f"K={best_ranking_recall['K']}, Top_N={best_ranking_recall['Top_N']}")
print(f"Precision={best_ranking_recall['Precision']:.4f}, Recall={best_ranking_recall['Recall']:.4f}, F1={best_ranking_recall['F1']:.4f}")

print("\nBest Threshold Configuration for F1:")
print(f"K={best_threshold_f1['K']}, T={best_threshold_f1['Threshold']}")
print(f"Precision={best_threshold_f1['Precision']:.4f}, Recall={best_threshold_f1['Recall']:.4f}, F1={best_threshold_f1['F1']:.4f}")

print("\nBest Ranking Configuration for F1:")
print(f"K={best_ranking_f1['K']}, Top_N={best_ranking_f1['Top_N']}")
print(f"Precision={best_ranking_f1['Precision']:.4f}, Recall={best_ranking_f1['Recall']:.4f}, F1={best_ranking_f1['F1']:.4f}")

# Head-to-head comparison; the ranking method is named only when strictly better.
print("\n=== COMPARISON OF METHODS ===")
print(
    "Ranking-based method achieved better recall"
    if best_ranking_recall['Recall'] > best_threshold_recall['Recall']
    else "Threshold-based method achieved better recall"
)
print(
    "Ranking-based method achieved better F1 score"
    if best_ranking_f1['F1'] > best_threshold_f1['F1']
    else "Threshold-based method achieved better F1 score"
)

Using GPU for SentenceTransformer: NVIDIA A10G

=== EVALUATION APPROACH 1: THRESHOLD-BASED ===

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.2610, Recall=0.0196, F1=0.0343
K=30, T=0.85: Precision=0.2295, Recall=0.0496, F1=0.0724
K=30, T=0.8: Precision=0.1707, Recall=0.0955, F1=0.1079
K=30, T=0.7: Precision=0.0987, Recall=0.2253, F1=0.1253
K=30, T=0.5: Precision=0.0471, Recall=0.4509, F1=0.0828

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.2610, Recall=0.0196, F1=0.0343
K=50, T=0.85: Precision=0.2283, Recall=0.0496, F1=0.0719
K=50, T=0.8: Precision=0.1674, Recall=0.0964, F1=0.1064
K=50, T=0.7: Precision=0.0930, Recall=0.2331, F1=0.1204
K=50, T=0.5: Precision=0.0371, Recall=0.5030, F1=0.0673

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.2610, Recall=0.0196, F1=0.0343
K=80, T=0.85: Precision=0.2283, Recall=0.0496, F1=0.0719
K=80, T=0.8: Precision=0.1660, Recall=0.0965, F1=0.1055
K=80, T=0.7: Precision=0.0902, Recall=0.2377, F1=0.1177
K=80, T=0.5: Precision=0.0305, Recall

In [4]:
!pip install \
    pandas==2.1.4 \
    scipy==1.11.4 \
    scikit-learn==1.3.2 \
    tensorflow==2.15.0 \
    transformers==4.37.2 \
    sentence-transformers==2.7.0

Looking in indexes: https://pypi.org/simple, https://nexus.corp.indeed.com/repository/pypi/simple
Collecting pandas==2.1.4
  Downloading https://nexus.corp.indeed.com/repository/pypi/packages/pandas/2.1.4/pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting scipy==1.11.4
  Downloading https://nexus.corp.indeed.com/repository/pypi/packages/scipy/1.11.4/scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scikit-learn==1.3.2
  Downloading https://nexus.corp.indeed.com/repository/pypi/packages/scikit-learn/1.3.2/scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━

In [1]:
import numpy as np
import pandas as pd
# Report the installed versions; expected: NumPy 1.26.4, Pandas 2.1.4
for package_name, package in (("NumPy", np), ("Pandas", pd)):
    print(f"{package_name} version: {package.__version__}")

NumPy version: 1.26.4
Pandas version: 2.1.4


In [6]:
#PROPER - e5_mnrl_DDP_with_prefixes/epoch_2_subset
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch
import gc

# Check for GPU availability for the SentenceTransformer model
if torch.cuda.is_available():
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"
else:
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"

# Set max sequence length to 256 (increased from 128)
max_seq_length = 256

# Bi encoder model path
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_with_prefixes/epoch_2_subset"

# Load the bi-encoder model
model = SentenceTransformer(bi_encoder_model)
model.max_seq_length = max_seq_length  # Set max sequence length
if torch.cuda.is_available():
    model = model.to(device)

# Load and prepare legacy attributes from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Handle NaN values by replacing with empty strings
legacy_df['label'] = legacy_df['label'].fillna('')
legacy_df['definition'] = legacy_df['definition'].fillna('')

# Create properly formatted passages for the legacy attributes
# Format: "passage: [label]. [definition]" to avoid nested prefixes
legacy_attributes_with_prefix = [
    f"passage: {label}. {definition}"
    for label, definition in zip(legacy_df['label'], legacy_df['definition'])
]

# Encode with normalization, batch for large datasets
batch_size = 128


def _encode_in_batches(texts):
    """Encode *texts* in slices of ``batch_size`` and stack the results row-wise.

    Uses the module-level ``model``; every slice is encoded with
    ``normalize_embeddings=True``.
    """
    # Ensure normalization is consistent with training
    return np.vstack([
        model.encode(texts[start:start + batch_size], normalize_embeddings=True)
        for start in range(0, len(texts), batch_size)
    ])


legacy_embeddings = _encode_in_batches(legacy_attributes_with_prefix)

# Create FAISS index
dimension = legacy_embeddings.shape[1]

# Use simple FAISS index optimized for inner product (cosine similarity with normalized vectors)
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Load sentences from 'golden_dataset_sentences.csv'
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')

# Rename 'Job_ID' to 'job_id' for consistency
sentences_df.rename(columns={'Job_ID': 'job_id'}, inplace=True)

# Format queries with proper prefix. A missing sentence becomes an empty query
# instead of being formatted as the literal text "nan".
sentences_with_prefix = [f"query: {sentence}" for sentence in sentences_df['Sentences'].fillna('')]

# Encode sentences in batches (one embedding row per row of sentences_df)
sentence_embeddings = _encode_in_batches(sentences_with_prefix)

# Load ground truth data
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Clean and filter ground truth labels
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
# Keep only POSITIVE rows that actually carry a display name: a NaN name cannot
# be sorted together with strings and has no .lower() for normalize_attribute().
positive_labels = labels_df[(labels_df['label'] == 'POSITIVE') & labels_df['display_name'].notna()]
# job_id -> sorted list of unique positive display names
ground_truth = positive_labels.groupby('job_id')['display_name'].unique().apply(lambda names: sorted(names)).to_dict()

def normalize_attribute(attr):
    """Return *attr* lower-cased with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Modified evaluation approach that uses relative ranking instead of absolute thresholds
def get_metrics_by_ranking(K, top_n_results):
    """
    Evaluate using relative ranking - take top N results per query regardless of score.

    The labels retrieved for every sentence of a job are pooled into one
    prediction set per job and compared with that job's ground-truth labels.
    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        top_n_results: Number of top results to consider as "positive" predictions

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        ``ground_truth``; (0.0, 0.0, 0.0) when ``ground_truth`` is empty.
    """
    # Only the neighbour ids matter here; the similarity scores are not used.
    _, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out once instead of a pandas .iloc lookup per hit.
    legacy_labels = legacy_df['label'].tolist()
    top_n = max(top_n_results, 0)  # a negative N selects nothing

    # Pool the predicted labels of all sentences that share a job id.
    job_predictions = {}
    for job_id, neighbours in zip(sentences_df['job_id'], I_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos in neighbours[:top_n]:
            if idx_pos >= 0:  # skip negative ids (FAISS fills missing neighbours with -1)
                predicted.add(normalize_attribute(legacy_labels[idx_pos]))

    # Per-job precision / recall / F1 on label sets
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN with a RuntimeWarning; report zeros instead.
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Keep the original threshold-based evaluation for comparison
def get_metrics_by_threshold(K, threshold):
    """
    Evaluate using an absolute similarity cut-off.

    For every sentence the K nearest legacy attributes are retrieved; a label is
    predicted when its similarity is strictly greater than ``threshold``.
    Predictions are pooled per job id and compared with that job's ground truth.
    Reads the module-level ``index``, ``sentence_embeddings``, ``sentences_df``,
    ``legacy_df`` and ``ground_truth``.

    Args:
        K: Number of results to retrieve with FAISS
        threshold: Minimum (exclusive) similarity for a hit to count as a prediction

    Returns:
        (avg_precision, avg_recall, avg_f1) averaged over the job ids in
        ``ground_truth``; (0.0, 0.0, 0.0) when ``ground_truth`` is empty.
    """
    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out once instead of a pandas .iloc lookup per hit.
    legacy_labels = legacy_df['label'].tolist()

    # Pool the predicted labels of all sentences that share a job id.
    job_predictions = {}
    for job_id, neighbours, scores in zip(sentences_df['job_id'], I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        for idx_pos, similarity in zip(neighbours, scores):
            # skip negative ids (FAISS fills missing neighbours with -1)
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(normalize_attribute(legacy_labels[idx_pos]))

    # Per-job precision / recall / F1 on label sets
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = job_predictions.get(job_id, set())

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean([]) would yield NaN with a RuntimeWarning; report zeros instead.
    if not precisions:
        return 0.0, 0.0, 0.0

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Parameter grids swept by both evaluation approaches
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]  # Keeping your original thresholds
top_n_values = [1, 3, 5, 10, 15, 20, 30, 50, 80, 100]  # Top N results to consider

# --- Approach 1: every (K, threshold) combination ---
print("\n=== EVALUATION APPROACH 1: THRESHOLD-BASED ===")
threshold_results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics_by_threshold(K, threshold)
        threshold_results.append(
            {'K': K, 'Threshold': threshold, 'Precision': avg_precision, 'Recall': avg_recall, 'F1': avg_f1}
        )
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# --- Approach 2: every (K, top_n) combination with top_n <= K ---
print("\n=== EVALUATION APPROACH 2: RANKING-BASED ===")
ranking_results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for top_n in top_n_values:
        if top_n > K:  # Can't have top_n greater than K
            continue
        avg_precision, avg_recall, avg_f1 = get_metrics_by_ranking(K, top_n)
        ranking_results.append(
            {'K': K, 'Top_N': top_n, 'Precision': avg_precision, 'Recall': avg_recall, 'F1': avg_f1}
        )
        print(f"K={K}, Top_N={top_n}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Pick the best row per approach, once by recall and once by F1
# (max() keeps the first row on ties).
best_threshold_recall, best_threshold_f1 = (
    max(threshold_results, key=lambda row, metric=metric: row[metric]) for metric in ('Recall', 'F1')
)
best_ranking_recall, best_ranking_f1 = (
    max(ranking_results, key=lambda row, metric=metric: row[metric]) for metric in ('Recall', 'F1')
)

print("\n=== BEST CONFIGURATIONS ===")
print("\nBest Threshold Configuration for Recall:")
print(f"K={best_threshold_recall['K']}, T={best_threshold_recall['Threshold']}")
print(f"Precision={best_threshold_recall['Precision']:.4f}, Recall={best_threshold_recall['Recall']:.4f}, F1={best_threshold_recall['F1']:.4f}")

print("\nBest Ranking Configuration for Recall:")
print(f"K={best_ranking_recall['K']}, Top_N={best_ranking_recall['Top_N']}")
print(f"Precision={best_ranking_recall['Precision']:.4f}, Recall={best_ranking_recall['Recall']:.4f}, F1={best_ranking_recall['F1']:.4f}")

print("\nBest Threshold Configuration for F1:")
print(f"K={best_threshold_f1['K']}, T={best_threshold_f1['Threshold']}")
print(f"Precision={best_threshold_f1['Precision']:.4f}, Recall={best_threshold_f1['Recall']:.4f}, F1={best_threshold_f1['F1']:.4f}")

print("\nBest Ranking Configuration for F1:")
print(f"K={best_ranking_f1['K']}, Top_N={best_ranking_f1['Top_N']}")
print(f"Precision={best_ranking_f1['Precision']:.4f}, Recall={best_ranking_f1['Recall']:.4f}, F1={best_ranking_f1['F1']:.4f}")

# Head-to-head comparison; the ranking method is named only when strictly better.
print("\n=== COMPARISON OF METHODS ===")
print(
    "Ranking-based method achieved better recall"
    if best_ranking_recall['Recall'] > best_threshold_recall['Recall']
    else "Threshold-based method achieved better recall"
)
print(
    "Ranking-based method achieved better F1 score"
    if best_ranking_f1['F1'] > best_threshold_f1['F1']
    else "Threshold-based method achieved better F1 score"
)

Using GPU for SentenceTransformer: NVIDIA A10G

=== EVALUATION APPROACH 1: THRESHOLD-BASED ===

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.0455, Recall=0.0008, F1=0.0017
K=30, T=0.85: Precision=0.2453, Recall=0.0063, F1=0.0122
K=30, T=0.8: Precision=0.3056, Recall=0.0166, F1=0.0304
K=30, T=0.7: Precision=0.3163, Recall=0.0727, F1=0.1027
K=30, T=0.5: Precision=0.1071, Recall=0.3985, F1=0.1574

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.0455, Recall=0.0008, F1=0.0017
K=50, T=0.85: Precision=0.2453, Recall=0.0063, F1=0.0122
K=50, T=0.8: Precision=0.3056, Recall=0.0166, F1=0.0304
K=50, T=0.7: Precision=0.3160, Recall=0.0727, F1=0.1025
K=50, T=0.5: Precision=0.0974, Recall=0.4127, F1=0.1460

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.0455, Recall=0.0008, F1=0.0017
K=80, T=0.85: Precision=0.2453, Recall=0.0063, F1=0.0122
K=80, T=0.8: Precision=0.3056, Recall=0.0166, F1=0.0304
K=80, T=0.7: Precision=0.3160, Recall=0.0727, F1=0.1025
K=80, T=0.5: Precision=0.0925, Recall

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Pick the device for the SentenceTransformer model: the first CUDA GPU when
# torch can see one, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("No GPUs available for SentenceTransformer, using CPU")
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")

# Path of the bi-encoder checkpoint evaluated in this cell
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_with_prefixes_4gpu/epoch_1"

# Load the bi-encoder and move it onto the GPU when one was selected above
model = SentenceTransformer(bi_encoder_model)
if device != "cpu":
    model = model.to(device)

# Read the legacy attribute definitions (tab-separated) from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Replace missing labels/definitions with empty strings so the string
# concatenation below never meets NaN
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text used for semantic search, formatted as "label: definition"
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Legacy attributes are the documents being retrieved, so each one carries
# the "passage: " prefix
legacy_attributes_with_prefix = ["passage: " + attribute_text for attribute_text in legacy_attributes]

# Encode in fixed-size batches with normalized outputs and stack the
# per-batch arrays into a single matrix
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes_with_prefix[start:start + batch_size],
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes_with_prefix), batch_size)
])

# Flat inner-product FAISS index; with normalized vectors the inner product
# is the cosine similarity
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Read the sentences to be matched from 'golden_dataset_sentences.csv'
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')

# Use the same 'job_id' column name as the labels file
sentences_df = sentences_df.rename(columns={'Job_ID': 'job_id'})

# Sentences act as the queries that search for matching passages, so each
# one carries the "query: " prefix
sentences_with_prefix = ["query: " + text for text in sentences_df['Sentences']]

# Encode the queries batch by batch (normalized) and stack into one matrix
sentence_embeddings = np.vstack([
    model.encode(
        sentences_with_prefix[start:start + batch_size],
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_with_prefix), batch_size)
])

# Read the annotated labels of the golden dataset
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Lower-case and strip the display names, keep only rows annotated POSITIVE,
# and build {job_id: sorted list of unique display names}
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
unique_names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names_per_job.items()}

def normalize_attribute(attr):
    """Return ``attr`` lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Function to get predictions and metrics for a specific K and threshold
def get_metrics(K, threshold):
    """Score threshold-filtered FAISS retrieval against the ground truth.

    Each sentence embedding is searched against ``index`` for its ``K``
    nearest legacy attributes. Hits whose similarity is strictly greater
    than ``threshold`` add their normalized label to the prediction set of
    the sentence's job. Precision, recall and F1 are computed per job in
    ``ground_truth`` and averaged with equal weight per job.

    Relies on the module-level ``index``, ``sentence_embeddings``,
    ``sentences_df``, ``legacy_df``, ``ground_truth`` and
    ``normalize_attribute``.

    Args:
        K: Number of nearest neighbours retrieved per sentence.
        threshold: Exclusive lower bound on the similarity of a kept hit.

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``; all zeros when
        ``ground_truth`` is empty (instead of NaN from averaging nothing).
    """
    if not ground_truth:
        return 0.0, 0.0, 0.0

    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Normalize every legacy label once, rather than doing a pandas scalar
    # lookup and a normalization for each individual hit
    normalized_labels = [normalize_attribute(label) for label in legacy_df['label'].tolist()]

    # Collect the predicted label set of every job
    job_predictions = {}
    job_ids = sentences_df['job_id'].tolist()
    for job_id, neighbour_ids, similarities in zip(job_ids, I_faiss, D_faiss):
        predicted = job_predictions.setdefault(job_id, set())
        # Negative ids are the filler FAISS returns when it finds fewer than
        # K neighbours; they must not be used as list positions
        keep = (neighbour_ids >= 0) & (similarities > threshold)
        predicted.update(normalized_labels[idx] for idx in neighbour_ids[keep])

    # Per-job precision / recall / F1
    precisions = []
    recalls = []
    f1s = []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = job_predictions.get(job_id, set())  # Already normalized

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Evaluate every (K, threshold) pair of the grid
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]

results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics(K, threshold)
        result = dict(
            K=K,
            Threshold=threshold,
            Precision=avg_precision,
            Recall=avg_recall,
            F1=avg_f1,
        )
        results.append(result)
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Pick the configuration with the highest recall (the first one wins ties)
best_config = None
best_recall = -1
for candidate in results:
    if candidate['Recall'] > best_recall:
        best_recall, best_config = candidate['Recall'], candidate

print("\n=== Best Configuration for Recall ===")
print(f"K={best_config['K']}, T={best_config['Threshold']}")
print(f"Precision={best_config['Precision']:.4f}, Recall={best_config['Recall']:.4f}, F1={best_config['F1']:.4f}")

# When even the best recall misses the 0.9 target, list follow-up ideas
if best_recall < 0.9:
    print("\n".join([
        "\nNone of the tested configurations achieved 0.9 recall.",
        "Suggestions for improving recall:",
        "1. Try even lower thresholds (e.g., 0.4, 0.3, 0.2)",
        "2. Try higher K values (e.g., 300, 500, 1000)",
        "3. Consider aggregating predictions from multiple threshold levels",
        "4. Examine specific jobs with low recall to identify patterns",
    ]))

  warn(
2025-04-22 16:45:49.361898: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-22 16:45:49.361930: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-22 16:45:49.361954: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-22 16:45:49.370433: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using GPU for SentenceTransformer: NVIDIA A10G

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.2547, Recall=0.0196, F1=0.0342
K=30, T=0.85: Precision=0.2321, Recall=0.0499, F1=0.0729
K=30, T=0.8: Precision=0.1745, Recall=0.0964, F1=0.1092
K=30, T=0.7: Precision=0.0996, Recall=0.2277, F1=0.1271
K=30, T=0.5: Precision=0.0476, Recall=0.4512, F1=0.0836

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.2547, Recall=0.0196, F1=0.0342
K=50, T=0.85: Precision=0.2314, Recall=0.0499, F1=0.0727
K=50, T=0.8: Precision=0.1708, Recall=0.0970, F1=0.1074
K=50, T=0.7: Precision=0.0937, Recall=0.2356, F1=0.1222
K=50, T=0.5: Precision=0.0375, Recall=0.5040, F1=0.0681

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.2547, Recall=0.0196, F1=0.0342
K=80, T=0.85: Precision=0.2314, Recall=0.0499, F1=0.0727
K=80, T=0.8: Precision=0.1699, Recall=0.0972, F1=0.1068
K=80, T=0.7: Precision=0.0909, Recall=0.2399, F1=0.1194
K=80, T=0.5: Precision=0.0310, Recall=0.5483, F1=0.0573

=== Testing with K=100 ===
K

In [30]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os

# Path of the bi-encoder checkpoint evaluated in this cell
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test/epoch_3"

# Load the bi-encoder
model = SentenceTransformer(bi_encoder_model)

# Read the legacy attribute definitions (tab-separated) from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Replace missing labels/definitions with empty strings
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Embed "label: definition" for every legacy attribute (normalized vectors)
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']
legacy_texts = legacy_attributes.tolist()
legacy_embeddings = model.encode(legacy_texts, normalize_embeddings=True)

# Flat inner-product FAISS index over the legacy embeddings
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Read the sentences to be matched and show which columns the file has
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')
print("Columns in golden_dataset_sentences.csv:", sentences_df.columns.tolist())

# Use the same 'job_id' column name as the labels file
sentences_df = sentences_df.rename(columns={'Job_ID': 'job_id'})

# Embed every sentence (normalized vectors)
sentence_texts = sentences_df['Sentences'].tolist()
sentence_embeddings = model.encode(sentence_texts, normalize_embeddings=True)

# Set hyperparameters
K = 100   # neighbours retrieved per sentence
T = 0.6   # exclusive similarity threshold for keeping a neighbour

# --- FAISS Mechanism ---
D_faiss, I_faiss = index.search(sentence_embeddings, K)

# Plain list for label lookups (cheaper than a pandas .iloc call per hit)
label_lookup = legacy_df['label'].tolist()

# Labels kept for each sentence. The `idx >= 0` guard skips the -1 filler
# ids FAISS returns when it finds fewer than K neighbours; without it a
# negative position would silently select the last legacy label.
extracted_labels_faiss = [
    [label_lookup[idx] for idx, sim in zip(top_k_indices, top_k_similarities) if idx >= 0 and sim > T]
    for top_k_indices, top_k_similarities in zip(I_faiss, D_faiss)
]
sentences_df['extracted_labels_faiss'] = extracted_labels_faiss

# Job-level prediction: sorted union of the labels of all the job's
# sentences (a set comprehension avoids the quadratic `sum(x, [])` flatten)
predicted_attributes_faiss = (
    sentences_df.groupby('job_id')['extracted_labels_faiss']
    .apply(lambda per_sentence: sorted({label for labels in per_sentence for label in labels}))
    .to_dict()
)

# Debug: Print predicted labels for job_id '5626705964'.
# Keys are compared by their string form because a plain string lookup
# never matches when pandas has parsed the job ids as integers.
debug_job_id = '5626705964'
for candidate_job_id, candidate_labels in predicted_attributes_faiss.items():
    if str(candidate_job_id) == debug_job_id:
        print("FAISS Predicted Labels for job_id '5626705964':", candidate_labels)
        break

# --- Direct Cosine Similarity Mechanism ---
# Both embedding matrices were encoded with normalize_embeddings=True, so the
# dot product is the cosine similarity
similarity_matrix = np.dot(sentence_embeddings, legacy_embeddings.T)
# Column indices of the K largest similarities per row, best first
top_k_indices_cosine = np.argsort(similarity_matrix, axis=1)[:, -K:][:, ::-1]
top_k_similarities_cosine = np.take_along_axis(similarity_matrix, top_k_indices_cosine, axis=1)

# Plain list for label lookups (cheaper than a pandas .iloc call per hit)
cosine_label_lookup = legacy_df['label'].tolist()

# Labels kept for each sentence: neighbours whose similarity exceeds T
extracted_labels_cosine = [
    [cosine_label_lookup[idx] for idx, sim in zip(indices, similarities) if sim > T]
    for indices, similarities in zip(top_k_indices_cosine, top_k_similarities_cosine)
]
sentences_df['extracted_labels_cosine'] = extracted_labels_cosine

# Job-level prediction: sorted union of the labels of all the job's
# sentences (a set comprehension avoids the quadratic `sum(x, [])` flatten)
predicted_attributes_cosine = (
    sentences_df.groupby('job_id')['extracted_labels_cosine']
    .apply(lambda per_sentence: sorted({label for labels in per_sentence for label in labels}))
    .to_dict()
)

# Debug: Print predicted labels for job_id '5626705964'.
# Keys are compared by their string form because a plain string lookup
# never matches when pandas has parsed the job ids as integers.
cosine_debug_job_id = '5626705964'
for candidate_job_id, candidate_labels in predicted_attributes_cosine.items():
    if str(candidate_job_id) == cosine_debug_job_id:
        print("Cosine Predicted Labels for job_id '5626705964':", candidate_labels)
        break

# --- Evaluation ---
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
print("Columns in golden_dataset_labels.csv:", labels_df.columns.tolist())

# Clean and filter ground truth labels: lower-case/strip the display names,
# keep only POSITIVE rows, and map each job_id to its sorted unique names
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
ground_truth = (
    positive_labels.groupby('job_id')['display_name']
    .unique()
    .apply(lambda names: sorted(names))
    .to_dict()
)

# Debug: Print ground truth for job_id '5626705964'.
# Keys are compared by their string form because a plain string lookup
# never matches when pandas has parsed the job ids as integers.
gt_debug_job_id = '5626705964'
for gt_job_id, gt_names in ground_truth.items():
    if str(gt_job_id) == gt_debug_job_id:
        print("Ground Truth Labels for job_id '5626705964':", gt_names)
        break

def normalize_attribute(attr):
    """Return ``attr`` lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

def _precision_recall(gt_labels, predicted_labels):
    """Return (precision, recall) of two label sets; 0 where undefined."""
    hits = len(gt_labels & predicted_labels)
    predicted_total = hits + len(predicted_labels - gt_labels)  # tp + fp
    expected_total = hits + len(gt_labels - predicted_labels)   # tp + fn
    precision = hits / predicted_total if predicted_total > 0 else 0
    recall = hits / expected_total if expected_total > 0 else 0
    return precision, recall


# One result row per ground-truth job, scoring both retrieval mechanisms
output_data = []
for job_id, gt_names in ground_truth.items():
    gt = {normalize_attribute(attr) for attr in gt_names}
    faiss_labels = predicted_attributes_faiss.get(job_id, [])
    cosine_labels = predicted_attributes_cosine.get(job_id, [])

    pred_faiss = {normalize_attribute(attr) for attr in faiss_labels}
    precision_faiss, recall_faiss = _precision_recall(gt, pred_faiss)

    pred_cosine = {normalize_attribute(attr) for attr in cosine_labels}
    precision_cosine, recall_cosine = _precision_recall(gt, pred_cosine)

    output_data.append({
        "job_id": job_id,
        "ground_truth": ", ".join(sorted(gt_names)),
        "predicted_faiss": ", ".join(sorted(faiss_labels)),
        "predicted_cosine": ", ".join(sorted(cosine_labels)),
        "precision_faiss": precision_faiss,
        "recall_faiss": recall_faiss,
        "precision_cosine": precision_cosine,
        "recall_cosine": recall_cosine
    })

output_df = pd.DataFrame(output_data)

# Create the output directory if needed, then write the per-job results
output_dir = "/home/jovyan/butterfly/src/notebooks/part3_evaluations"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "prediction_results.csv")
output_df.to_csv(output_path, index=False)
print(f"CSV file saved to: {output_path}")

# Averages over jobs (every job carries the same weight)
print(f"FAISS - Average Precision: {output_df['precision_faiss'].mean():.4f}, Average Recall: {output_df['recall_faiss'].mean():.4f}")
print(f"Cosine - Average Precision: {output_df['precision_cosine'].mean():.4f}, Average Recall: {output_df['recall_cosine'].mean():.4f}")

Columns in golden_dataset_sentences.csv: ['Job_ID', 'Job_Text', 'Sentences']
Columns in golden_dataset_labels.csv: ['job_id', 'uuid', 'suid', 'display_name', 'text_snippet', 'start_index', 'labeling_task', 'label', 'isTitleSnippet']
CSV file saved to: /home/jovyan/butterfly/src/notebooks/part3_evaluations/prediction_results.csv
FAISS - Average Precision: 0.0252, Average Recall: 0.5336
Cosine - Average Precision: 0.0252, Average Recall: 0.5336


In [24]:
import pandas as pd

# Read the legacy definitions (defs.tsv) and the golden-dataset labels
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Normalized (lower-cased, stripped) label vocabulary of defs.tsv
normalized_legacy = legacy_df['label'].str.lower().str.strip()
legacy_labels = set(normalized_legacy)

# Normalized display names of the rows annotated POSITIVE
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
normalized_golden = positive_labels['display_name'].str.lower().str.strip()
golden_labels = set(normalized_golden)

# Overlap and the two one-sided differences of the vocabularies
common_labels = legacy_labels & golden_labels
golden_not_in_legacy = golden_labels.difference(legacy_labels)
legacy_not_in_golden = legacy_labels.difference(golden_labels)

# Report the sizes of each set
print(f"Total labels in legacy.tsv: {len(legacy_labels)}")
print(f"Total unique display_name values in golden_dataset_labels.csv (POSITIVE): {len(golden_labels)}")
print(f"\nCommon labels (present in both): {len(common_labels)}")
print(f"\nLabels in golden_dataset_labels.csv but not in legacy.tsv: {len(golden_not_in_legacy)}")
# To list the unmatched golden labels: print(sorted(golden_not_in_legacy))
print(f"\nLabels in legacy.tsv but not in golden_dataset_labels.csv: {len(legacy_not_in_golden)}")


Total labels in legacy.tsv: 23774
Total unique display_name values in golden_dataset_labels.csv (POSITIVE): 4251

Common labels (present in both): 3914

Labels in golden_dataset_labels.csv but not in legacy.tsv: 337

Labels in legacy.tsv but not in golden_dataset_labels.csv: 19860


In [26]:
# Read defs.tsv and report where 'label' or 'definition' is missing
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')
label_missing = legacy_df['label'].isna()
definition_missing = legacy_df['definition'].isna()
print("Number of NaN values in 'label' column:", label_missing.sum())
print("Number of NaN values in 'definition' column:", definition_missing.sum())
print("Rows with NaN in either 'label' or 'definition':")
print(legacy_df[label_missing | definition_missing])

Number of NaN values in 'label' column: 0
Number of NaN values in 'definition' column: 1647
Rows with NaN in either 'label' or 'definition':
        suid                                              label definition
16225  JBGTS         Veterinary medicine experience (1-2 years)        NaN
17857  239HK       Database architecture experience (1-2 years)        NaN
18162  39PWM  Highest single-project number of utility compa...        NaN
20630  UXBHS  Largest technology infrastructure engineering ...        NaN
20631  BY7HE  Veterinary vaccination programs developed (2-3...        NaN
...      ...                                                ...        ...
24413  MMR9X  Long-term front-end development projects contr...        NaN
24414  GJF3M  Largest medical device development budget mana...        NaN
24415  BWKGW  Contributing to long-term front-end developmen...        NaN
24416  772ZE     Implementing new investor relations strategies        NaN
24417  Q4MGA                   Bio

In [31]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
from collections import Counter

# Path of the bi-encoder checkpoint evaluated in this cell
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test/epoch_3"

# Load the bi-encoder
model = SentenceTransformer(bi_encoder_model)

# Read the legacy attribute definitions (tab-separated) from 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Replace missing labels/definitions with empty strings
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Embed "label: definition" for every legacy attribute (normalized vectors)
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']
legacy_texts = legacy_attributes.tolist()
legacy_embeddings = model.encode(legacy_texts, normalize_embeddings=True)

# Flat inner-product FAISS index; with normalized vectors the inner product
# is the cosine similarity
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Read the sentences to be matched and show which columns the file has
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')
print("Columns in golden_dataset_sentences.csv:", sentences_df.columns.tolist())

# Use the same 'job_id' column name as the labels file
sentences_df = sentences_df.rename(columns={'Job_ID': 'job_id'})

# Embed every sentence (normalized vectors)
sentence_texts = sentences_df['Sentences'].tolist()
sentence_embeddings = model.encode(sentence_texts, normalize_embeddings=True)

# Set hyperparameters - MODIFIED
K = 200  # Increased from 100 to capture more potential matches
T = 0.4   # Lowered threshold from 0.6 to 0.4 to increase recall

# --- FAISS Mechanism ---
D_faiss, I_faiss = index.search(sentence_embeddings, K)

# Plain list for label lookups (cheaper than a pandas .iloc call per hit)
score_label_lookup = legacy_df['label'].tolist()

extracted_labels_faiss = []
extracted_scores_faiss = []

for top_k_indices, top_k_similarities in zip(I_faiss, D_faiss):
    # Keep neighbours above the threshold. The `>= 0` test skips the -1
    # filler ids FAISS returns when it finds fewer than K neighbours;
    # without it a negative position would select the last legacy label.
    keep = (top_k_indices >= 0) & (top_k_similarities > T)

    # Parallel lists: the kept labels and their similarity scores
    extracted_labels_faiss.append([score_label_lookup[idx] for idx in top_k_indices[keep]])
    extracted_scores_faiss.append(list(top_k_similarities[keep]))

sentences_df['extracted_labels_faiss'] = extracted_labels_faiss
sentences_df['extracted_scores_faiss'] = extracted_scores_faiss

# Implement a weighted voting mechanism for job-level aggregation.
# A single groupby pass replaces the per-job boolean-mask scan of the whole
# frame; sort=False keeps jobs in order of first appearance.
job_predictions_faiss = {}
for job_id, job_rows in sentences_df.groupby('job_id', sort=False):
    # Sum the similarity scores each label collected across the job's sentences
    label_scores = {}
    for labels, scores in zip(job_rows['extracted_labels_faiss'], job_rows['extracted_scores_faiss']):
        for label, score in zip(labels, scores):
            label_scores[label] = label_scores.get(label, 0) + score

    # Sort by score and take all labels (no filtering at job level)
    sorted_labels = sorted(label_scores.items(), key=lambda x: x[1], reverse=True)
    job_predictions_faiss[job_id] = [label for label, _ in sorted_labels]

# --- Evaluation ---
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
print("Columns in golden_dataset_labels.csv:", labels_df.columns.tolist())

# Clean and filter ground truth labels: lower-case/strip the display names,
# keep only POSITIVE rows, and map each job_id to its sorted unique names
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
ground_truth = (
    positive_labels.groupby('job_id')['display_name']
    .unique()
    .apply(lambda names: sorted(names))
    .to_dict()
)

# Debug: Print ground truth and predictions for job_id '5626705964'.
# Keys are compared by their string form because a plain string lookup
# never matches when pandas has parsed the job ids as integers; the matched
# key is then reused for the prediction lookup.
gt_debug_job_id = '5626705964'
for gt_job_id, gt_names in ground_truth.items():
    if str(gt_job_id) == gt_debug_job_id:
        print("Ground Truth Labels for job_id '5626705964':", gt_names)
        print("Predicted Labels for job_id '5626705964':", job_predictions_faiss.get(gt_job_id, []))
        break

def normalize_attribute(attr):
    """Return ``attr`` lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# One result row per ground-truth job
output_data = []
for job_id, gt_names in ground_truth.items():
    gt = {normalize_attribute(attr) for attr in gt_names}
    predicted_labels = job_predictions_faiss.get(job_id, [])
    pred_faiss = {normalize_attribute(attr) for attr in predicted_labels}

    # |pred| = tp + fp and |gt| = tp + fn, so the set sizes give the denominators
    tp_faiss = len(gt & pred_faiss)
    precision_faiss = tp_faiss / len(pred_faiss) if pred_faiss else 0
    recall_faiss = tp_faiss / len(gt) if gt else 0
    if precision_faiss + recall_faiss > 0:
        f1_faiss = 2 * precision_faiss * recall_faiss / (precision_faiss + recall_faiss)
    else:
        f1_faiss = 0

    output_data.append({
        "job_id": job_id,
        "ground_truth": ", ".join(sorted(gt_names)),
        "predicted_faiss": ", ".join(sorted(predicted_labels)),
        "precision_faiss": precision_faiss,
        "recall_faiss": recall_faiss,
        "f1_faiss": f1_faiss
    })

output_df = pd.DataFrame(output_data)

# Create the output directory if needed, then write the per-job results
output_dir = "/home/jovyan/butterfly/src/notebooks/part3_evaluations"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "prediction_results_improved.csv")
output_df.to_csv(output_path, index=False)
print(f"CSV file saved to: {output_path}")

# Averages over jobs (every job carries the same weight)
print(f"FAISS - Average Precision: {output_df['precision_faiss'].mean():.4f}, Average Recall: {output_df['recall_faiss'].mean():.4f}")

# If recall is still not high enough, rerun with a lower threshold and more
# candidates per sentence, and evaluate that variant separately
if output_df['recall_faiss'].mean() < 0.9:
    print("Recall still below target, implementing more aggressive strategy...")

    # Very low threshold approach
    T_aggressive = 0.2  # Even lower threshold
    K_aggressive = 300  # Even more candidates

    D_faiss_agg, I_faiss_agg = index.search(sentence_embeddings, K_aggressive)

    # Plain list for label lookups (cheaper than a pandas .iloc call per hit)
    aggressive_label_lookup = legacy_df['label'].tolist()

    # Walk the sentences in row order, pairing each with its own FAISS row by
    # position. This replaces using DataFrame index labels as array offsets
    # (only valid for a default RangeIndex) and the per-job scan of the frame.
    # Dict keys act as an insertion-ordered set of labels per job.
    labels_by_job = {}
    for job_id, neighbour_ids, similarities in zip(sentences_df['job_id'].tolist(), I_faiss_agg, D_faiss_agg):
        job_labels = labels_by_job.setdefault(job_id, {})
        # The `>= 0` test skips the -1 filler ids FAISS returns when it finds
        # fewer than K neighbours
        keep = (neighbour_ids >= 0) & (similarities > T_aggressive)
        for idx in neighbour_ids[keep]:
            job_labels[aggressive_label_lookup[idx]] = None

    # Take all labels that appear at least once
    job_predictions_aggressive = {job_id: list(job_labels) for job_id, job_labels in labels_by_job.items()}

    # Evaluate aggressive approach: one result row per ground-truth job
    output_data_aggressive = []
    for job_id in ground_truth:
        gt = set(normalize_attribute(attr) for attr in ground_truth[job_id])

        pred_agg = set(normalize_attribute(attr) for attr in job_predictions_aggressive.get(job_id, []))
        tp_agg = len(gt.intersection(pred_agg))
        fp_agg = len(pred_agg - gt)
        fn_agg = len(gt - pred_agg)
        precision_agg = tp_agg / (tp_agg + fp_agg) if tp_agg + fp_agg > 0 else 0
        recall_agg = tp_agg / (tp_agg + fn_agg) if tp_agg + fn_agg > 0 else 0

        output_data_aggressive.append({
            "job_id": job_id,
            "ground_truth": ", ".join(sorted(ground_truth[job_id])),
            "predicted_aggressive": ", ".join(sorted(job_predictions_aggressive.get(job_id, []))),
            "precision_aggressive": precision_agg,
            "recall_aggressive": recall_agg,
            "f1_aggressive": 2 * precision_agg * recall_agg / (precision_agg + recall_agg) if precision_agg + recall_agg > 0 else 0
        })

    output_df_aggressive = pd.DataFrame(output_data_aggressive)
    output_path_aggressive = os.path.join(output_dir, "prediction_results_aggressive.csv")
    output_df_aggressive.to_csv(output_path_aggressive, index=False)

    print(f"Aggressive FAISS - Average Precision: {output_df_aggressive['precision_aggressive'].mean():.4f}, Average Recall: {output_df_aggressive['recall_aggressive'].mean():.4f}")

Columns in golden_dataset_sentences.csv: ['Job_ID', 'Job_Text', 'Sentences']
Columns in golden_dataset_labels.csv: ['job_id', 'uuid', 'suid', 'display_name', 'text_snippet', 'start_index', 'labeling_task', 'label', 'isTitleSnippet']
CSV file saved to: /home/jovyan/butterfly/src/notebooks/part3_evaluations/prediction_results_improved.csv
FAISS - Average Precision: 0.0144, Average Recall: 0.6944
Recall still below target, implementing more aggressive strategy...
Aggressive FAISS - Average Precision: 0.0112, Average Recall: 0.7464


In [36]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Pick the device for the SentenceTransformer model: the first CUDA GPU when
# torch can see one, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("No GPUs available for SentenceTransformer, using CPU")
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")

# Path of the bi-encoder checkpoint evaluated in this cell
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test/epoch_3"

# Load the bi-encoder and move it onto the GPU when one was selected above
print("Loading SentenceTransformer model...")
model = SentenceTransformer(bi_encoder_model)
if device != "cpu":
    model = model.to(device)

# Load and prepare legacy attributes from 'defs.tsv'
print("Loading definition data...")
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Handle NaN values by replacing with empty strings
legacy_df['label'] = legacy_df['label'].fillna('')
legacy_df['definition'] = legacy_df['definition'].fillna('')

# Create label text for semantic search (format: label: definition)
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Encode with normalization, batch for large datasets
batch_size = 128
progress_interval = 1000  # report roughly every this many encoded items
total_attributes = len(legacy_attributes)
print(f"Encoding {total_attributes} legacy attributes...")
legacy_embeddings = []
next_progress_report = progress_interval

for i in range(0, total_attributes, batch_size):
    batch = legacy_attributes[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch, normalize_embeddings=True)
    legacy_embeddings.append(batch_embeddings)

    # Report whenever the running count crosses the next interval boundary,
    # and once more at the end. The previous test, (i+batch_size) % 1000 == 0,
    # fires only at common multiples of the batch size and 1000.
    encoded_so_far = min(i + batch_size, total_attributes)
    if encoded_so_far >= next_progress_report or encoded_so_far == total_attributes:
        print(f"  Encoded {encoded_so_far}/{total_attributes} attributes")
        next_progress_report = (encoded_so_far // progress_interval + 1) * progress_interval

legacy_embeddings = np.vstack(legacy_embeddings)

# Create FAISS index
dimension = legacy_embeddings.shape[1]
print(f"Creating FAISS index with dimension {dimension}...")

# Use simple FAISS index optimized for inner product (cosine similarity with normalized vectors)
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Load sentences from 'golden_dataset_sentences.csv'
print("Loading sentence data...")
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')
print("Columns in golden_dataset_sentences.csv:", sentences_df.columns.tolist())

# Rename 'Job_ID' to 'job_id' for consistency
sentences_df.rename(columns={'Job_ID': 'job_id'}, inplace=True)

# Encode sentences in batches
total_sentences = len(sentences_df)
sentence_progress_interval = 1000  # report roughly every this many sentences
print(f"Encoding {total_sentences} sentences...")
sentence_embeddings = []
next_sentence_report = sentence_progress_interval

for i in range(0, total_sentences, batch_size):
    batch = sentences_df['Sentences'][i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch, normalize_embeddings=True)
    sentence_embeddings.append(batch_embeddings)

    # Report whenever the running count crosses the next interval boundary,
    # and once more at the end. The previous test, (i+batch_size) % 1000 == 0,
    # fires only at common multiples of the batch size and 1000.
    sentences_done = min(i + batch_size, total_sentences)
    if sentences_done >= next_sentence_report or sentences_done == total_sentences:
        print(f"  Encoded {sentences_done}/{total_sentences} sentences")
        next_sentence_report = (sentences_done // sentence_progress_interval + 1) * sentence_progress_interval

sentence_embeddings = np.vstack(sentence_embeddings)

# Recall-oriented settings: many candidates per sentence, low threshold
K = 200  # Number of candidates to retrieve per sentence
T = 0.3  # Lower threshold to improve recall

print(f"Performing FAISS search with K={K} and threshold={T}...")
D_faiss, I_faiss = index.search(sentence_embeddings, K)

# Gather, per job, the set of labels retrieved by any of its sentences
job_predictions = {}
for job_id, neighbour_ids, similarities in zip(sentences_df['job_id'], I_faiss, D_faiss):
    labels_for_job = job_predictions.setdefault(job_id, set())

    for idx_pos, similarity in zip(neighbour_ids, similarities):
        # Skip negative ids and anything not strictly above the threshold
        if idx_pos < 0 or not similarity > T:
            continue
        labels_for_job.add(legacy_df['label'].iloc[idx_pos])

# Convert sets to lists
job_predictions = {job_id: list(labels) for job_id, labels in job_predictions.items()}

# --- Evaluation ---
print("Loading ground truth data...")
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
print("Columns in golden_dataset_labels.csv:", labels_df.columns.tolist())

# Lower-case and strip the display names, keep only rows annotated POSITIVE,
# and build {job_id: sorted list of unique display names}
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
unique_names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names_per_job.items()}

# Show the first ground-truth job next to its predictions as a sanity check
if ground_truth:
    sample_job_id = next(iter(ground_truth))
    sample_truth = ground_truth[sample_job_id]
    sample_predicted = job_predictions.get(sample_job_id, [])
    print(f"Sample job {sample_job_id}:")
    print(f"  Ground truth: {sample_truth}")
    print(f"  Predicted: {sample_predicted}")

# Calculate metrics
def normalize_attribute(attr):
    """Return *attr* lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Score every job that has ground truth: set-based precision / recall / F1
# between its normalized ground-truth names and normalized predicted labels.
output_data = []
for job_id, truth_labels in ground_truth.items():
    predicted_labels = job_predictions.get(job_id, [])
    gt = {normalize_attribute(attr) for attr in truth_labels}
    pred = {normalize_attribute(attr) for attr in predicted_labels}

    tp = len(gt & pred)
    fp = len(pred - gt)
    fn = len(gt - pred)

    # Each ratio falls back to 0 when its denominator is 0.
    precision = 0 if tp + fp == 0 else tp / (tp + fp)
    recall = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

    output_data.append({
        "job_id": job_id,
        "ground_truth": ", ".join(sorted(truth_labels)),
        "predicted": ", ".join(sorted(predicted_labels)),
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

output_df = pd.DataFrame(output_data)

# Write the per-job table to disk, creating the target directory if needed.
output_dir = "/home/jovyan/butterfly/src/notebooks/part3_evaluations"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "prediction_results_high_recall.csv")
output_df.to_csv(output_path, index=False)
print(f"CSV file saved to: {output_path}")

# Unweighted averages of the per-job scores.
avg_precision = output_df['precision'].mean()
avg_recall = output_df['recall'].mean()
avg_f1 = output_df['f1'].mean()

print(f"Final Results - Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}, Average F1: {avg_f1:.4f}")

# Second pass with a lower threshold, run only when the first pass missed the
# 0.9 average-recall target. It reuses the existing search results
# (D_faiss / I_faiss); no new search is issued.
if avg_recall < 0.9:
    print("Recall below target, adjusting threshold and rerunning...")

    T_aggressive = 0.2

    # Per job, gather the labels whose score exceeds the lower threshold.
    job_predictions_aggressive = {}
    for i, row in enumerate(sentences_df.itertuples()):
        job_id = row.job_id
        collected = job_predictions_aggressive.setdefault(job_id, set())
        for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
            if idx_pos < 0 or not similarity > T_aggressive:
                continue
            label = legacy_df['label'].iloc[idx_pos]
            collected.add(label)

    job_predictions_aggressive = {
        job_id: list(labels) for job_id, labels in job_predictions_aggressive.items()
    }

    # Same per-job scoring as the first pass, applied to the new predictions.
    output_data_aggressive = []
    for job_id, truth_labels in ground_truth.items():
        predicted_labels = job_predictions_aggressive.get(job_id, [])
        gt = {normalize_attribute(attr) for attr in truth_labels}
        pred = {normalize_attribute(attr) for attr in predicted_labels}

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)

        precision = 0 if tp + fp == 0 else tp / (tp + fp)
        recall = 0 if tp + fn == 0 else tp / (tp + fn)
        f1 = 0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

        output_data_aggressive.append({
            "job_id": job_id,
            "ground_truth": ", ".join(sorted(truth_labels)),
            "predicted": ", ".join(sorted(predicted_labels)),
            "precision": precision,
            "recall": recall,
            "f1": f1
        })

    output_df_aggressive = pd.DataFrame(output_data_aggressive)

    # Persist the second-pass table next to the first one.
    output_path_aggressive = os.path.join(output_dir, "prediction_results_very_high_recall.csv")
    output_df_aggressive.to_csv(output_path_aggressive, index=False)

    avg_precision_agg = output_df_aggressive['precision'].mean()
    avg_recall_agg = output_df_aggressive['recall'].mean()
    avg_f1_agg = output_df_aggressive['f1'].mean()

    print(f"Aggressive Results - Average Precision: {avg_precision_agg:.4f}, Average Recall: {avg_recall_agg:.4f}, Average F1: {avg_f1_agg:.4f}")

    # Report when the lower threshold reaches the recall target.
    if avg_recall_agg >= 0.9:
        print("Using aggressive approach to meet recall target")
        # You may want to save this as your final result if the recall target is crucial

Using GPU for SentenceTransformer: NVIDIA A10G
Loading SentenceTransformer model...
Loading definition data...
Encoding 24589 legacy attributes...
  Encoded 16000/24589 attributes
  Encoded 24589/24589 attributes
Creating FAISS index with dimension 768...
Loading sentence data...
Columns in golden_dataset_sentences.csv: ['Job_ID', 'Job_Text', 'Sentences']
Encoding 7943 sentences...
  Encoded 7943/7943 sentences
Performing FAISS search with K=200 and threshold=0.3...
Loading ground truth data...
Columns in golden_dataset_labels.csv: ['job_id', 'uuid', 'suid', 'display_name', 'text_snippet', 'start_index', 'labeling_task', 'label', 'isTitleSnippet']
Sample job 5577404285:
  Ground truth: ['ability to lift 50 pounds', 'computer skills', 'dealership', 'detailing', 'email', 'employee discount', 'full-time', 'motorcycle license', 'porter experience']
  Predicted: ['Paper production line team management (1-5 employees)', 'Car dealership', 'Gulfstream G650 operation', 'Arkansas', 'Internal job

In [37]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Decide where the SentenceTransformer model runs: the first GPU if one is
# available, otherwise the CPU.
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Path of the bi-encoder checkpoint evaluated in this cell.
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test/epoch_3"

# Instantiate the bi-encoder and move it to the GPU when one was selected.
model = SentenceTransformer(bi_encoder_model)
if device == "cuda:0":
    model = model.to(device)

# Read the legacy attribute table (tab-separated) from 'defs.tsv'.
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing labels/definitions become empty strings before concatenation.
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text embedded for each attribute: "<label>: <definition>".
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attributes batch by batch with normalized outputs and stack the
# per-batch arrays into one matrix.
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes), batch_size)
])

# Flat inner-product index; with normalized vectors the inner product is the
# cosine similarity.
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Read the evaluation sentences from 'golden_dataset_sentences.csv' and rename
# 'Job_ID' to 'job_id' for consistency with the rest of the cell.
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')
sentences_df = sentences_df.rename(columns={'Job_ID': 'job_id'})

# Embed the 'Sentences' column in batches (normalized) and stack the results.
sentence_texts = sentences_df['Sentences']
sentence_embeddings = np.vstack([
    model.encode(
        sentence_texts[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_df), batch_size)
])

# Retrieval settings tuned for recall: many candidates, permissive cut-off.
K = 200  # Number of candidates to retrieve per sentence
T = 0.3  # Score a candidate must exceed to be kept

# D_faiss holds the scores and I_faiss the matching row positions in the index,
# one row of K entries per sentence.
D_faiss, I_faiss = index.search(sentence_embeddings, K)

# Pull the label column out of pandas once: a scalar `.iloc` lookup for each of
# the len(sentences_df) * K candidates is far slower than indexing an array.
legacy_labels = legacy_df['label'].to_numpy()

# Per job, collect the union of labels that pass the threshold for any of the
# job's sentences.
job_predictions = {}
for i, row in enumerate(sentences_df.itertuples()):
    kept = job_predictions.setdefault(row.job_id, set())
    for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
        # FAISS reports -1 as the position when it has no neighbour to return.
        if idx_pos >= 0 and similarity > T:
            kept.add(legacy_labels[idx_pos])

# Convert sets to lists
job_predictions = {job_id: list(labels) for job_id, labels in job_predictions.items()}

# --- Evaluation ---
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Lower-case and trim the display names, then keep only rows marked POSITIVE.
lowered_names = labels_df['display_name'].str.lower()
labels_df['display_name'] = lowered_names.str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]

# Map each job_id to the sorted list of its distinct positive display names.
names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job: sorted(names) for job, names in names_per_job.items()}

# Calculate metrics
def normalize_attribute(attr):
    """Return *attr* lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Score every job that has ground truth: set-based precision / recall / F1
# between its normalized ground-truth names and normalized predicted labels.
output_data = []
for job_id, truth_labels in ground_truth.items():
    gt = {normalize_attribute(attr) for attr in truth_labels}
    pred = {normalize_attribute(attr) for attr in job_predictions.get(job_id, [])}

    tp = len(gt & pred)
    fp = len(pred - gt)
    fn = len(gt - pred)

    # Each ratio falls back to 0 when its denominator is 0.
    precision = 0 if tp + fp == 0 else tp / (tp + fp)
    recall = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

    output_data.append({
        "job_id": job_id,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

output_df = pd.DataFrame(output_data)

# Unweighted averages of the per-job scores.
avg_precision = output_df['precision'].mean()
avg_recall = output_df['recall'].mean()
avg_f1 = output_df['f1'].mean()

print(f"Results with T={T} - Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}, Average F1: {avg_f1:.4f}")

# Threshold cascade: when the first pass misses the 0.9 average-recall target,
# retry at T=0.2 and, if still short, at T=0.1.
#
# Both helpers reuse the search results already held in D_faiss / I_faiss, so a
# lower threshold can only admit candidates that are already among the K
# retrieved per sentence; it never triggers a new search.

# Label column as an array: avoids a scalar pandas `.iloc` lookup per candidate.
_legacy_labels = legacy_df['label'].to_numpy()


def _predict_labels_per_job(threshold):
    """Return {job_id: [labels]} keeping candidates that score above *threshold*.

    Candidates of all sentences belonging to a job are merged and
    de-duplicated.
    """
    per_job = {}
    for i, row in enumerate(sentences_df.itertuples()):
        kept = per_job.setdefault(row.job_id, set())
        for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
            # FAISS reports -1 as the position when it has no neighbour to return.
            if idx_pos >= 0 and similarity > threshold:
                kept.add(_legacy_labels[idx_pos])
    return {job_id: list(labels) for job_id, labels in per_job.items()}


def _score_jobs(predictions):
    """Return one {"job_id", "precision", "recall", "f1"} dict per ground-truth job."""
    rows = []
    for job_id in ground_truth:
        gt = set(normalize_attribute(attr) for attr in ground_truth[job_id])
        pred = set(normalize_attribute(attr) for attr in predictions.get(job_id, []))
        tp = len(gt.intersection(pred))
        fp = len(pred - gt)
        fn = len(gt - pred)
        # Each ratio falls back to 0 when its denominator is 0.
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        rows.append({
            "job_id": job_id,
            "precision": precision,
            "recall": recall,
            "f1": f1
        })
    return rows


if avg_recall < 0.9:
    # Try with an even lower threshold
    T_aggressive = 0.2

    job_predictions_aggressive = _predict_labels_per_job(T_aggressive)
    output_data_aggressive = _score_jobs(job_predictions_aggressive)
    output_df_aggressive = pd.DataFrame(output_data_aggressive)

    avg_precision_agg = output_df_aggressive['precision'].mean()
    avg_recall_agg = output_df_aggressive['recall'].mean()
    avg_f1_agg = output_df_aggressive['f1'].mean()

    print(f"Results with T={T_aggressive} - Average Precision: {avg_precision_agg:.4f}, Average Recall: {avg_recall_agg:.4f}, Average F1: {avg_f1_agg:.4f}")

    # If we need even more recall, try an extremely low threshold
    if avg_recall_agg < 0.9:
        T_extreme = 0.1

        job_predictions_extreme = _predict_labels_per_job(T_extreme)
        output_data_extreme = _score_jobs(job_predictions_extreme)
        output_df_extreme = pd.DataFrame(output_data_extreme)

        avg_precision_ext = output_df_extreme['precision'].mean()
        avg_recall_ext = output_df_extreme['recall'].mean()
        avg_f1_ext = output_df_extreme['f1'].mean()

        print(f"Results with T={T_extreme} - Average Precision: {avg_precision_ext:.4f}, Average Recall: {avg_recall_ext:.4f}, Average F1: {avg_f1_ext:.4f}")

Using GPU for SentenceTransformer: NVIDIA A10G
Results with T=0.3 - Average Precision: 0.0141, Average Recall: 0.7003, Average F1: 0.0275
Results with T=0.2 - Average Precision: 0.0141, Average Recall: 0.7003, Average F1: 0.0275
Results with T=0.1 - Average Precision: 0.0141, Average Recall: 0.7003, Average F1: 0.0275


In [40]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Decide where the SentenceTransformer model runs: the first GPU if one is
# available, otherwise the CPU.
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Path of the bi-encoder checkpoint evaluated in this cell.
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_optimized/epoch_3"

# Instantiate the bi-encoder and move it to the GPU when one was selected.
model = SentenceTransformer(bi_encoder_model)
if device == "cuda:0":
    model = model.to(device)

# Read the legacy attribute table (tab-separated) from 'defs.tsv'.
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing labels/definitions become empty strings before concatenation.
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text embedded for each attribute: "<label>: <definition>".
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attributes batch by batch with normalized outputs and stack the
# per-batch arrays into one matrix.
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes), batch_size)
])

# Flat inner-product index; with normalized vectors the inner product is the
# cosine similarity.
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Read the evaluation sentences from 'golden_dataset_sentences.csv' and rename
# 'Job_ID' to 'job_id' for consistency with the rest of the cell.
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')
sentences_df = sentences_df.rename(columns={'Job_ID': 'job_id'})

# Embed the 'Sentences' column in batches (normalized) and stack the results.
sentence_texts = sentences_df['Sentences']
sentence_embeddings = np.vstack([
    model.encode(
        sentence_texts[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_df), batch_size)
])

# Retrieval settings tuned for recall: many candidates, permissive cut-off.
K = 200  # Number of candidates to retrieve per sentence
T = 0.3  # Score a candidate must exceed to be kept

# D_faiss holds the scores and I_faiss the matching row positions in the index,
# one row of K entries per sentence.
D_faiss, I_faiss = index.search(sentence_embeddings, K)

# Pull the label column out of pandas once: a scalar `.iloc` lookup for each of
# the len(sentences_df) * K candidates is far slower than indexing an array.
legacy_labels = legacy_df['label'].to_numpy()

# Per job, collect the union of labels that pass the threshold for any of the
# job's sentences.
job_predictions = {}
for i, row in enumerate(sentences_df.itertuples()):
    kept = job_predictions.setdefault(row.job_id, set())
    for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
        # FAISS reports -1 as the position when it has no neighbour to return.
        if idx_pos >= 0 and similarity > T:
            kept.add(legacy_labels[idx_pos])

# Convert sets to lists
job_predictions = {job_id: list(labels) for job_id, labels in job_predictions.items()}

# --- Evaluation ---
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Lower-case and trim the display names, then keep only rows marked POSITIVE.
lowered_names = labels_df['display_name'].str.lower()
labels_df['display_name'] = lowered_names.str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]

# Map each job_id to the sorted list of its distinct positive display names.
names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job: sorted(names) for job, names in names_per_job.items()}

# Calculate metrics
def normalize_attribute(attr):
    """Return *attr* lower-cased with leading/trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Score every job that has ground truth: set-based precision / recall / F1
# between its normalized ground-truth names and normalized predicted labels.
output_data = []
for job_id, truth_labels in ground_truth.items():
    gt = {normalize_attribute(attr) for attr in truth_labels}
    pred = {normalize_attribute(attr) for attr in job_predictions.get(job_id, [])}

    tp = len(gt & pred)
    fp = len(pred - gt)
    fn = len(gt - pred)

    # Each ratio falls back to 0 when its denominator is 0.
    precision = 0 if tp + fp == 0 else tp / (tp + fp)
    recall = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

    output_data.append({
        "job_id": job_id,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

output_df = pd.DataFrame(output_data)

# Unweighted averages of the per-job scores.
avg_precision = output_df['precision'].mean()
avg_recall = output_df['recall'].mean()
avg_f1 = output_df['f1'].mean()

print(f"Results with T={T} - Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}, Average F1: {avg_f1:.4f}")

# Threshold cascade: when the first pass misses the 0.9 average-recall target,
# retry at T=0.2 and, if still short, at T=0.1.
#
# Both helpers reuse the search results already held in D_faiss / I_faiss, so a
# lower threshold can only admit candidates that are already among the K
# retrieved per sentence; it never triggers a new search.

# Label column as an array: avoids a scalar pandas `.iloc` lookup per candidate.
_legacy_labels = legacy_df['label'].to_numpy()


def _predict_labels_per_job(threshold):
    """Return {job_id: [labels]} keeping candidates that score above *threshold*.

    Candidates of all sentences belonging to a job are merged and
    de-duplicated.
    """
    per_job = {}
    for i, row in enumerate(sentences_df.itertuples()):
        kept = per_job.setdefault(row.job_id, set())
        for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
            # FAISS reports -1 as the position when it has no neighbour to return.
            if idx_pos >= 0 and similarity > threshold:
                kept.add(_legacy_labels[idx_pos])
    return {job_id: list(labels) for job_id, labels in per_job.items()}


def _score_jobs(predictions):
    """Return one {"job_id", "precision", "recall", "f1"} dict per ground-truth job."""
    rows = []
    for job_id in ground_truth:
        gt = set(normalize_attribute(attr) for attr in ground_truth[job_id])
        pred = set(normalize_attribute(attr) for attr in predictions.get(job_id, []))
        tp = len(gt.intersection(pred))
        fp = len(pred - gt)
        fn = len(gt - pred)
        # Each ratio falls back to 0 when its denominator is 0.
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        rows.append({
            "job_id": job_id,
            "precision": precision,
            "recall": recall,
            "f1": f1
        })
    return rows


if avg_recall < 0.9:
    # Try with an even lower threshold
    T_aggressive = 0.2

    job_predictions_aggressive = _predict_labels_per_job(T_aggressive)
    output_data_aggressive = _score_jobs(job_predictions_aggressive)
    output_df_aggressive = pd.DataFrame(output_data_aggressive)

    avg_precision_agg = output_df_aggressive['precision'].mean()
    avg_recall_agg = output_df_aggressive['recall'].mean()
    avg_f1_agg = output_df_aggressive['f1'].mean()

    print(f"Results with T={T_aggressive} - Average Precision: {avg_precision_agg:.4f}, Average Recall: {avg_recall_agg:.4f}, Average F1: {avg_f1_agg:.4f}")

    # If we need even more recall, try an extremely low threshold
    if avg_recall_agg < 0.9:
        T_extreme = 0.1

        job_predictions_extreme = _predict_labels_per_job(T_extreme)
        output_data_extreme = _score_jobs(job_predictions_extreme)
        output_df_extreme = pd.DataFrame(output_data_extreme)

        avg_precision_ext = output_df_extreme['precision'].mean()
        avg_recall_ext = output_df_extreme['recall'].mean()
        avg_f1_ext = output_df_extreme['f1'].mean()

        print(f"Results with T={T_extreme} - Average Precision: {avg_precision_ext:.4f}, Average Recall: {avg_recall_ext:.4f}, Average F1: {avg_f1_ext:.4f}")

Using GPU for SentenceTransformer: NVIDIA A10G
Results with T=0.3 - Average Precision: 0.0139, Average Recall: 0.7011, Average F1: 0.0270
Results with T=0.2 - Average Precision: 0.0136, Average Recall: 0.7061, Average F1: 0.0266
Results with T=0.1 - Average Precision: 0.0136, Average Recall: 0.7061, Average F1: 0.0266


In [42]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Decide where the SentenceTransformer model runs: the first GPU if one is
# available, otherwise the CPU.
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Path of the bi-encoder checkpoint evaluated in this cell.
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test/epoch_3"

# Instantiate the bi-encoder and move it to the GPU when one was selected.
model = SentenceTransformer(bi_encoder_model)
if device == "cuda:0":
    model = model.to(device)

# Read the legacy attribute table (tab-separated) from 'defs.tsv'.
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing labels/definitions become empty strings before concatenation.
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text embedded for each attribute: "<label>: <definition>".
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attributes batch by batch with normalized outputs and stack the
# per-batch arrays into one matrix.
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes), batch_size)
])

# Flat inner-product index; with normalized vectors the inner product is the
# cosine similarity.
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Read the evaluation sentences from 'golden_dataset_sentences.csv' and rename
# 'Job_ID' to 'job_id' for consistency with the rest of the cell.
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')
sentences_df = sentences_df.rename(columns={'Job_ID': 'job_id'})

# Embed the 'Sentences' column in batches (normalized) and stack the results.
sentence_texts = sentences_df['Sentences']
sentence_embeddings = np.vstack([
    model.encode(
        sentence_texts[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_df), batch_size)
])

# Function to get predictions with a specific threshold
def get_predictions_and_metrics(threshold):
    """Evaluate label retrieval at a single similarity threshold.

    Uses the notebook globals `index`, `sentence_embeddings`, `sentences_df`
    and `legacy_df`. The FAISS search and the ground-truth CSV load are
    repeated on every call.

    Args:
        threshold: A retrieved candidate is kept only if its score is strictly
            greater than this value.

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1, job_predictions)``: the
        three metrics are means over the jobs present in the ground truth, and
        ``job_predictions`` maps each job_id in `sentences_df` to the list of
        labels predicted for it.
    """
    # Fixed candidate budget per sentence, capped at the number of indexed
    # labels. K does not grow as the threshold drops, so once every retrieved
    # candidate already passes, lowering the threshold changes nothing.
    K = min(30, len(legacy_df))

    # Get search results
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Select passing candidates with one boolean mask and resolve labels via a
    # NumPy array instead of a scalar pandas `.iloc` lookup per candidate.
    # FAISS reports -1 as the position when it has no neighbour to return.
    labels_by_position = legacy_df['label'].to_numpy()
    keep = (I_faiss >= 0) & (D_faiss > threshold)

    job_predictions = {}
    for i, row in enumerate(sentences_df.itertuples()):
        kept = job_predictions.setdefault(row.job_id, set())
        kept.update(labels_by_position[I_faiss[i][keep[i]]])

    # Convert sets to lists
    job_predictions = {job_id: list(labels) for job_id, labels in job_predictions.items()}

    # Calculate metrics
    labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

    # Clean and filter ground truth labels
    labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
    positive_labels = labels_df[labels_df['label'] == 'POSITIVE']
    ground_truth = positive_labels.groupby('job_id')['display_name'].unique().apply(lambda x: sorted(list(x))).to_dict()

    def normalize_attribute(attr):
        """Return *attr* lower-cased with surrounding whitespace removed."""
        return attr.lower().strip()

    precisions = []
    recalls = []
    f1s = []

    for job_id in ground_truth:
        gt = set(normalize_attribute(attr) for attr in ground_truth[job_id])

        pred = set(normalize_attribute(attr) for attr in job_predictions.get(job_id, []))
        tp = len(gt.intersection(pred))
        fp = len(pred - gt)
        fn = len(gt - pred)
        # Each ratio falls back to 0 when its denominator is 0.
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_f1 = np.mean(f1s)

    # Print sample predictions for a job to verify threshold is working
    if ground_truth:
        sample_job_id = next(iter(ground_truth))
        pred_count = len(job_predictions.get(sample_job_id, []))
        print(f"Sample job prediction count with T={threshold}: {pred_count}")

    return avg_precision, avg_recall, avg_f1, job_predictions

# Sweep the thresholds from strict to permissive and record, per threshold,
# the averaged metrics together with the predictions that produced them.
thresholds = [0.7, 0.5, 0.4, 0.3, 0.2, 0.1]
results = {}
metric_names = ('precision', 'recall', 'f1', 'predictions')

for threshold in thresholds:
    print(f"\nTrying threshold: {threshold}")
    outcome = get_predictions_and_metrics(threshold)
    avg_precision, avg_recall, avg_f1, predictions = outcome
    results[threshold] = dict(zip(metric_names, outcome))
    print(f"Results with T={threshold} - Average Precision: {avg_precision:.4f}, Average Recall: {avg_recall:.4f}, Average F1: {avg_f1:.4f}")

# Pick the highest threshold whose average recall reaches 0.9; if none does,
# fall back to the threshold with the highest recall (first one wins on ties).
#
# The two cases are resolved separately on purpose. Recording the fallback in
# `best_threshold` while still scanning would stop any later fallback update
# and would reject a later, lower threshold that does reach the target.
best_threshold = None
best_recall = -1

meeting_target = [
    threshold for threshold, metrics in results.items() if metrics['recall'] >= 0.9
]
if meeting_target:
    best_threshold = max(meeting_target)
elif results:
    best_threshold = max(results, key=lambda threshold: results[threshold]['recall'])

if best_threshold is not None:
    best_recall = results[best_threshold]['recall']
    print(f"\nBest threshold: {best_threshold} with recall: {best_recall:.4f}")
    if best_recall < 0.9:
        print("Note: Unable to achieve 0.9 recall with any tested threshold. You may need to try enhancing your model or data.")
else:
    # Only reached when no threshold was evaluated at all.
    print("\nUnable to find a suitable threshold. Consider checking your data or model.")

Using GPU for SentenceTransformer: NVIDIA A10G

Trying threshold: 0.7
Sample job prediction count with T=0.7: 49
Results with T=0.7 - Average Precision: 0.0497, Average Recall: 0.3543, Average F1: 0.0825

Trying threshold: 0.5
Sample job prediction count with T=0.5: 138
Results with T=0.5 - Average Precision: 0.0405, Average Recall: 0.4383, Average F1: 0.0718

Trying threshold: 0.4
Sample job prediction count with T=0.4: 138
Results with T=0.4 - Average Precision: 0.0400, Average Recall: 0.4405, Average F1: 0.0711

Trying threshold: 0.3
Sample job prediction count with T=0.3: 138
Results with T=0.3 - Average Precision: 0.0400, Average Recall: 0.4405, Average F1: 0.0711

Trying threshold: 0.2


KeyboardInterrupt: 

e5_mnrl_DDP_test/epoch_3

In [43]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Decide where the SentenceTransformer model runs: the first GPU if one is
# available, otherwise the CPU.
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Path of the bi-encoder checkpoint evaluated in this cell.
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test/epoch_3"

# Instantiate the bi-encoder and move it to the GPU when one was selected.
model = SentenceTransformer(bi_encoder_model)
if device == "cuda:0":
    model = model.to(device)

# Read the legacy attribute table (tab-separated) from 'defs.tsv'.
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing labels/definitions become empty strings before concatenation.
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text embedded for each attribute: "<label>: <definition>".
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attributes batch by batch with normalized outputs and stack the
# per-batch arrays into one matrix.
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes), batch_size)
])

# Flat inner-product index; with normalized vectors the inner product is the
# cosine similarity.
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Read the evaluation sentences from 'golden_dataset_sentences.csv' and rename
# 'Job_ID' to 'job_id' for consistency with the rest of the cell.
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')
sentences_df = sentences_df.rename(columns={'Job_ID': 'job_id'})

# Embed the 'Sentences' column in batches (normalized) and stack the results.
sentence_texts = sentences_df['Sentences']
sentence_embeddings = np.vstack([
    model.encode(
        sentence_texts[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_df), batch_size)
])

# Load the labelled ground truth.
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Lower-case and trim the display names, then keep only rows marked POSITIVE.
lowered_names = labels_df['display_name'].str.lower()
labels_df['display_name'] = lowered_names.str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]

# Map each job_id to the sorted list of its distinct positive display names.
names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job: sorted(names) for job, names in names_per_job.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with surrounding whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Function to get predictions and metrics for a specific K and threshold
def get_metrics(K, threshold):
    """Score top-K retrieval against the ground truth at one similarity cut-off.

    Queries ``index`` with every row of ``sentence_embeddings``, keeps the hits
    whose score is strictly greater than ``threshold``, unions the matching
    ``legacy_df['label']`` values per job, and compares each job's predicted
    set with ``ground_truth`` after ``normalize_attribute``.

    Args:
        K: Number of neighbours requested from the index per sentence.
        threshold: Exclusive lower bound on the similarity of an accepted hit.

    Returns:
        ``(avg_precision, avg_recall, avg_f1)`` averaged over the jobs present
        in ``ground_truth`` (jobs that only appear in ``sentences_df`` are not
        scored). ``(0.0, 0.0, 0.0)`` when ``ground_truth`` is empty.
    """
    # np.mean([]) would return nan (with a RuntimeWarning); a nan recall never
    # beats the caller's running best, leaving its best config unset.
    if not ground_truth:
        return 0.0, 0.0, 0.0

    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out of pandas once; a per-hit ``.iloc`` lookup in
    # the innermost loop is needlessly slow.
    all_labels = legacy_df['label'].tolist()

    # Union the accepted labels of all sentences belonging to the same job
    job_predictions = {}
    for i, row in enumerate(sentences_df.itertuples()):
        predicted = job_predictions.setdefault(row.job_id, set())
        for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
            # Negative ids mark slots for which the index had no neighbour
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(all_labels[idx_pos])

    # Per-job precision / recall / F1, then a plain mean across jobs
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = {normalize_attribute(attr) for attr in job_predictions.get(job_id, ())}

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Grid of retrieval depths (K) and similarity cut-offs to evaluate
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]

# One record per (K, threshold) pair, in evaluation order
results = []
for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics(K, threshold)
        results.append(dict(
            K=K,
            Threshold=threshold,
            Precision=avg_precision,
            Recall=avg_recall,
            F1=avg_f1,
        ))
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Keep the first record that reaches the highest recall
best_config, best_recall = None, -1
for candidate in results:
    if candidate['Recall'] > best_recall:
        best_config, best_recall = candidate, candidate['Recall']

print("\n=== Best Configuration for Recall ===")
print(f"K={best_config['K']}, T={best_config['Threshold']}")
print(f"Precision={best_config['Precision']:.4f}, Recall={best_config['Recall']:.4f}, F1={best_config['F1']:.4f}")

# Below the 0.9 recall target: print follow-up ideas
if best_recall < 0.9:
    for message in (
        "\nNone of the tested configurations achieved 0.9 recall.",
        "Suggestions for improving recall:",
        "1. Try even lower thresholds (e.g., 0.4, 0.3, 0.2)",
        "2. Try higher K values (e.g., 300, 500, 1000)",
        "3. Consider aggregating predictions from multiple threshold levels",
        "4. Examine specific jobs with low recall to identify patterns",
    ):
        print(message)

Using GPU for SentenceTransformer: NVIDIA A10G

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.1655, Recall=0.0888, F1=0.0988
K=30, T=0.85: Precision=0.1091, Recall=0.1625, F1=0.1130
K=30, T=0.8: Precision=0.0741, Recall=0.2352, F1=0.1024
K=30, T=0.7: Precision=0.0497, Recall=0.3543, F1=0.0825
K=30, T=0.5: Precision=0.0405, Recall=0.4383, F1=0.0718

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.1642, Recall=0.0894, F1=0.0983
K=50, T=0.85: Precision=0.1061, Recall=0.1664, F1=0.1110
K=50, T=0.8: Precision=0.0691, Recall=0.2468, F1=0.0974
K=50, T=0.7: Precision=0.0425, Recall=0.3891, F1=0.0724
K=50, T=0.5: Precision=0.0313, Recall=0.5039, F1=0.0575

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.1642, Recall=0.0894, F1=0.0983
K=80, T=0.85: Precision=0.1052, Recall=0.1678, F1=0.1104
K=80, T=0.8: Precision=0.0668, Recall=0.2523, F1=0.0949
K=80, T=0.7: Precision=0.0374, Recall=0.4166, F1=0.0648
K=80, T=0.5: Precision=0.0247, Recall=0.5645, F1=0.0465

=== Testing with K=100 ===
K

e5_mnrl_DDP_test/epoch_2

In [48]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Choose the device the SentenceTransformer model will run on
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Checkpoint under evaluation
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test/epoch_2"

# Load the bi-encoder and move it to the GPU when one was found above
model = SentenceTransformer(bi_encoder_model)
if device == "cuda:0":
    model = model.to(device)

# Legacy attribute catalogue: one row per attribute with 'label' and 'definition'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing text becomes an empty string so the concatenation below never sees NaN
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text that gets embedded for each attribute: "<label>: <definition>"
# NOTE(review): no "query: "/"passage: " prefix is added here or to the sentences;
# E5-style checkpoints are usually trained with them - confirm against the training setup.
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attributes chunk by chunk (L2-normalised) and stack into one matrix
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes.iloc[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes), batch_size)
])

# Flat inner-product index; on normalised vectors the score is the cosine similarity
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Golden-set sentences; 'Job_ID' is renamed so it matches the labels file
sentences_df = pd.read_csv(
    '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'
).rename(columns={'Job_ID': 'job_id'})

# Embed the sentences with the same chunking and normalisation
sentence_embeddings = np.vstack([
    model.encode(
        sentences_df['Sentences'].iloc[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_df), batch_size)
])

# Ground truth: for every job, the sorted unique lower-cased names labelled POSITIVE
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
unique_names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names_per_job.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with surrounding whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Function to get predictions and metrics for a specific K and threshold
def get_metrics(K, threshold):
    """Score top-K retrieval against the ground truth at one similarity cut-off.

    Queries ``index`` with every row of ``sentence_embeddings``, keeps the hits
    whose score is strictly greater than ``threshold``, unions the matching
    ``legacy_df['label']`` values per job, and compares each job's predicted
    set with ``ground_truth`` after ``normalize_attribute``.

    Args:
        K: Number of neighbours requested from the index per sentence.
        threshold: Exclusive lower bound on the similarity of an accepted hit.

    Returns:
        ``(avg_precision, avg_recall, avg_f1)`` averaged over the jobs present
        in ``ground_truth`` (jobs that only appear in ``sentences_df`` are not
        scored). ``(0.0, 0.0, 0.0)`` when ``ground_truth`` is empty.
    """
    # np.mean([]) would return nan (with a RuntimeWarning); a nan recall never
    # beats the caller's running best, leaving its best config unset.
    if not ground_truth:
        return 0.0, 0.0, 0.0

    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out of pandas once; a per-hit ``.iloc`` lookup in
    # the innermost loop is needlessly slow.
    all_labels = legacy_df['label'].tolist()

    # Union the accepted labels of all sentences belonging to the same job
    job_predictions = {}
    for i, row in enumerate(sentences_df.itertuples()):
        predicted = job_predictions.setdefault(row.job_id, set())
        for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
            # Negative ids mark slots for which the index had no neighbour
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(all_labels[idx_pos])

    # Per-job precision / recall / F1, then a plain mean across jobs
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = {normalize_attribute(attr) for attr in job_predictions.get(job_id, ())}

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Grid of retrieval depths (K) and similarity cut-offs to evaluate
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]

# One record per (K, threshold) pair, in evaluation order
results = []
for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics(K, threshold)
        results.append(dict(
            K=K,
            Threshold=threshold,
            Precision=avg_precision,
            Recall=avg_recall,
            F1=avg_f1,
        ))
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Keep the first record that reaches the highest recall
best_config, best_recall = None, -1
for candidate in results:
    if candidate['Recall'] > best_recall:
        best_config, best_recall = candidate, candidate['Recall']

print("\n=== Best Configuration for Recall ===")
print(f"K={best_config['K']}, T={best_config['Threshold']}")
print(f"Precision={best_config['Precision']:.4f}, Recall={best_config['Recall']:.4f}, F1={best_config['F1']:.4f}")

# Below the 0.9 recall target: print follow-up ideas
if best_recall < 0.9:
    for message in (
        "\nNone of the tested configurations achieved 0.9 recall.",
        "Suggestions for improving recall:",
        "1. Try even lower thresholds (e.g., 0.4, 0.3, 0.2)",
        "2. Try higher K values (e.g., 300, 500, 1000)",
        "3. Consider aggregating predictions from multiple threshold levels",
        "4. Examine specific jobs with low recall to identify patterns",
    ):
        print(message)

Using GPU for SentenceTransformer: NVIDIA A10G

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.1120, Recall=0.1194, F1=0.1002
K=30, T=0.85: Precision=0.0752, Recall=0.1979, F1=0.0946
K=30, T=0.8: Precision=0.0568, Recall=0.2689, F1=0.0881
K=30, T=0.7: Precision=0.0446, Recall=0.3686, F1=0.0753
K=30, T=0.5: Precision=0.0390, Recall=0.4248, F1=0.0695

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.1112, Recall=0.1209, F1=0.1000
K=50, T=0.85: Precision=0.0705, Recall=0.2067, F1=0.0905
K=50, T=0.8: Precision=0.0505, Recall=0.2889, F1=0.0805
K=50, T=0.7: Precision=0.0372, Recall=0.4118, F1=0.0643
K=50, T=0.5: Precision=0.0302, Recall=0.4943, F1=0.0558

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.1110, Recall=0.1211, F1=0.0998
K=80, T=0.85: Precision=0.0692, Recall=0.2090, F1=0.0890
K=80, T=0.8: Precision=0.0459, Recall=0.3006, F1=0.0743
K=80, T=0.7: Precision=0.0317, Recall=0.4459, F1=0.0557
K=80, T=0.5: Precision=0.0238, Recall=0.5576, F1=0.0450

=== Testing with K=100 ===
K

In [50]:
# COSINE SIMILARITY

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Choose the device the SentenceTransformer model will run on
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Checkpoint under evaluation
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test/epoch_2"

# Load the bi-encoder and move it to the GPU when one was found above
model = SentenceTransformer(bi_encoder_model)
if device == "cuda:0":
    model = model.to(device)

# Legacy attribute catalogue: one row per attribute with 'label' and 'definition'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing text becomes an empty string so the concatenation below never sees NaN
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text that gets embedded for each attribute: "<label>: <definition>"
# NOTE(review): no "query: "/"passage: " prefix is added here or to the sentences;
# E5-style checkpoints are usually trained with them - confirm against the training setup.
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attributes chunk by chunk (L2-normalised) and stack into one matrix
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes.iloc[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes), batch_size)
])

# Golden-set sentences; 'Job_ID' is renamed so it matches the labels file
sentences_df = pd.read_csv(
    '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'
).rename(columns={'Job_ID': 'job_id'})

# Embed the sentences with the same chunking and normalisation
sentence_embeddings = np.vstack([
    model.encode(
        sentences_df['Sentences'].iloc[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_df), batch_size)
])

# Ground truth: for every job, the sorted unique lower-cased names labelled POSITIVE
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
unique_names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names_per_job.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with surrounding whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Function to get predictions and metrics for a specific K and threshold
def get_metrics(K, threshold):
    """Score top-K cosine retrieval against the ground truth at one cut-off.

    Computes the cosine similarity of every row of ``sentence_embeddings``
    against every row of ``legacy_embeddings``, takes the K highest-scoring
    attributes per sentence, keeps those strictly above ``threshold``, unions
    the matching ``legacy_df['label']`` values per job, and compares each job's
    predicted set with ``ground_truth`` after ``normalize_attribute``.

    Args:
        K: Number of top-scoring attributes considered per sentence. Values
            <= 0 select nothing.
        threshold: Exclusive lower bound on the similarity of an accepted hit.

    Returns:
        ``(avg_precision, avg_recall, avg_f1)`` averaged over the jobs present
        in ``ground_truth`` (jobs that only appear in ``sentences_df`` are not
        scored). ``(0.0, 0.0, 0.0)`` when ``ground_truth`` is empty.
    """
    # np.mean([]) would return nan (with a RuntimeWarning); a nan recall never
    # beats the caller's running best, leaving its best config unset.
    if not ground_truth:
        return 0.0, 0.0, 0.0

    # NOTE: this matrix does not depend on K or threshold, so it is rebuilt
    # identically on every call of a parameter sweep.
    similarity_matrix = cosine_similarity(sentence_embeddings, legacy_embeddings)

    # Pull the label column out of pandas once; a per-hit ``.iloc`` lookup in
    # the innermost loop is needlessly slow.
    all_labels = legacy_df['label'].tolist()

    # Union the accepted labels of all sentences belonging to the same job
    job_predictions = {}
    for i, row in enumerate(sentences_df.itertuples()):
        predicted = job_predictions.setdefault(row.job_id, set())

        # Guard the slice below: ``[-0:]`` would select every column and a
        # negative K an arbitrary tail, instead of "no neighbours".
        if K <= 0:
            continue

        row_scores = similarity_matrix[i]
        # Indices of the K largest scores, best first
        top_k_indices = np.argsort(row_scores)[-K:][::-1]

        for idx in top_k_indices:
            if row_scores[idx] > threshold:
                predicted.add(all_labels[idx])

    # Per-job precision / recall / F1, then a plain mean across jobs
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = {normalize_attribute(attr) for attr in job_predictions.get(job_id, ())}

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Grid of retrieval depths (K) and similarity cut-offs to evaluate
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]

# One record per (K, threshold) pair, in evaluation order
results = []
for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics(K, threshold)
        results.append(dict(
            K=K,
            Threshold=threshold,
            Precision=avg_precision,
            Recall=avg_recall,
            F1=avg_f1,
        ))
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Keep the first record that reaches the highest recall
best_config, best_recall = None, -1
for candidate in results:
    if candidate['Recall'] > best_recall:
        best_config, best_recall = candidate, candidate['Recall']

print("\n=== Best Configuration for Recall ===")
print(f"K={best_config['K']}, T={best_config['Threshold']}")
print(f"Precision={best_config['Precision']:.4f}, Recall={best_config['Recall']:.4f}, F1={best_config['F1']:.4f}")

# Below the 0.9 recall target: print follow-up ideas
if best_recall < 0.9:
    for message in (
        "\nNone of the tested configurations achieved 0.9 recall.",
        "Suggestions for improving recall:",
        "1. Try even lower thresholds (e.g., 0.4, 0.3, 0.2)",
        "2. Try higher K values (e.g., 300, 500, 1000)",
        "3. Consider aggregating predictions from multiple threshold levels",
        "4. Examine specific jobs with low recall to identify patterns",
    ):
        print(message)

Using GPU for SentenceTransformer: NVIDIA A10G

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.1120, Recall=0.1194, F1=0.1002
K=30, T=0.85: Precision=0.0752, Recall=0.1979, F1=0.0946
K=30, T=0.8: Precision=0.0568, Recall=0.2689, F1=0.0881
K=30, T=0.7: Precision=0.0446, Recall=0.3686, F1=0.0753
K=30, T=0.5: Precision=0.0390, Recall=0.4248, F1=0.0695

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.1112, Recall=0.1209, F1=0.1000
K=50, T=0.85: Precision=0.0705, Recall=0.2067, F1=0.0905
K=50, T=0.8: Precision=0.0505, Recall=0.2889, F1=0.0805
K=50, T=0.7: Precision=0.0372, Recall=0.4118, F1=0.0643
K=50, T=0.5: Precision=0.0302, Recall=0.4943, F1=0.0558

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.1110, Recall=0.1211, F1=0.0998
K=80, T=0.85: Precision=0.0692, Recall=0.2090, F1=0.0890
K=80, T=0.8: Precision=0.0459, Recall=0.3006, F1=0.0743
K=80, T=0.7: Precision=0.0317, Recall=0.4459, F1=0.0557
K=80, T=0.5: Precision=0.0238, Recall=0.5576, F1=0.0450

=== Testing with K=100 ===
K

20250408_173921_bi_encoder_finetuned

In [44]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Choose the device the SentenceTransformer model will run on
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Checkpoint under evaluation
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/20250408_173921_bi_encoder_finetuned"

# Load the bi-encoder and move it to the GPU when one was found above
model = SentenceTransformer(bi_encoder_model)
if device == "cuda:0":
    model = model.to(device)

# Legacy attribute catalogue: one row per attribute with 'label' and 'definition'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing text becomes an empty string so the concatenation below never sees NaN
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text that gets embedded for each attribute: "<label>: <definition>"
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attributes chunk by chunk (L2-normalised) and stack into one matrix
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes.iloc[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes), batch_size)
])

# Flat inner-product index; on normalised vectors the score is the cosine similarity
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Golden-set sentences; 'Job_ID' is renamed so it matches the labels file
sentences_df = pd.read_csv(
    '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'
).rename(columns={'Job_ID': 'job_id'})

# Embed the sentences with the same chunking and normalisation
sentence_embeddings = np.vstack([
    model.encode(
        sentences_df['Sentences'].iloc[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_df), batch_size)
])

# Ground truth: for every job, the sorted unique lower-cased names labelled POSITIVE
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
unique_names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names_per_job.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with surrounding whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Function to get predictions and metrics for a specific K and threshold
def get_metrics(K, threshold):
    """Score top-K retrieval against the ground truth at one similarity cut-off.

    Queries ``index`` with every row of ``sentence_embeddings``, keeps the hits
    whose score is strictly greater than ``threshold``, unions the matching
    ``legacy_df['label']`` values per job, and compares each job's predicted
    set with ``ground_truth`` after ``normalize_attribute``.

    Args:
        K: Number of neighbours requested from the index per sentence.
        threshold: Exclusive lower bound on the similarity of an accepted hit.

    Returns:
        ``(avg_precision, avg_recall, avg_f1)`` averaged over the jobs present
        in ``ground_truth`` (jobs that only appear in ``sentences_df`` are not
        scored). ``(0.0, 0.0, 0.0)`` when ``ground_truth`` is empty.
    """
    # np.mean([]) would return nan (with a RuntimeWarning); a nan recall never
    # beats the caller's running best, leaving its best config unset.
    if not ground_truth:
        return 0.0, 0.0, 0.0

    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out of pandas once; a per-hit ``.iloc`` lookup in
    # the innermost loop is needlessly slow.
    all_labels = legacy_df['label'].tolist()

    # Union the accepted labels of all sentences belonging to the same job
    job_predictions = {}
    for i, row in enumerate(sentences_df.itertuples()):
        predicted = job_predictions.setdefault(row.job_id, set())
        for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
            # Negative ids mark slots for which the index had no neighbour
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(all_labels[idx_pos])

    # Per-job precision / recall / F1, then a plain mean across jobs
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = {normalize_attribute(attr) for attr in job_predictions.get(job_id, ())}

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Grid of retrieval depths (K) and similarity cut-offs to evaluate
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]

# One record per (K, threshold) pair, in evaluation order
results = []
for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics(K, threshold)
        results.append(dict(
            K=K,
            Threshold=threshold,
            Precision=avg_precision,
            Recall=avg_recall,
            F1=avg_f1,
        ))
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Keep the first record that reaches the highest recall
best_config, best_recall = None, -1
for candidate in results:
    if candidate['Recall'] > best_recall:
        best_config, best_recall = candidate, candidate['Recall']

print("\n=== Best Configuration for Recall ===")
print(f"K={best_config['K']}, T={best_config['Threshold']}")
print(f"Precision={best_config['Precision']:.4f}, Recall={best_config['Recall']:.4f}, F1={best_config['F1']:.4f}")

# Below the 0.9 recall target: print follow-up ideas
if best_recall < 0.9:
    for message in (
        "\nNone of the tested configurations achieved 0.9 recall.",
        "Suggestions for improving recall:",
        "1. Try even lower thresholds (e.g., 0.4, 0.3, 0.2)",
        "2. Try higher K values (e.g., 300, 500, 1000)",
        "3. Consider aggregating predictions from multiple threshold levels",
        "4. Examine specific jobs with low recall to identify patterns",
    ):
        print(message)

Using GPU for SentenceTransformer: NVIDIA A10G

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.1784, Recall=0.0105, F1=0.0186
K=30, T=0.85: Precision=0.2715, Recall=0.0334, F1=0.0540
K=30, T=0.8: Precision=0.2350, Recall=0.0684, F1=0.0928
K=30, T=0.7: Precision=0.1415, Recall=0.1668, F1=0.1329
K=30, T=0.5: Precision=0.0442, Recall=0.3797, F1=0.0764

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.1784, Recall=0.0105, F1=0.0186
K=50, T=0.85: Precision=0.2715, Recall=0.0334, F1=0.0540
K=50, T=0.8: Precision=0.2346, Recall=0.0685, F1=0.0927
K=50, T=0.7: Precision=0.1390, Recall=0.1689, F1=0.1312
K=50, T=0.5: Precision=0.0359, Recall=0.4146, F1=0.0641

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.1784, Recall=0.0105, F1=0.0186
K=80, T=0.85: Precision=0.2715, Recall=0.0334, F1=0.0540
K=80, T=0.8: Precision=0.2346, Recall=0.0685, F1=0.0927
K=80, T=0.7: Precision=0.1384, Recall=0.1694, F1=0.1307
K=80, T=0.5: Precision=0.0307, Recall=0.4368, F1=0.0557

=== Testing with K=100 ===
K

e5_mnrl_DDP_test_continued/continued_epoch_1

In [45]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Choose the device the SentenceTransformer model will run on
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Checkpoint under evaluation
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_DDP_test_continued/continued_epoch_1"

# Load the bi-encoder and move it to the GPU when one was found above
model = SentenceTransformer(bi_encoder_model)
if device == "cuda:0":
    model = model.to(device)

# Legacy attribute catalogue: one row per attribute with 'label' and 'definition'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing text becomes an empty string so the concatenation below never sees NaN
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text that gets embedded for each attribute: "<label>: <definition>"
# NOTE(review): no "query: "/"passage: " prefix is added here or to the sentences;
# E5-style checkpoints are usually trained with them - confirm against the training setup.
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attributes chunk by chunk (L2-normalised) and stack into one matrix
batch_size = 128
legacy_embeddings = np.vstack([
    model.encode(
        legacy_attributes.iloc[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(legacy_attributes), batch_size)
])

# Flat inner-product index; on normalised vectors the score is the cosine similarity
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Golden-set sentences; 'Job_ID' is renamed so it matches the labels file
sentences_df = pd.read_csv(
    '/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv'
).rename(columns={'Job_ID': 'job_id'})

# Embed the sentences with the same chunking and normalisation
sentence_embeddings = np.vstack([
    model.encode(
        sentences_df['Sentences'].iloc[start:start + batch_size].tolist(),
        normalize_embeddings=True,
    )
    for start in range(0, len(sentences_df), batch_size)
])

# Ground truth: for every job, the sorted unique lower-cased names labelled POSITIVE
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
unique_names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = {job_id: sorted(names) for job_id, names in unique_names_per_job.items()}

def normalize_attribute(attr):
    """Return *attr* lower-cased with surrounding whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Function to get predictions and metrics for a specific K and threshold
def get_metrics(K, threshold):
    """Score top-K retrieval against the ground truth at one similarity cut-off.

    Queries ``index`` with every row of ``sentence_embeddings``, keeps the hits
    whose score is strictly greater than ``threshold``, unions the matching
    ``legacy_df['label']`` values per job, and compares each job's predicted
    set with ``ground_truth`` after ``normalize_attribute``.

    Args:
        K: Number of neighbours requested from the index per sentence.
        threshold: Exclusive lower bound on the similarity of an accepted hit.

    Returns:
        ``(avg_precision, avg_recall, avg_f1)`` averaged over the jobs present
        in ``ground_truth`` (jobs that only appear in ``sentences_df`` are not
        scored). ``(0.0, 0.0, 0.0)`` when ``ground_truth`` is empty.
    """
    # np.mean([]) would return nan (with a RuntimeWarning); a nan recall never
    # beats the caller's running best, leaving its best config unset.
    if not ground_truth:
        return 0.0, 0.0, 0.0

    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Pull the label column out of pandas once; a per-hit ``.iloc`` lookup in
    # the innermost loop is needlessly slow.
    all_labels = legacy_df['label'].tolist()

    # Union the accepted labels of all sentences belonging to the same job
    job_predictions = {}
    for i, row in enumerate(sentences_df.itertuples()):
        predicted = job_predictions.setdefault(row.job_id, set())
        for idx_pos, similarity in zip(I_faiss[i], D_faiss[i]):
            # Negative ids mark slots for which the index had no neighbour
            if idx_pos >= 0 and similarity > threshold:
                predicted.add(all_labels[idx_pos])

    # Per-job precision / recall / F1, then a plain mean across jobs
    precisions, recalls, f1s = [], [], []
    for job_id, gt_labels in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_labels}
        pred = {normalize_attribute(attr) for attr in job_predictions.get(job_id, ())}

        tp = len(gt & pred)
        fp = len(pred - gt)
        fn = len(gt - pred)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# Grid of retrieval depths (K) and similarity cut-offs to evaluate
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]

# One record per (K, threshold) pair, in evaluation order
results = []
for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics(K, threshold)
        results.append(dict(
            K=K,
            Threshold=threshold,
            Precision=avg_precision,
            Recall=avg_recall,
            F1=avg_f1,
        ))
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Keep the first record that reaches the highest recall
best_config, best_recall = None, -1
for candidate in results:
    if candidate['Recall'] > best_recall:
        best_config, best_recall = candidate, candidate['Recall']

print("\n=== Best Configuration for Recall ===")
print(f"K={best_config['K']}, T={best_config['Threshold']}")
print(f"Precision={best_config['Precision']:.4f}, Recall={best_config['Recall']:.4f}, F1={best_config['F1']:.4f}")

# Below the 0.9 recall target: print follow-up ideas
if best_recall < 0.9:
    for message in (
        "\nNone of the tested configurations achieved 0.9 recall.",
        "Suggestions for improving recall:",
        "1. Try even lower thresholds (e.g., 0.4, 0.3, 0.2)",
        "2. Try higher K values (e.g., 300, 500, 1000)",
        "3. Consider aggregating predictions from multiple threshold levels",
        "4. Examine specific jobs with low recall to identify patterns",
    ):
        print(message)

Using GPU for SentenceTransformer: NVIDIA A10G

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.0154, Recall=0.1746, F1=0.0277
K=30, T=0.85: Precision=0.0154, Recall=0.1999, F1=0.0280
K=30, T=0.8: Precision=0.0154, Recall=0.2065, F1=0.0280
K=30, T=0.7: Precision=0.0153, Recall=0.2085, F1=0.0279
K=30, T=0.5: Precision=0.0153, Recall=0.2087, F1=0.0279

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.0130, Recall=0.2090, F1=0.0241
K=50, T=0.85: Precision=0.0130, Recall=0.2466, F1=0.0243
K=50, T=0.8: Precision=0.0129, Recall=0.2572, F1=0.0242
K=50, T=0.7: Precision=0.0128, Recall=0.2603, F1=0.0240
K=50, T=0.5: Precision=0.0128, Recall=0.2606, F1=0.0240

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.0111, Recall=0.2411, F1=0.0209
K=80, T=0.85: Precision=0.0109, Recall=0.2944, F1=0.0208
K=80, T=0.8: Precision=0.0108, Recall=0.3107, F1=0.0207
K=80, T=0.7: Precision=0.0107, Recall=0.3156, F1=0.0205
K=80, T=0.5: Precision=0.0107, Recall=0.3160, F1=0.0205

=== Testing with K=100 ===
K

e5_mnrl_optimized/epoch_3

In [46]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import torch

# Decide where the SentenceTransformer runs: first CUDA device if present, else CPU
if not torch.cuda.is_available():
    print("No GPUs available for SentenceTransformer, using CPU")
    device = "cpu"
else:
    print(f"Using GPU for SentenceTransformer: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"

# Checkpoint directory of the bi-encoder under evaluation
bi_encoder_model = "/home/jovyan/butterfly/src/notebooks/e5_mnrl_optimized/epoch_3"

# Instantiate the bi-encoder; it is moved explicitly only when a GPU was selected
model = SentenceTransformer(bi_encoder_model)
if device != "cpu":
    model = model.to(device)

# Legacy attribute catalogue, read from the tab-separated 'defs.tsv'
legacy_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/defs.tsv', sep='\t')

# Missing labels/definitions become empty strings so the concatenation below
# never produces NaN
for text_column in ('label', 'definition'):
    legacy_df[text_column] = legacy_df[text_column].fillna('')

# Text embedded for every attribute, formatted as "label: definition"
legacy_attributes = legacy_df['label'] + ": " + legacy_df['definition']

# Embed the attribute texts chunk by chunk with normalized output vectors
batch_size = 128
legacy_texts = legacy_attributes.tolist()
legacy_embeddings = np.vstack([
    model.encode(legacy_texts[start:start + batch_size], normalize_embeddings=True)
    for start in range(0, len(legacy_texts), batch_size)
])

# Flat inner-product FAISS index over the attribute embeddings; with
# normalized vectors the inner product acts as cosine similarity
dimension = legacy_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(legacy_embeddings)

# Sentences of the golden dataset; 'Job_ID' is renamed to 'job_id' for consistency
sentences_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_sentences.csv')
sentences_df = sentences_df.rename(columns={'Job_ID': 'job_id'})

# Embed the sentences with the same chunk size and normalization
sentence_texts = sentences_df['Sentences'].tolist()
sentence_embeddings = np.vstack([
    model.encode(sentence_texts[start:start + batch_size], normalize_embeddings=True)
    for start in range(0, len(sentence_texts), batch_size)
])

# Ground truth annotations
labels_df = pd.read_csv('/home/jovyan/butterfly/src/notebooks/golden_dataset_labels.csv')

# Lower-case and trim the display names, keep only rows marked POSITIVE, and
# map every job_id to the sorted list of its distinct display names
labels_df['display_name'] = labels_df['display_name'].str.lower().str.strip()
is_positive = labels_df['label'] == 'POSITIVE'
positive_labels = labels_df[is_positive]
names_per_job = positive_labels.groupby('job_id')['display_name'].unique()
ground_truth = names_per_job.apply(lambda names: sorted(names)).to_dict()

def normalize_attribute(attr):
    """Return *attr* lower-cased with leading and trailing whitespace removed."""
    lowered = attr.lower()
    return lowered.strip()

# Function to get predictions and metrics for a specific K and threshold
def get_metrics(K, threshold):
    """Score label retrieval for one (K, threshold) setting.

    Queries the module-level FAISS ``index`` with every row of
    ``sentence_embeddings``, keeps the hits whose similarity is strictly
    greater than ``threshold``, and pools the corresponding
    ``legacy_df['label']`` values per ``job_id`` (taken row by row from
    ``sentences_df``). The pooled predictions are then compared with
    ``ground_truth`` job by job, after ``normalize_attribute`` has been
    applied to both sides.

    Args:
        K: Number of nearest neighbours requested per sentence.
        threshold: Similarity cut-off; a hit is kept only when its score is
            strictly above this value.

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``: the unweighted mean of
        the per-job precision, recall and F1 over the jobs in
        ``ground_truth``. A job without predictions contributes zeros. When
        ``ground_truth`` is empty all three values are ``0.0``.
    """
    # Nearest-neighbour search: D holds similarities, I holds row positions
    # into legacy_df
    D_faiss, I_faiss = index.search(sentence_embeddings, K)

    # Materialise the label column and the job ids once; a pandas scalar
    # lookup per hit is needlessly slow inside the rows x K loop
    label_lookup = legacy_df['label'].to_numpy()
    job_ids = sentences_df['job_id'].tolist()

    # A hit counts when its id is valid (negative ids are skipped; FAISS
    # reports -1 when fewer than K neighbours exist) and its similarity
    # exceeds the threshold
    keep_mask = (I_faiss >= 0) & (D_faiss > threshold)

    # Pool the surviving labels of all sentences belonging to the same job
    job_predictions = {}
    for job_id, neighbour_ids, row_mask in zip(job_ids, I_faiss, keep_mask):
        job_predictions.setdefault(job_id, set()).update(
            label_lookup[neighbour_ids[row_mask]]
        )

    # Per-job precision / recall / F1 against the ground truth
    precisions = []
    recalls = []
    f1s = []

    for job_id, gt_attrs in ground_truth.items():
        gt = {normalize_attribute(attr) for attr in gt_attrs}
        pred = {normalize_attribute(attr) for attr in job_predictions.get(job_id, ())}

        tp = len(gt & pred)
        fp = len(pred) - tp
        fn = len(gt) - tp

        # Undefined ratios (zero denominator) are reported as 0
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # np.mean of an empty list yields NaN plus a RuntimeWarning; report zeros
    # instead, in line with the zero-denominator convention above
    if not precisions:
        return 0.0, 0.0, 0.0

    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_f1 = np.mean(f1s)

    return avg_precision, avg_recall, avg_f1

# Grid of retrieval depths (K) and similarity thresholds to evaluate
K_values = [30, 50, 80, 100, 120, 200]
thresholds = [0.9, 0.85, 0.8, 0.7, 0.5]

# One record per (K, threshold) pair, in evaluation order
results = []

for K in K_values:
    print(f"\n=== Testing with K={K} ===")
    for threshold in thresholds:
        avg_precision, avg_recall, avg_f1 = get_metrics(K, threshold)
        record = {
            'K': K,
            'Threshold': threshold,
            'Precision': avg_precision,
            'Recall': avg_recall,
            'F1': avg_f1,
        }
        results.append(record)
        print(f"K={K}, T={threshold}: Precision={avg_precision:.4f}, Recall={avg_recall:.4f}, F1={avg_f1:.4f}")

# Pick the record with the highest recall; only a strictly greater recall
# replaces the current best, so the earliest record wins ties
best_recall, best_config = -1, None

for result in results:
    if not result['Recall'] > best_recall:
        continue
    best_config = result
    best_recall = result['Recall']

print("\n=== Best Configuration for Recall ===")
print(f"K={best_config['K']}, T={best_config['Threshold']}")
print(f"Precision={best_config['Precision']:.4f}, Recall={best_config['Recall']:.4f}, F1={best_config['F1']:.4f}")

# When even the best recall stays below 0.9, print follow-up suggestions
if best_recall < 0.9:
    for message in (
        "\nNone of the tested configurations achieved 0.9 recall.",
        "Suggestions for improving recall:",
        "1. Try even lower thresholds (e.g., 0.4, 0.3, 0.2)",
        "2. Try higher K values (e.g., 300, 500, 1000)",
        "3. Consider aggregating predictions from multiple threshold levels",
        "4. Examine specific jobs with low recall to identify patterns",
    ):
        print(message)

Using GPU for SentenceTransformer: NVIDIA A10G

=== Testing with K=30 ===
K=30, T=0.9: Precision=0.2178, Recall=0.0390, F1=0.0594
K=30, T=0.85: Precision=0.1608, Recall=0.0795, F1=0.0930
K=30, T=0.8: Precision=0.1316, Recall=0.1260, F1=0.1114
K=30, T=0.7: Precision=0.0825, Recall=0.2354, F1=0.1127
K=30, T=0.5: Precision=0.0432, Recall=0.4123, F1=0.0749

=== Testing with K=50 ===
K=50, T=0.9: Precision=0.2168, Recall=0.0390, F1=0.0592
K=50, T=0.85: Precision=0.1588, Recall=0.0804, F1=0.0925
K=50, T=0.8: Precision=0.1288, Recall=0.1281, F1=0.1100
K=50, T=0.7: Precision=0.0782, Recall=0.2455, F1=0.1088
K=50, T=0.5: Precision=0.0347, Recall=0.4623, F1=0.0619

=== Testing with K=80 ===
K=80, T=0.9: Precision=0.2168, Recall=0.0390, F1=0.0592
K=80, T=0.85: Precision=0.1583, Recall=0.0805, F1=0.0922
K=80, T=0.8: Precision=0.1276, Recall=0.1289, F1=0.1092
K=80, T=0.7: Precision=0.0761, Recall=0.2502, F1=0.1067
K=80, T=0.5: Precision=0.0291, Recall=0.5040, F1=0.0527

=== Testing with K=100 ===
K