In [2]:
import datasets
from functools import partial
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split


In [3]:
def KM(ds, target_col, gt_col='ground_truth'):
    """Return the fraction of samples whose response contains a reference answer.

    A sample counts as a match when at least one entry of ``sample[gt_col]``
    satisfies ``ref in sample[target_col]``.

    Args:
        ds: Sized iterable of mapping-like samples (``len(ds)`` must work).
        target_col: Key of the response that is searched.
        gt_col: Key holding the iterable of reference answers. Defaults to
            ``'ground_truth'``, the key this function previously hard-coded,
            and mirrors the ``gt_col`` argument of ``add_KM``.

    Returns:
        The matched share as a float; ``0.0`` for an empty ``ds`` instead of
        raising ``ZeroDivisionError``.
    """
    total = len(ds)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    matched = sum(
        1
        for sample in ds
        if any(ref in sample[target_col] for ref in sample[gt_col])
    )
    return matched / total

def add_KM(sample, target_col, gt_col):
    """Attach a binary keyword-match flag for one response column.

    Stores 1 under ``KM_<target_col>`` when any entry of ``sample[gt_col]``
    satisfies ``ref in sample[target_col]``, otherwise 0. The sample is
    modified in place and also returned.
    """
    # Every reference is tested (no early exit), one membership check per entry.
    hits = [ref in sample[target_col] for ref in sample[gt_col]]
    sample[f'KM_{target_col}'] = 1 if any(hits) else 0
    return sample

def add_self_rag_retrieval(sample):
    """Flag whether the Self-RAG response contains the ``[Retrieval]`` marker.

    Writes the boolean result to ``sample['self_rag_need_retrieval']`` and
    returns the same (mutated) sample.
    """
    response = sample['self_rag_response']
    sample['self_rag_need_retrieval'] = '[Retrieval]' in response
    return sample

def need_context(sample, with_context_col, without_context_col, need_context_col):
    """Mark a sample as needing context when context improved its KM flag.

    Compares ``sample['KM_<with_context_col>']`` against
    ``sample['KM_<without_context_col>']`` and stores 1 under
    ``need_context_col`` when the former is strictly greater, else 0.
    The sample is modified in place and returned.
    """
    km_with = sample[f'KM_{with_context_col}']
    km_without = sample[f'KM_{without_context_col}']
    sample[need_context_col] = 1 if km_with > km_without else 0
    return sample

In [4]:
# Load the dataset previously saved to disk; the path is relative to the working directory.
ds = datasets.load_from_disk('../data/datasets/nq')

In [6]:
# Inspect the raw 'Verbalized1S' column; the recorded output below contains many nan entries.
ds['Verbalized1S']

[nan,
 0.6,
 0.30000000000000004,
 0.30000000000000004,
 0.30000000000000004,
 nan,
 0.5,
 nan,
 nan,
 nan,
 nan,
 0.050000000000000044,
 0.19999999999999996,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.30000000000000004,
 nan,
 0.30000000000000004,
 0.30000000000000004,
 0.15000000000000002,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.19999999999999996,
 nan,
 nan,
 nan,
 0.8,
 nan,
 nan,
 0.19999999999999996,
 0.19999999999999996,
 nan,
 nan,
 nan,
 0.30000000000000004,
 nan,
 0.30000000000000004,
 0.15000000000000002,
 nan,
 nan,
 nan,
 0.8,
 nan,
 0.6,
 nan,
 nan,
 0.19999999999999996,
 0.30000000000000004,
 nan,
 0.19999999999999996,
 nan,
 nan,
 0.15000000000000002,
 nan,
 0.6,
 nan,
 nan,
 0.30000000000000004,
 nan,
 0.30000000000000004,
 nan,
 nan,
 0.19999999999999996,
 0.19999999999999996,
 0.25,
 nan,
 0.19999999999999996,
 0.19999999999999996,
 nan,
 0.19999999999999996,
 nan,
 nan,
 0.050000000000000044,
 0.19999999999999996,
 0.6,
 nan,
 nan,
 nan,
 nan,
 0.19999999999999996,

In [8]:
# Add a KM_<column> keyword-match flag for the with-context and the no-context responses.
for response_col in ('all_context_response', 'no_context_response'):
    ds = ds.map(partial(add_KM, target_col=response_col, gt_col='ground_truth'))

In [None]:
# NOTE(review): identical to the first map call in the cell above; it only
# recomputes the KM_all_context_response column. Candidate for removal.
ds = ds.map(partial(add_KM, target_col='all_context_response', gt_col='ground_truth'))


In [9]:
# Label a sample 1 when its with-context KM flag is strictly greater than its no-context KM flag.
ds = ds.map(
    partial(
        need_context,
        with_context_col='all_context_response',
        without_context_col='no_context_response',
        need_context_col='gt_need_retrieval',
    )
)

In [10]:
# Target vector: the 0/1 'gt_need_retrieval' labels as a NumPy array.
y = np.array(ds['gt_need_retrieval'])

In [11]:
# Mean of the 0/1 labels, i.e. the share of samples labelled as needing retrieval.
y.mean()

0.35668549905838043

In [22]:
# Feature table: one DataFrame column per score column read from the dataset.
# 'Verbalized1S' is intentionally left out of the tuple below.
data = pd.DataFrame({
    name: ds[name]
    for name in ('Perplexity', 'MeanTokenEntropy', 'PTrue')
})

# Re-wrap the labels as a NumPy array (y is expected to be 1-D).
y = np.array(y)


In [36]:
# Evaluate every feature on its own as a predictor of y.
# NOTE(review): the model is fitted and scored on the same rows (this cell has no
# hold-out split), so the accuracy / ROC AUC printed here are in-sample figures.
for col in data.columns:
    print(f"Evaluating feature: {col}")

    # Step 1: Pearson correlation between the feature and y (the p-value is discarded)
    corr, _ = pearsonr(data[col], y)
    print(f"Correlation with y: {corr:.3f}")
    
    # Step 2: Logistic regression to predict y from the single feature
    X = data[[col]].values  # Double brackets keep a 2-D, single-column feature matrix
    
    # Initialize the logistic regression model with its default settings
    log_reg = LogisticRegression()
    
    # Fit the logistic regression model
    log_reg.fit(X, y)
    
    # Predict hard labels and the probability of the second class
    y_pred = log_reg.predict(X)
    y_pred_prob = log_reg.predict_proba(X)[:, 1]  # For ROC AUC
    
    # Step 3: Evaluate accuracy and ROC AUC
    accuracy = accuracy_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_pred_prob)
    
    print(f"Accuracy: {accuracy:.3f}")
    print(f"ROC AUC: {roc_auc:.3f}")
    
    # Step 4: Print the confusion matrix (the plotting code below is disabled)
    conf_matrix = confusion_matrix(y, y_pred)
    print(conf_matrix)
    # NOTE(review): re-enabling the plot also needs matplotlib.pyplot imported as plt,
    # which the import cell visible in this notebook does not do - TODO confirm.
    # disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
    # disp.plot()
   # plt.title(f"Confusion Matrix for {col}")
   # plt.show()
    
    print("\n" + "="*50 + "\n")

Evaluating feature: Perplexity
Correlation with y: 0.149
Accuracy: 0.644
ROC AUC: 0.598
[[1685   23]
 [ 921   26]]


Evaluating feature: MeanTokenEntropy
Correlation with y: 0.182
Accuracy: 0.639
ROC AUC: 0.623
[[1614   94]
 [ 864   83]]


Evaluating feature: PTrue
Correlation with y: 0.003
Accuracy: 0.643
ROC AUC: 0.493
[[1708    0]
 [ 947    0]]




In [7]:
# NOTE(review): the recorded NameError comes from running this cell (execution count 7)
# before the cell that defines y (execution count 10); it repeats the earlier y.mean() cell.
y.mean()

NameError: name 'y' is not defined

In [34]:
# All feature columns together as one design matrix
X = data.values

# Class-balanced logistic regression; fit() returns the fitted estimator itself
log_reg = LogisticRegression(class_weight='balanced').fit(X, y)

# Hard labels, plus the probability of the second class for ROC AUC
y_pred = log_reg.predict(X)
y_pred_prob = log_reg.predict_proba(X)[:, 1]

# Metrics are computed on the same rows the model was fitted on
accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_pred_prob)

print(f"Accuracy: {accuracy:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

Accuracy: 0.608
ROC AUC: 0.625


In [30]:
# Number of samples in the dataset.
len(ds)

2655

In [33]:
# Keep num_train stratified samples for tuning; the remainder forms the validation split.
num_train = 600
X_train, X_val, y_train, y_val = train_test_split(
    data,
    y,
    train_size=num_train,
    stratify=y,
    random_state=42,
)

# Function to find the best threshold that maximizes ROC AUC
def find_best_threshold(column, y_train, num_thresholds=100):
    """Grid-search the cut-off on one feature that maximizes ROC AUC.

    Candidate cut-offs are ``num_thresholds`` evenly spaced values between the
    column's minimum and maximum (both included). For each candidate, entries
    with ``column >= threshold`` are predicted as 1 and the rest as 0, and
    those hard 0/1 predictions are scored with ``roc_auc_score``.

    Only the "larger value means positive" direction is searched; the
    opposite direction (``column < threshold``) is never tried.

    Args:
        column: Feature values supporting ``.min()``, ``.max()`` and an
            elementwise ``>=`` whose result has ``.astype`` (e.g. a pandas
            Series or NumPy array).
        y_train: Binary labels aligned with ``column``.
        num_thresholds: Number of candidate cut-offs on the grid. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        Tuple ``(best_threshold, best_roc_auc)``. Ties keep the smallest
        cut-off because only a strictly better score replaces the current
        best. ``(0, 0)`` is returned when no candidate scores above 0.
    """
    best_threshold = 0
    best_roc_auc = 0
    candidates = np.linspace(column.min(), column.max(), num_thresholds)

    for candidate in candidates:
        # Binarize the feature at this cut-off.
        predicted = (column >= candidate).astype(int)

        # Score the hard predictions against the labels.
        score = roc_auc_score(y_train, predicted)

        # Strict comparison: the first cut-off reaching a score wins ties.
        if score > best_roc_auc:
            best_roc_auc = score
            best_threshold = candidate

    return best_threshold, best_roc_auc

# Step 2: For each feature, find the threshold that maximizes ROC AUC on training set
results = {}

for col in X_train.columns:
    print(f"Evaluating feature: {col}")

    # Tune the cut-off on the training split only
    best_threshold, best_roc_auc_train = find_best_threshold(X_train[col], y_train)
    print(f"Best threshold on train set for {col}: {best_threshold:.3f} with ROC AUC: {best_roc_auc_train:.3f}")

    # Apply that fixed cut-off to the validation split and score it
    y_pred_val = (X_val[col] >= best_threshold).astype(int)
    roc_auc_val = roc_auc_score(y_val, y_pred_val)
    print(f"ROC AUC on validation set for {col}: {roc_auc_val:.3f}")

    # Keep the per-feature numbers for the summary printed afterwards
    results[col] = dict(
        best_threshold=best_threshold,
        train_roc_auc=best_roc_auc_train,
        validation_roc_auc=roc_auc_val,
    )
    print("="*50)

# Recap of every feature's tuned threshold and both scores
print("Summary of results:")
for col, metrics in results.items():
    print(f"Feature: {col}")
    print(f"  Best Threshold: {metrics['best_threshold']:.3f}")
    print(f"  Train ROC AUC: {metrics['train_roc_auc']:.3f}")
    print(f"  Validation ROC AUC: {metrics['validation_roc_auc']:.3f}")
    print("="*50)

Evaluating feature: Perplexity
Best threshold on train set for Perplexity: 0.156 with ROC AUC: 0.592
ROC AUC on validation set for Perplexity: 0.580
Evaluating feature: MeanTokenEntropy
Best threshold on train set for MeanTokenEntropy: 0.311 with ROC AUC: 0.587
ROC AUC on validation set for MeanTokenEntropy: 0.597
Evaluating feature: PTrue
Best threshold on train set for PTrue: 35.717 with ROC AUC: 0.503
ROC AUC on validation set for PTrue: 0.500
Summary of results:
Feature: Perplexity
  Best Threshold: 0.156
  Train ROC AUC: 0.592
  Validation ROC AUC: 0.580
Feature: MeanTokenEntropy
  Best Threshold: 0.311
  Train ROC AUC: 0.587
  Validation ROC AUC: 0.597
Feature: PTrue
  Best Threshold: 35.717
  Train ROC AUC: 0.503
  Validation ROC AUC: 0.500
