In [None]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle
import warnings
import os
from collections import OrderedDict

warnings.filterwarnings('ignore')

# Set the device
device = torch.device('cuda:03' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ==============================================================================
# 1. DEFINE THE MODEL ARCHITECTURE
# This must match the architecture of the saved model.
# ==============================================================================
class StyleContrastiveEncoder(nn.Module):
    def __init__(self,
                 base_model="microsoft/deberta-v3-base",
                 embedding_dim=256,
                 dropout=0.1):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(base_model)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        
        backbone_dim = self.backbone.config.hidden_size
        self.projection_head = nn.Sequential(
            nn.Linear(backbone_dim, backbone_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(backbone_dim // 2, embedding_dim),
            nn.LayerNorm(embedding_dim)
        )
        
    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        # Use clamp to avoid division by zero for empty attention masks
        attention_weights = attention_mask.unsqueeze(-1).float()
        pooled = (hidden_states * attention_weights).sum(1) / attention_weights.sum(1).clamp(min=1e-9)
        style_embedding = self.projection_head(pooled)
        return F.normalize(style_embedding, p=2, dim=1)

# ==============================================================================
# 2. DEFINE THE DETECTOR CLASS FOR INFERENCE
# ==============================================================================
class StyleDetector:
    """An efficient class for loading the model and running batch predictions."""
    def __init__(self, model_path, centroids_path, device):
        self.device = device
        
        # Load model architecture
        self.model = StyleContrastiveEncoder().to(self.device)
        
        # Load the saved model weights
        checkpoint = torch.load(model_path, map_location=device)
        
        # Handle models saved with nn.DataParallel (which add a 'module.' prefix)
        state_dict = checkpoint.get('model_state_dict', checkpoint)
        if list(state_dict.keys())[0].startswith('module.'):
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v
            self.model.load_state_dict(new_state_dict)
        else:
            self.model.load_state_dict(state_dict)
            
        self.model.eval()
        print("Model loaded and set to evaluation mode.")
        
        # Load tokenizer
        self.tokenizer = self.model.tokenizer
        
        # Load style centroids
        centroids = torch.load(centroids_path, map_location=device)
        self.human_centroid = centroids['human_centroid'].to(self.device).unsqueeze(0)
        self.gpt4_centroid = centroids['gpt4_centroid'].to(self.device).unsqueeze(0)
        print("Style centroids loaded.")

    @torch.no_grad()
    def batch_predict(self, texts: list, batch_size: int = 32):
        """Predicts labels for a list of texts in batches."""
        all_predictions = []
        
        for i in tqdm(range(0, len(texts), batch_size), desc="Benchmarking"):
            batch_texts = texts[i : i + batch_size]
            
            tokens = self.tokenizer(
                batch_texts,
                max_length=512,
                truncation=True,
                padding=True,
                return_tensors='pt'
            ).to(self.device)
            
            embeddings = self.model(tokens['input_ids'], tokens['attention_mask'])
            
            # Compare similarity to centroids
            human_sims = F.cosine_similarity(embeddings, self.human_centroid)
            gpt4_sims = F.cosine_similarity(embeddings, self.gpt4_centroid)
            
            # Prediction: 1 if closer to AI centroid, 0 if closer to human
            predictions = (gpt4_sims > human_sims).long()
            all_predictions.extend(predictions.cpu().tolist())
            
        return all_predictions

# ==============================================================================
# 3. DEFINE THE EVALUATION FUNCTION
# ==============================================================================
def evaluate_on_dataframe(detector: StyleDetector, df: pd.DataFrame):
    """Runs a full evaluation on a DataFrame and prints a detailed report."""
    print(f"\nStarting evaluation on {len(df)} samples...")
    
    # Prepare data
    texts_to_evaluate = df['text'].tolist()
    # Convert 'human'/'ai' labels to 0/1 for scikit-learn
    ground_truth_labels = df['models'].apply(lambda x: 0 if x.lower() == 'human' else 1).tolist()
    
    # Get predictions from the detector
    predictions = detector.batch_predict(texts_to_evaluate)
    
    # --- Calculate and Print Metrics ---
    print("\n" + "="*40)
    print("      Benchmark Evaluation Results")
    print("="*40)
    
    # Classification Report (Precision, Recall, F1-Score)
    print("\n--- Classification Report ---")
    print(classification_report(ground_truth_labels, predictions, target_names=['Human (Class 0)', 'AI (Class 1)']))
    
    # Overall Accuracy
    accuracy = accuracy_score(ground_truth_labels, predictions)
    print(f"Overall Accuracy: {accuracy:.4f}")
    
    # Confusion Matrix
    print("\n--- Confusion Matrix ---")
    cm = confusion_matrix(ground_truth_labels, predictions)
    tn, fp, fn, tp = cm.ravel()
    
    print(f"{'':<15} | {'Predicted Human':<15} | {'Predicted AI':<15}")
    print("-" * 50)
    print(f"{'Actual Human':<15} | {tn:<15} | {fp:<15}")
    print(f"{'Actual AI':<15} | {fn:<15} | {tp:<15}")
    print("-" * 50)
    
    # False Positive Rate (FPR)
    if (fp + tn) > 0:
        fpr = fp / (fp + tn)
        print(f"\nFalse Positive Rate (FPR): {fpr:.4f} ({fpr:.2%})")
        print("(Percentage of human texts incorrectly flagged as AI)")
    else:
        print("\nFalse Positive Rate (FPR): N/A (No human samples in test set)")
        
    print("\n" + "="*40)

# ==============================================================================
# 4. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':
    # --- Load and Prepare Your Dataset ---
    df = pd.read_csv("/home/jivnesh/Harshit_Surge/dataset/sampled_train.csv")
    data = pd.DataFrame()
    data["models"] = df["model"].apply(lambda x: "ai" if x != "human" else "human")
    data['text'] = df['title'] + " " + df['generation']
    data.dropna(subset=['text'], inplace=True) # Ensure no null texts

    df_human = data[data.models == "human"]
    df_ai = data[data.models == "ai"]
    
    # Create balanced train/test splits
    train_human = df_human.sample(n=10000, random_state=42)
    train_ai = df_ai.sample(n=10000, random_state=42)
    test_human = df_human.drop(train_human.index).sample(n=2000, random_state=42)
    test_ai = df_ai.drop(train_ai.index).sample(n=2000, random_state=42)

    train_df = pd.concat([train_human, train_ai], axis=0)
    test_df = pd.concat([test_human, test_ai], axis=0)

    # Shuffle the final test dataframe
    test_df = shuffle(test_df, random_state=42).reset_index(drop=True)
    print(f"Test dataframe created with {len(test_df)} samples.")
    
    # --- Set Paths to Your Saved Model and Centroids ---
    MODEL_PATH = 'best_style_model.pt'
    CENTROIDS_PATH = 'centroids.pt'

    # Check if files exist before proceeding
    if not os.path.exists(MODEL_PATH) or not os.path.exists(CENTROIDS_PATH):
        print(f"Error: Make sure '{MODEL_PATH}' and '{CENTROIDS_PATH}' are in the correct directory.")
    else:
        # --- Initialize the Detector and Run Evaluation ---
        detector = StyleDetector(model_path=MODEL_PATH, centroids_path=CENTROIDS_PATH, device=device)
        evaluate_on_dataframe(detector, test_df)



  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Test dataframe created with 4000 samples.


2025-07-15 05:47:47.838499: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-15 05:47:48.033917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752526068.127124 2870596 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752526068.143688 2870596 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752526068.395860 2870596 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Model loaded and set to evaluation mode.
Style centroids loaded.

Starting evaluation on 4000 samples...


Benchmarking: 100%|██████████| 125/125 [01:17<00:00,  1.61it/s]


      Benchmark Evaluation Results

--- Classification Report ---
                 precision    recall  f1-score   support

Human (Class 0)       0.94      0.99      0.97      2000
   AI (Class 1)       0.99      0.94      0.96      2000

       accuracy                           0.96      4000
      macro avg       0.97      0.96      0.96      4000
   weighted avg       0.97      0.96      0.96      4000

Overall Accuracy: 0.9645

--- Confusion Matrix ---
                | Predicted Human | Predicted AI   
--------------------------------------------------
Actual Human    | 1985            | 15             
Actual AI       | 127             | 1873           
--------------------------------------------------

False Positive Rate (FPR): 0.0075 (0.75%)
(Percentage of human texts incorrectly flagged as AI)






In [1]:
import os
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Union, List
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

# --- Pre-computation Setup & Warnings ---
torch.set_grad_enabled(False)

# --- Metric Functions (Unchanged) ---
ce_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
softmax_fn = torch.nn.Softmax(dim=-1)

def perplexity(encoding: transformers.BatchEncoding, logits: torch.Tensor, median: bool = False, temperature: float = 1.0):
    shifted_logits = logits[..., :-1, :].contiguous() / temperature
    shifted_labels = encoding.input_ids[..., 1:].contiguous()
    shifted_attention_mask = encoding.attention_mask[..., 1:].contiguous()
    if median:
        ce_nan = (ce_loss_fn(shifted_logits.transpose(1, 2), shifted_labels).masked_fill(~shifted_attention_mask.bool(), float("nan")))
        ppl = np.nanmedian(ce_nan.cpu().float().numpy(), 1)
    else:
        ppl = (ce_loss_fn(shifted_logits.transpose(1, 2), shifted_labels) * shifted_attention_mask).sum(1) / shifted_attention_mask.sum(1)
        ppl = ppl.to("cpu").float().numpy()
    return ppl

def entropy(p_logits: torch.Tensor, q_logits: torch.Tensor, encoding: transformers.BatchEncoding, pad_token_id: int, median: bool = False, sample_p: bool = False, temperature: float = 1.0):
    vocab_size = p_logits.shape[-1]
    total_tokens_available = q_logits.shape[-2]
    p_scores, q_scores = p_logits / temperature, q_logits / temperature
    p_proba = softmax_fn(p_scores).view(-1, vocab_size)
    if sample_p:
        p_proba = torch.multinomial(p_proba.view(-1, vocab_size), replacement=True, num_samples=1).view(-1)
    q_scores = q_scores.view(-1, vocab_size)
    ce = ce_loss_fn(input=q_scores, target=p_proba).view(-1, total_tokens_available)
    padding_mask = (encoding.input_ids != pad_token_id).type(torch.uint8)
    if median:
        ce_nan = ce.masked_fill(~padding_mask.bool(), float("nan"))
        agg_ce = np.nanmedian(ce_nan.cpu().float().numpy(), 1)
    else:
        agg_ce = (((ce * padding_mask).sum(1) / padding_mask.sum(1)).to("cpu").float().numpy())
    return agg_ce

# --- Binoculars Classifier (MODIFIED to use Gemma) ---
huggingface_config = {"TOKEN": os.environ.get("HF_TOKEN", None)}
# NOTE: The threshold is based on the original Falcon models.
# Performance may vary with Gemma, but we use the "accuracy" mode threshold.
BINOCULARS_ACCURACY_THRESHOLD = 0.9015310749276843
DEVICE_1 = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE_2 = "cuda:1" if torch.cuda.device_count() > 1 else DEVICE_1

class Binoculars(object):
    def __init__(self, observer_name_or_path: str = "tiiuae/falcon-7b", performer_name_or_path: str = "tiiuae/falcon-7b-instruct", use_bfloat16: bool = True, max_token_observed: int = 512) -> None:
        print("Initializing Binoculars with Gemma-2-9B models...")
        self.threshold = BINOCULARS_ACCURACY_THRESHOLD
        print(f"Loading observer model: {observer_name_or_path} onto {DEVICE_1}")
        self.observer_model = AutoModelForCausalLM.from_pretrained(observer_name_or_path, device_map={"": DEVICE_1}, trust_remote_code=True, torch_dtype=torch.bfloat16 if use_bfloat16 else torch.float32, token=huggingface_config["TOKEN"])
        print(f"Loading performer model: {performer_name_or_path} onto {DEVICE_2}")
        self.performer_model = AutoModelForCausalLM.from_pretrained(performer_name_or_path, device_map={"": DEVICE_2}, trust_remote_code=True, torch_dtype=torch.bfloat16 if use_bfloat16 else torch.float32, token=huggingface_config["TOKEN"])
        self.observer_model.eval(); self.performer_model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(observer_name_or_path, token=huggingface_config["TOKEN"])
        if not self.tokenizer.pad_token: self.tokenizer.pad_token = self.tokenizer.eos_token
        self.max_token_observed = max_token_observed
        print("Binoculars initialized successfully.")
    def _tokenize(self, batch: list[str]) -> transformers.BatchEncoding:
        return self.tokenizer(batch, return_tensors="pt", padding="longest", truncation=True, max_length=self.max_token_observed, return_token_type_ids=False)
    @torch.inference_mode()
    def _get_logits(self, encodings: transformers.BatchEncoding) -> tuple[torch.Tensor, torch.Tensor]:
        observer_logits = self.observer_model(**encodings.to(DEVICE_1)).logits
        performer_logits = self.performer_model(**encodings.to(DEVICE_2)).logits
        if torch.cuda.is_available() and DEVICE_1 != "cpu": torch.cuda.synchronize()
        return observer_logits, performer_logits
    def compute_score(self, input_text: Union[str, List[str]]) -> Union[float, List[float]]:
        batch = [input_text] if isinstance(input_text, str) else input_text
        encodings = self._tokenize(batch)
        observer_logits, performer_logits = self._get_logits(encodings)
        ppl_val = perplexity(encodings.to(DEVICE_2), performer_logits)
        x_ppl_val = entropy(observer_logits.to(DEVICE_1), performer_logits.to(DEVICE_1), encodings.to(DEVICE_1), self.tokenizer.pad_token_id)
        binoculars_scores = ppl_val / x_ppl_val
        return binoculars_scores.tolist()[0] if isinstance(input_text, str) else binoculars_scores.tolist()

# --- Evaluation Function (MODIFIED to add full stats) ---
def evaluate_on_dataframe(df: pd.DataFrame, batch_size: int = 8):
    print("\n--- Starting Binoculars Evaluation ---")
    try:
        binoculars = Binoculars()
    except Exception as e:
        print(f"\n--- ERROR ---"); print(f"Failed to initialize Binoculars classifier: {e}"); return

    text_samples = df["text"].tolist()
    true_labels = df["models"].tolist()
    all_predictions, all_scores = [], []

    print(f"\nRunning predictions on {len(text_samples)} samples in batches of {batch_size}...")
    for i in tqdm(range(0, len(text_samples), batch_size), desc="Processing Batches"):
        batch_texts = text_samples[i:i + batch_size]
        batch_scores = binoculars.compute_score(batch_texts)
        batch_predictions = np.where(np.array(batch_scores) < binoculars.threshold, "AI-generated", "Human-generated").tolist()
        all_predictions.extend(batch_predictions); all_scores.extend(batch_scores)
    
    results_df = pd.DataFrame({'true_label': ["AI-generated" if l == 'ai' else "Human-generated" for l in true_labels], 'predicted_label': all_predictions, 'binoculars_score': all_scores})
    
    # Prepare labels for sklearn metrics
    y_true_str = results_df['true_label']
    y_pred_str = results_df['predicted_label']
    y_true_bin = (y_true_str == 'AI-generated').astype(int)
    y_pred_bin = (y_pred_str == 'AI-generated').astype(int)
    y_scores = -results_df['binoculars_score'].values
    
    print("\n" + "="*50)
    print("      Binoculars Classification Statistics")
    print("="*50)
    
    # 1. Classification Report (Precision, Recall, F1-Score)
    print("\n--- Classification Report ---")
    print(classification_report(y_true_str, y_pred_str, target_names=['Human-generated', 'AI-generated']))

    # 2. Overall Accuracy
    accuracy = accuracy_score(y_true_str, y_pred_str)
    print(f"\nOverall Accuracy: {accuracy:.4f}")

    # 3. ROC AUC Score
    try:
        auc_score = roc_auc_score(y_true_bin, y_scores)
        print(f"ROC AUC Score: {auc_score:.4f}")
    except ValueError as e:
        print(f"Could not calculate ROC AUC Score: {e}")

    # 4. Confusion Matrix and FPR
    try:
        cm = confusion_matrix(y_true_bin, y_pred_bin)
        tn, fp, fn, tp = cm.ravel()
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
        
        print("\n--- Confusion Matrix ---")
        print(f"{'':<15} | {'Predicted Human':<15} | {'Predicted AI':<15}")
        print("-" * 50)
        print(f"{'Actual Human':<15} | {tn:<15} | {fp:<15}")
        print(f"{'Actual AI':<15} | {fn:<15} | {tp:<15}")
        print("-" * 50)

        print(f"\nFalse Positive Rate (FPR): {fpr:.4f} (Human text incorrectly flagged as AI)")
        
    except ValueError as e:
        print(f"Could not calculate Confusion Matrix or FPR: {e}")
    print("\n" + "="*50)

# --- Main Execution ---
print("Loading and preparing the dataset...")
try:
    df = pd.read_csv("/home/jivnesh/Harshit_Surge/dataset/sampled_train.csv")
except FileNotFoundError:
    print("Error: File '/home/jivnesh/Harshit_Surge/dataset/sampled_train.csv' not found.")
    exit()

data = pd.DataFrame()
data["models"] = df["model"].apply(lambda x: "ai" if x != "human" else "human")
data['text'] = df['title'] + " " + df['generation']
data.dropna(subset=['text'], inplace=True)

df_human = data[data.models == "human"]
df_ai = data[data.models == "ai"]

train_human = df_human.sample(n=10000, random_state=42)
train_ai = df_ai.sample(n=10000, random_state=42)
test_human = df_human.drop(train_human.index).sample(n=2000, random_state=42)
test_ai = df_ai.drop(train_ai.index).sample(n=2000, random_state=42)

test_df = pd.concat([test_human, test_ai], axis=0)
test_df = shuffle(test_df, random_state=42).reset_index(drop=True)

print("-" * 50)
print(f"Test DataFrame created for benchmarking.")
print(f"Total samples: {len(test_df)}")
print("-" * 50)

# Run the evaluation
evaluate_on_dataframe(test_df, batch_size=8) # Smaller batch size for larger models

  from .autonotebook import tqdm as notebook_tqdm


Loading and preparing the dataset...
--------------------------------------------------
Test DataFrame created for benchmarking.
Total samples: 4000
--------------------------------------------------

--- Starting Binoculars Evaluation ---
Initializing Binoculars with Gemma-2-9B models...
Loading observer model: tiiuae/falcon-7b onto cuda:0




2025-07-15 06:54:03.398055: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-15 06:54:03.427546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752530043.452627 2908091 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752530043.460137 2908091 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752530043.479395 2908091 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

Loading performer model: tiiuae/falcon-7b-instruct onto cuda:1




Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.06s/it]


Binoculars initialized successfully.

Running predictions on 4000 samples in batches of 8...


Processing Batches: 100%|██████████| 500/500 [2:53:53<00:00, 20.87s/it]  


      Binoculars Classification Statistics

--- Classification Report ---
                 precision    recall  f1-score   support

Human-generated       0.88      0.80      0.84      2000
   AI-generated       0.81      0.89      0.85      2000

       accuracy                           0.84      4000
      macro avg       0.85      0.84      0.84      4000
   weighted avg       0.85      0.84      0.84      4000


Overall Accuracy: 0.8440
ROC AUC Score: 0.9295

--- Confusion Matrix ---
                | Predicted Human | Predicted AI   
--------------------------------------------------
Actual Human    | 1784            | 216            
Actual AI       | 408             | 1592           
--------------------------------------------------

False Positive Rate (FPR): 0.1080 (Human text incorrectly flagged as AI)






In [None]:
import os
import time
import torch
import pandas as pd
import numpy as np
from scipy.stats import norm
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle
from transformers import AutoModelForCausalLM, AutoTokenizer

# A dictionary to map short names to Hugging Face model identifiers
model_fullnames = {
    'gemma3-4b': 'google/gemma-3-4b-it', # Use instruct-tuned version for better performance
}

def get_model_fullname(model_name):
    return model_fullnames.get(model_name, model_name)

def load_model(model_name, device, cache_dir, quantization=None):
    model_fullname = get_model_fullname(model_name)
    print(f'Loading model {model_fullname}...')
    model_kwargs = {"cache_dir": cache_dir}
    print("-> Loading model in bfloat16 (half-precision)...")
    model_kwargs["torch_dtype"] = torch.bfloat16
    model_kwargs["device_map"] = "auto"
    model = AutoModelForCausalLM.from_pretrained(model_fullname, **model_kwargs)
    model.eval()
    return model

def load_tokenizer(model_name, cache_dir):
    model_fullname = get_model_fullname(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_fullname, cache_dir=cache_dir)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    return tokenizer

def get_sampling_discrepancy_analytic(logits_ref, logits_score, labels):
    if logits_ref.size(-1) != logits_score.size(-1):
        vocab_size = min(logits_ref.size(-1), logits_score.size(-1))
        logits_ref = logits_ref[:, :, :vocab_size]
        logits_score = logits_score[:, :, :vocab_size]
    labels = labels.unsqueeze(-1) if labels.ndim == logits_score.ndim - 1 else labels
    lprobs_score = torch.log_softmax(logits_score, dim=-1)
    probs_ref = torch.softmax(logits_ref, dim=-1)
    log_likelihood = lprobs_score.gather(dim=-1, index=labels).squeeze(-1)
    mean_ref = (probs_ref * lprobs_score).sum(dim=-1)
    var_ref = (probs_ref * torch.square(lprobs_score)).sum(dim=-1) - torch.square(mean_ref)
    log_likelihood_sum = log_likelihood.sum(dim=-1)
    mean_ref_sum = mean_ref.sum(dim=-1)
    var_ref_sum = var_ref.sum(dim=-1)
    denominator = torch.sqrt(torch.relu(var_ref_sum)) + 1e-6
    discrepancy = (log_likelihood_sum - mean_ref_sum) / denominator
    return discrepancy.item()

def compute_prob_norm(x, mu0, sigma0, mu1, sigma1):
    pdf_value0 = norm.pdf(x, loc=mu0, scale=sigma0)
    pdf_value1 = norm.pdf(x, loc=mu1, scale=sigma1)
    prob = pdf_value1 / (pdf_value0 + pdf_value1 + 1e-6)
    return prob

class FastDetectGPTDetector:
    def __init__(self, scoring_model_name, sampling_model_name, device, cache_dir, quantization):
        self.scoring_model_name = scoring_model_name
        self.sampling_model_name = sampling_model_name
        self.scoring_tokenizer = load_tokenizer(scoring_model_name, cache_dir)
        self.scoring_model = load_model(scoring_model_name, device, cache_dir, quantization)
        if sampling_model_name == scoring_model_name:
            self.sampling_model = self.scoring_model
            self.sampling_tokenizer = self.scoring_tokenizer
        else:
            self.sampling_tokenizer = load_tokenizer(sampling_model_name, cache_dir)
            self.sampling_model = load_model(sampling_model_name, device, cache_dir, quantization)
        # Using pre-calibrated parameters
        self.classifier_params = {'mu0': -0.0707, 'sigma0': 0.9520, 'mu1': 2.9306, 'sigma1': 1.9039}

    def compute_prob(self, text):
        tokenized_score = self.scoring_tokenizer(text, truncation=True, return_tensors="pt", max_length=1024)
        labels = tokenized_score.input_ids[:, 1:].to(self.scoring_model.device)
        if labels.shape[1] == 0:
            return 0.0
        with torch.no_grad():
            inputs_score = {k: v.to(self.scoring_model.device) for k, v in tokenized_score.items()}
            logits_score = self.scoring_model(**inputs_score).logits[:, :-1]
            if self.sampling_model_name == self.scoring_model_name:
                logits_ref = logits_score
            else:
                tokenized_ref = self.sampling_tokenizer(text, truncation=True, return_tensors="pt", max_length=1024)
                inputs_ref = {k: v.to(self.sampling_model.device) for k, v in tokenized_ref.items()}
                logits_ref = self.sampling_model(**inputs_ref).logits[:, :-1]
        crit = get_sampling_discrepancy_analytic(logits_ref, logits_score, labels)
        prob = compute_prob_norm(crit, **self.classifier_params)
        return prob

# --- Script Configuration ---
SCORING_MODEL_NAME = "gemma3-4b"
SAMPLING_MODEL_NAME = "gemma3-4b" # Use the same model for simplicity
DEVICE = "cuda:03" if torch.cuda.is_available() else "cpu"
CACHE_DIR = "./model_cache"
OUTPUT_FILE = "fastdetectgpt_gemma_results.csv"

# --- Main Execution ---
print("--- Initializing Fast-DetectGPT Detector ---")
detector = FastDetectGPTDetector(
    scoring_model_name=SCORING_MODEL_NAME,
    sampling_model_name=SAMPLING_MODEL_NAME,
    device=DEVICE,
    cache_dir=CACHE_DIR,
    quantization=None
)

print("\n--- Loading and preparing dataset ---")
try:
    df = pd.read_csv("/home/jivnesh/Harshit_Surge/dataset/sampled_train.csv")
except FileNotFoundError:
    print("Error: File '/home/jivnesh/Harshit_Surge/dataset/sampled_train.csv' not found.")
    exit()

data = pd.DataFrame()
data["models"] = df["model"].apply(lambda x: "ai" if x != "human" else "human")
data['text'] = df['title'] + " " + df['generation']
data.dropna(subset=['text'], inplace=True)

df_human = data[data.models == "human"]
df_ai = data[data.models == "ai"]

train_human = df_human.sample(n=10000, random_state=42)
train_ai = df_ai.sample(n=10000, random_state=42)
test_human = df_human.drop(train_human.index).sample(n=2000, random_state=42)
test_ai = df_ai.drop(train_ai.index).sample(n=2000, random_state=42)

test_df = pd.concat([test_human, test_ai], axis=0)
test_df = shuffle(test_df, random_state=42).reset_index(drop=True)

print(f"Test DataFrame created with {len(test_df)} samples.")
print("-" * 50)

print(f"\n--- Running detection on {len(test_df)} samples ---")
all_probs = []
true_labels = []

for index, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing samples"):
    try:
        prob = detector.compute_prob(row['text'])
        all_probs.append(prob)
        true_labels.append(0 if row['models'] == 'human' else 1)
    except Exception as e:
        print(f"Error processing sample (index {index}): {e}. Skipping.")
        continue

# --- Evaluating Results ---
print("\n" + "="*50)
print("      Fast-DetectGPT Classification Statistics")
print("="*50)

if len(all_probs) > 0 and len(set(true_labels)) > 1:
    # Convert probabilities to binary predictions for classification report
    binary_predictions = [1 if p > 0.5 else 0 for p in all_probs]

    # 1. Classification Report (Precision, Recall, F1-Score)
    print("\n--- Classification Report ---")
    print(classification_report(true_labels, binary_predictions, target_names=['Human', 'AI']))

    # 2. Overall Accuracy
    accuracy = accuracy_score(true_labels, binary_predictions)
    print(f"\nOverall Accuracy: {accuracy:.4f}")

    # 3. ROC AUC Score
    roc_auc = roc_auc_score(true_labels, all_probs)
    print(f"ROC AUC Score: {roc_auc:.4f}")

    # 4. Confusion Matrix and FPR
    cm = confusion_matrix(true_labels, binary_predictions)
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

    print("\n--- Confusion Matrix ---")
    print(f"{'':<15} | {'Predicted Human':<15} | {'Predicted AI':<15}")
    print("-" * 50)
    print(f"{'Actual Human':<15} | {tn:<15} | {fp:<15}")
    print(f"{'Actual AI':<15} | {fn:<15} | {tp:<15}")
    print("-" * 50)
    print(f"\nFalse Positive Rate (FPR): {fpr:.4f} (Human text incorrectly flagged as AI)")
    print("\n" + "="*50)

    results_df = pd.DataFrame({'true_label': true_labels, 'predicted_prob_ai': all_probs})
    results_df.to_csv(OUTPUT_FILE, index=False)
    print(f"Detailed results saved to {OUTPUT_FILE}")
else:
    print(f"Could not compute metrics. Processed {len(all_probs)} samples.")