# In [None]:  (notebook cell prompt left over from the export)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import os
import time
import random

# ======================
# CONSTANTS
# ======================
# Column names, stopping parameters and file locations used by the whole script.
PREDICTOR1 = 'voicing'       # Column used as the first model feature
PREDICTOR2 = 'duration'      # Column used as the second model feature
TARGET = 'answer'            # Column that receives the numeric label returned by the virtual agent
FILENAME_COL = 'filename'    # Column identifying a sample; also the lookup key in participant CSVs
LABEL_MAPPING = {'s': 0, 'z': 1}  # Raw recorded answer -> numeric class; update with your actual labels

INIT_RANDOM_SAMPLES = 10     # Number of randomly drawn samples labelled before active learning starts
MIN_ITERATIONS = 20          # Certainty-based stopping is only allowed once the iteration counter reaches this
MODEL_CERTAINTY_CUTOFF = 0.95 # Stop when every unlabelled sample has uncertainty <= 1 - this value
PARTICIPANT_TO_MODEL = 'p03' # Participant ID to simulate; answers come from "<id>.csv"


DATA_PATH = 'data/data.csv'                         # Main data file path
PARTICIPANT_CSV_DIR = 'data/participants' # Directory containing one "<participant>.csv" per participant

# ======================
# DATA LOADING
# ======================
# Read the full sample table the experiment works on
data = pd.read_csv(DATA_PATH)

# Add the bookkeeping columns the rest of the script relies on, leaving
# untouched any that the CSV already provides
for _column, _default in (('answered', False), (TARGET, np.nan)):
    if _column not in data.columns:
        data[_column] = _default

# ======================
# VIRTUAL AGENT FUNCTION
# ======================
# Participant answer tables keyed by participant id, so each CSV is parsed
# once instead of once per queried sample.
_PARTICIPANT_TABLES = {}


def _load_participant_table(participant):
    """Return the answer table for ``participant``, reading its CSV on first use only."""
    if participant not in _PARTICIPANT_TABLES:
        participant_path = os.path.join(PARTICIPANT_CSV_DIR, f"{participant}.csv")
        _PARTICIPANT_TABLES[participant] = pd.read_csv(participant_path)
    return _PARTICIPANT_TABLES[participant]


def virtual_agent(filename, participant):
    """
    Simulate a human participant by replaying a pre-recorded answer.

    Args:
        filename: Value looked up in the FILENAME_COL column of the
            participant's CSV.
        participant: Participant id; answers are read from
            "<PARTICIPANT_CSV_DIR>/<participant>.csv".

    Returns:
        The numeric label that LABEL_MAPPING assigns to the 'answer_batch'
        value of the first row matching ``filename``.

    Raises:
        ValueError: If no row matches ``filename``.
        KeyError: If the recorded answer is not a key of LABEL_MAPPING.
    """
    participant_data = _load_participant_table(participant)

    # Find matching row and return answer
    match = participant_data[participant_data[FILENAME_COL] == filename]
    if match.empty:
        raise ValueError(f"Filename {filename} not found in {participant}'s data")

    raw_answer = match['answer_batch'].values[0]
    return LABEL_MAPPING[raw_answer]  # Convert to numerical

# ======================
# ACTIVE LEARNING SYSTEM
# ======================
def calculate_uncertainty(probs):
    """
    Return the least-confidence uncertainty of binary class probabilities.

    The result is ``1 - max(p, 1 - p)``: it is 0.5 when p == 0.5 (right on
    the decision boundary) and falls to 0 as p approaches 0 or 1.

    Args:
        probs: Probability of one class; a scalar, NumPy array, or a plain
            list/tuple of floats.

    Returns:
        Uncertainty values with the same shape as ``probs``.
    """
    # Plain sequences do not support ``1 - probs``; convert them first.
    if isinstance(probs, (list, tuple)):
        probs = np.asarray(probs, dtype=float)
    return 1 - np.maximum(probs, 1 - probs)

def plot_results(answered_data, unanswered_data, model):
    """
    Plot labelled and unlabelled samples over the model's probability surface.

    Args:
        answered_data: Rows that already have a label in the TARGET column;
            may be empty.
        unanswered_data: Rows without a label; may be empty.
        model: Fitted classifier exposing ``predict_proba`` for the two
            predictor columns.

    Note:
        The axis extent of the probability grid is taken from the
        module-level ``data`` frame, not from the two frames passed in.
    """
    plt.figure(figsize=(10, 6))

    # Labels still stored as raw strings are converted with the shared mapping
    if answered_data[TARGET].dtype == 'object':
        answered_data = answered_data.copy()
        answered_data[TARGET] = answered_data[TARGET].map(LABEL_MAPPING)

    # Plot answered points with proper color mapping
    scatter = None  # stays None when there is nothing labelled to draw
    if not answered_data.empty:
        scatter = plt.scatter(
            answered_data[PREDICTOR1],
            answered_data[PREDICTOR2],
            c=answered_data[TARGET],
            cmap='coolwarm',
            label='Answered',
            edgecolors='k',
            vmin=0,  # Ensure color scale matches binary classification
            vmax=1
        )

    # Plot unanswered points if any remain
    has_unanswered = not unanswered_data.empty
    if has_unanswered:
        plt.scatter(
            unanswered_data[PREDICTOR1],
            unanswered_data[PREDICTOR2],
            c='gray',
            alpha=0.5,
            label='Unanswered'
        )

    # Create decision boundary grid spanning the full data range plus a margin
    x_min, x_max = data[PREDICTOR1].min() - 1, data[PREDICTOR1].max() + 1
    y_min, y_max = data[PREDICTOR2].min() - 1, data[PREDICTOR2].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    # Use a DataFrame with the training column names so the model sees the
    # same feature names it was fitted with
    grid_points = pd.DataFrame(
        np.c_[xx.ravel(), yy.ravel()],
        columns=[PREDICTOR1, PREDICTOR2]
    )

    Z = model.predict_proba(grid_points)[:, 1]
    Z = Z.reshape(xx.shape)

    contour = plt.contourf(xx, yy, Z, alpha=0.3, levels=20, cmap='coolwarm')
    # Fall back to the contour as the colorbar source when no labelled points
    # were drawn; previously this raised NameError on the unbound scatter.
    colorbar_source = scatter if scatter is not None else contour
    plt.colorbar(colorbar_source, label='Predicted Probability')
    plt.xlabel(PREDICTOR1)
    plt.ylabel(PREDICTOR2)
    plt.title('Active Learning Results with Decision Boundary')
    # A legend without labelled artists only produces a warning
    if scatter is not None or has_unanswered:
        plt.legend()
    plt.show()

# ======================
# EXPERIMENT EXECUTION
# ======================
# Reference timestamp; the results section subtracts it from time.time() to report the total runtime
start_time = time.time()

# Phase 1: Initial random sampling
print("Starting initial random sampling...")

def collect_sample():
    """
    Label one randomly chosen unanswered row of the global ``data`` frame.

    Returns:
        True when a row was labelled, False when no unanswered rows remain.
    """
    pool = data.loc[~data['answered']]
    if pool.empty:
        return False

    # Draw one row and ask the simulated participant for its label
    picked_name = pool.sample(1)[FILENAME_COL].values[0]
    response = virtual_agent(picked_name, PARTICIPANT_TO_MODEL)

    # Record the label on every row carrying that filename
    row_mask = data[FILENAME_COL] == picked_name
    data.loc[row_mask, TARGET] = response
    data.loc[row_mask, 'answered'] = True
    return True

# 1. Collect minimum initial samples (stops early if the pool runs dry)
samples_collected = 0
for _ in range(INIT_RANDOM_SAMPLES):
    if not collect_sample():
        break  # No more samples available
    samples_collected += 1

# 2. Ensure class diversity
answered_data = data[data['answered']]
unique_classes = answered_data[TARGET].dropna().unique()

# Keep drawing until both classes are present; collect_sample() returns
# False once no unanswered rows remain, which also ends the loop.
while len(unique_classes) < 2 and collect_sample():
    answered_data = data[data['answered']]
    unique_classes = answered_data[TARGET].dropna().unique()

# 3. Check if we succeeded
if len(unique_classes) < 2:
    print("WARNING: Only one class present after initial sampling!")
    print("Cannot train model - exiting.")
    # exit() is the interactive helper installed by `site`/IPython and does
    # not reliably stop execution; raising SystemExit always halts here and
    # reports failure through a non-zero status.
    raise SystemExit(1)

# Phase 2: Active learning with uncertainty sampling
print("\nStarting active learning phase...")
# Start the counter at the number of samples actually labelled so far: the
# class-diversity step may have drawn more than INIT_RANDOM_SAMPLES, and a
# small pool may have supplied fewer.
iteration = int(data['answered'].sum())

feature_cols = [PREDICTOR1, PREDICTOR2]

while True:
    # Get training data
    answered_data = data[data['answered']]
    X_train = answered_data[feature_cols]
    y_train = answered_data[TARGET]

    # Check if we have enough samples to train
    if len(y_train) < 2:
        print("Not enough samples to train model.")
        break

    # Retrain from scratch on everything labelled so far
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Get unlabeled samples
    unanswered_data = data[~data['answered']]
    if unanswered_data.empty:
        print("All samples have been answered.")
        break

    # Calculate uncertainties; selecting by column list keeps the feature
    # names the model was fitted with
    X_unanswered = unanswered_data[feature_cols]
    probs = model.predict_proba(X_unanswered)[:, 1]
    uncertainties = calculate_uncertainty(probs)

    # Stop once the minimum number of iterations is reached and every
    # remaining sample is predicted with the required certainty
    if iteration >= MIN_ITERATIONS and np.all(uncertainties <= (1 - MODEL_CERTAINTY_CUTOFF)):
        print(f"All remaining samples meet certainty threshold {MODEL_CERTAINTY_CUTOFF} after {iteration} iterations.")
        break

    # Select most uncertain sample; ties are broken at random
    max_uncertainty = uncertainties.max()
    candidates = unanswered_data[uncertainties == max_uncertainty]
    selected_sample = candidates.sample(1)

    # Get virtual agent response
    filename = selected_sample[FILENAME_COL].values[0]
    answer = virtual_agent(filename, PARTICIPANT_TO_MODEL)

    # Update dataframe
    selected_rows = data[FILENAME_COL] == filename
    data.loc[selected_rows, TARGET] = answer
    data.loc[selected_rows, 'answered'] = True

    iteration += 1
    print(f"Iteration {iteration}: Selected {filename} with uncertainty {max_uncertainty:.3f}")

# ======================
# UPDATED FINAL RESULTS
# ======================
runtime = time.time() - start_time
answered_count = data['answered'].sum()
total_samples = len(data)

print("\n=== Experiment Results ===")
print(f"Total runtime: {runtime:.2f} seconds")
print(f"Answered samples: {answered_count}/{total_samples}")

# Load TRUE answers from participant's lookup table
participant_path = os.path.join(PARTICIPANT_CSV_DIR, f"{PARTICIPANT_TO_MODEL}.csv")
true_answers = pd.read_csv(participant_path)[[FILENAME_COL, 'answer_batch']]
true_answers = true_answers.rename(columns={'answer_batch': 'true_answer'})
# Keep one row per filename (the first, the same row virtual_agent answers
# from) so the left merge cannot multiply rows of `data` and invalidate the
# counts computed above.
true_answers = true_answers.drop_duplicates(subset=FILENAME_COL, keep='first')

# Merge true answers with our data; samples missing from the lookup table
# keep a NaN true_answer
data = data.merge(true_answers, on=FILENAME_COL, how='left')
data['true_answer'] = data['true_answer'].map(LABEL_MAPPING)  # Convert to numerical

def _print_metrics(y_true, y_pred, accuracy_label, matrix_title):
    """Print accuracy, precision, recall, F1 and the confusion matrix for one subset."""
    print(f"{accuracy_label}{accuracy_score(y_true, y_pred):.3f}")
    print(f"Precision: {precision_score(y_true, y_pred):.3f}")
    print(f"Recall:    {recall_score(y_true, y_pred):.3f}")
    print(f"F1 Score:  {f1_score(y_true, y_pred):.3f}")
    print(matrix_title)
    print(confusion_matrix(y_true, y_pred))


if answered_count > 0:
    feature_cols = [PREDICTOR1, PREDICTOR2]
    answered_mask = data['answered']

    # Train final model on every labelled sample
    final_model = LogisticRegression()
    X_final = data.loc[answered_mask, feature_cols]
    y_final = data.loc[answered_mask, TARGET]
    final_model.fit(X_final, y_final)

    # Add predictions to dataframe
    data['prediction'] = final_model.predict(data[feature_cols])
    data['certainty'] = np.max(final_model.predict_proba(data[feature_cols]), axis=1)

    # Calculate performance metrics
    print("\n=== Model Performance ===")

    # Rows without a usable ground-truth answer carry NaN in 'true_answer';
    # the sklearn metrics reject NaN, so such rows are left out of scoring.
    has_truth = data['true_answer'].notna()
    missing_truth = int((~has_truth).sum())
    if missing_truth:
        print(f"Note: {missing_truth} samples have no ground-truth answer and are excluded from the metrics.")

    # 1. On answered samples (human-labeled)
    scored_answered = answered_mask & has_truth
    y_true_answered = data.loc[scored_answered, 'true_answer']
    y_pred_answered = data.loc[scored_answered, 'prediction']

    print("\n[Answered Samples]")
    _print_metrics(y_true_answered, y_pred_answered, "Accuracy:  ", "Confusion Matrix:")

    # 2. On all samples (model predictions vs ground truth)
    y_true_all = data.loc[has_truth, 'true_answer']
    y_pred_all = data.loc[has_truth, 'prediction']

    print("\n[All Samples vs Lookup Table]")
    _print_metrics(y_true_all, y_pred_all, "Overall Accuracy:  ", "Confusion Matrix (True vs Predicted):")

    # 3. Unanswered sample statistics
    if answered_count < total_samples:
        unanswered_mask = ~answered_mask
        scored_unanswered = unanswered_mask & has_truth
        print("\n[Unanswered Samples]")
        print(f"Model Certainty Mean: {data[unanswered_mask]['certainty'].mean():.3f}")
        print(f"Model Certainty Min:  {data[unanswered_mask]['certainty'].min():.3f}")
        if scored_unanswered.any():
            unanswered_accuracy = accuracy_score(
                data.loc[scored_unanswered, 'true_answer'],
                data.loc[scored_unanswered, 'prediction'],
            )
            print(f"Prediction Accuracy:  {unanswered_accuracy:.3f}")
        else:
            # Nothing to score against; accuracy_score would fail on empty input
            print("Prediction Accuracy:  n/a (no ground-truth answers)")

else:
    print("No samples answered - no model trained")

# Draw the result figure, which needs the model trained on labelled samples
if answered_count > 0:
    labelled_mask = data['answered']
    answered_data = data[labelled_mask]
    unanswered_data = data[~labelled_mask]
    plot_results(answered_data, unanswered_data, final_model)

# Write the final table to disk, without the index column
processed_path = 'data_processed.csv'
data.to_csv(processed_path, index=False)
print(f"\nProcessed data with predictions saved to {processed_path}")

# Captured cell output: FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'