# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import os
import time


# ======================
# CONSTANTS
# ======================

# --- column names ---
# first predictor column name
PREDICTOR1 = 'voicing'
# second predictor column name
PREDICTOR2 = 'duration'
# target column name
TARGET = 'answer'
# filename column name
FILENAME_COL = 'filename'
# binary output label mapping
LABEL_MAPPING = {
    's': 0,
    'z': 1,
}

# --- file locations ---
# sound info data file path
DATA_PATH = 'data/data.csv'
# participant CSV directory
PARTICIPANT_CSV_DIR = 'data/participants'
# processed data file path; leave blank to disable
PROCESSED_PATH = 'data_processed.csv'

# --- sampling behaviour ---
# initial random samples to collect
INIT_RANDOM_SAMPLES = 10
# minimum number of iterations
MIN_ITERATIONS = 20
# insert a high-certainty sample every nth iteration to prevent participant
# fatigue (irrelevant for virtual agents); 0 to disable
CLEANSER_FREQUENCY = 0
# stopping certainty threshold
MODEL_CERTAINTY_CUTOFF = 0.95
# participant ID to simulate
PARTICIPANT_TO_MODEL = 'p03'


# ======================
# DATA LOADING
# ======================

# read the sound info table that drives the whole experiment
data = pd.read_csv(DATA_PATH)

# add the bookkeeping columns only when the file does not already carry them:
# 'answered' flags rows that have been labelled, TARGET holds the label
if 'answered' not in data:
    data['answered'] = False
if TARGET not in data:
    data[TARGET] = np.nan


# ======================
# VIRTUAL AGENT FUNCTION
# ======================

# participant answer tables keyed by participant ID; each CSV is read once and
# reused, because the sampling loops query the agent once per sample
_PARTICIPANT_TABLES = {}

def virtual_agent(filename, participant):
    """
    Simulates human participant responses using pre-recorded answers

    Looks `filename` up in the participant's CSV
    (PARTICIPANT_CSV_DIR/<participant>.csv) and returns the recorded
    'answer_batch' value of the first matching row, converted through
    LABEL_MAPPING. The CSV is loaded on the first query for a participant
    and cached, so later edits to the file are not picked up.

    Args:
        filename: value to search for in the FILENAME_COL column
        participant: participant ID, also used as the CSV's base name

    Returns:
        The numerical label mapped from the recorded answer.

    Raises:
        ValueError: if `filename` is not present in the participant's CSV
        KeyError: if the recorded answer has no entry in LABEL_MAPPING
    """
    # load the participant's answer lookup table (first query only)
    participant_data = _PARTICIPANT_TABLES.get(participant)
    if participant_data is None:
        participant_path = os.path.join(PARTICIPANT_CSV_DIR, f"{participant}.csv")
        participant_data = pd.read_csv(participant_path)
        _PARTICIPANT_TABLES[participant] = participant_data

    # find the matching row
    match = participant_data[participant_data[FILENAME_COL] == filename]
    if match.empty:
        raise ValueError(f"Filename {filename} not found in {participant}'s data")

    # convert the first recorded answer to its numerical label
    raw_answer = match['answer_batch'].values[0]
    try:
        return LABEL_MAPPING[raw_answer]
    except KeyError:
        raise KeyError(
            f"Answer {raw_answer!r} for {filename} in {participant}'s data "
            "has no entry in LABEL_MAPPING"
        ) from None


# ======================
# ACTIVE LEARNING SYSTEM
# ======================

def calculate_uncertainty(probs):
    """
    Calculate uncertainty as closeness to the decision boundary

    For each probability p the result is 1 - max(p, 1 - p): 0.5 when p is
    exactly 0.5 (most uncertain) and 0 when p is 0 or 1 (most certain).

    Args:
        probs: probability or probabilities of one class; a scalar, a NumPy
            array, or a plain sequence such as a list

    Returns:
        The uncertainty value(s), element-wise for array-like input.
    """
    # np.subtract (rather than `1 - probs`) also accepts plain lists/tuples
    complement = np.subtract(1, probs)
    return 1 - np.maximum(probs, complement)

def plot_results(answered_data, unanswered_data, model):
    """
    Visualize results with decision boundary

    Draws the answered samples colored by their label, the unanswered
    samples in gray, and the model's predicted probability for the second
    class as a filled contour over the predictor plane, then shows the
    figure.

    Args:
        answered_data: rows carrying a label in the TARGET column; labels of
            object dtype are converted through LABEL_MAPPING on a copy
        unanswered_data: rows that have not been labelled yet
        model: fitted classifier whose predict_proba accepts a frame with
            the PREDICTOR1 and PREDICTOR2 columns

    Note:
        The extent of the contour grid is taken from the module-level `data`
        frame, not from the two frames passed in.
    """
    plt.figure(figsize=(10, 6))

    # convert boolean/categorical answers to numerical if needed
    if answered_data[TARGET].dtype == 'object':
        answered_data = answered_data.copy()
        answered_data[TARGET] = answered_data[TARGET].map(LABEL_MAPPING)

    # plot the answered points with proper color mapping; `scatter` stays
    # None when there is nothing to plot so the colorbar can fall back below
    scatter = None
    if not answered_data.empty:
        scatter = plt.scatter(
            answered_data[PREDICTOR1],
            answered_data[PREDICTOR2],
            c=answered_data[TARGET],
            cmap='coolwarm',
            label='answered',
            edgecolors='k',
            vmin=0,  # ensure color scale matches binary classification
            vmax=1,
        )

    # plot unanswered points if any remain
    has_unanswered = not unanswered_data.empty
    if has_unanswered:
        plt.scatter(
            unanswered_data[PREDICTOR1],
            unanswered_data[PREDICTOR2],
            c='gray',
            alpha=0.5,
            label='Unanswered',
        )

    # create decision boundary grid, padded by 1 around the full data range
    x_min, x_max = data[PREDICTOR1].min() - 1, data[PREDICTOR1].max() + 1
    y_min, y_max = data[PREDICTOR2].min() - 1, data[PREDICTOR2].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    # wrap the grid in a DataFrame so the model sees its training column names
    grid_points = pd.DataFrame(
        np.c_[xx.ravel(), yy.ravel()],
        columns=[PREDICTOR1, PREDICTOR2],
    )

    Z = model.predict_proba(grid_points)[:, 1]
    Z = Z.reshape(xx.shape)

    contour = plt.contourf(xx, yy, Z, alpha=0.3, levels=20, cmap='coolwarm')

    # the colorbar is tied to the answered-points scatter as before; without
    # answered points that name used to be unbound, so use the contour instead
    mappable = scatter if scatter is not None else contour
    plt.colorbar(mappable, label='Predicted Probability')
    plt.xlabel(PREDICTOR1)
    plt.ylabel(PREDICTOR2)
    plt.title(f'Virtual Agent Results (participant: {PARTICIPANT_TO_MODEL})')
    # only build a legend when at least one labelled artist was drawn
    if scatter is not None or has_unanswered:
        plt.legend()
    plt.show()


# ======================
# EXPERIMENT EXECUTION
# ======================

start_time = time.time()

# initial random sampling phase
print("Starting initial random sampling...")

def collect_sample():
    """
    Label one randomly chosen unanswered row of `data`

    Picks a single row that is not yet flagged as answered, asks the virtual
    agent (as PARTICIPANT_TO_MODEL) for its answer, and records the answer
    and the 'answered' flag on every row with that filename.

    Returns:
        True when a sample was labelled, False when no unanswered rows remain.
    """
    pool = data.loc[~data['answered']]
    if pool.empty:
        return False

    # draw one row and ask the virtual agent about its file
    chosen_file = pool.sample(1)[FILENAME_COL].values[0]
    response = virtual_agent(chosen_file, PARTICIPANT_TO_MODEL)

    # write the response back to the shared dataframe
    row_mask = data[FILENAME_COL] == chosen_file
    data.loc[row_mask, TARGET] = response
    data.loc[row_mask, 'answered'] = True
    return True

# collect minimum initial samples
samples_collected = 0
while samples_collected < INIT_RANDOM_SAMPLES:
    if not collect_sample():
        break  # no more samples available
    samples_collected += 1

# ensure class diversity
answered_data = data[data['answered']]
unique_classes = answered_data[TARGET].dropna().unique()

# continue sampling until we get at least 2 classes or run out of samples
# (collect_sample() returns False once nothing is left to draw)
while len(unique_classes) < 2 and collect_sample():
    answered_data = data[data['answered']]
    unique_classes = answered_data[TARGET].dropna().unique()

# check if we succeeded
if len(unique_classes) < 2:
    print("WARNING: Only one class present after initial sampling!")
    print("Cannot train model - exiting.")
    # raise SystemExit directly: the `exit()` helper is installed by the
    # `site` module for interactive sessions and is not meant for programs
    raise SystemExit

# uncertainty sampling phase
print("\nStarting active learning phase...")
iteration = INIT_RANDOM_SAMPLES

# each pass: refit on everything labelled so far, score the unlabelled rows,
# stop if the model is certain enough, otherwise label one more row
while True:
    # get training data
    answered_data = data[data['answered']]
    X_train = answered_data[[PREDICTOR1, PREDICTOR2]]
    y_train = answered_data[TARGET]

    # check if we have enough samples to train
    if len(y_train) < 2:
        print("Not enough samples to train model.")
        break

    # train logistic regression
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # get unlabeled samples
    unanswered_data = data[~data['answered']]
    if unanswered_data.empty:
        print("All samples have been answered.")
        break

    # calculate uncertainties (column selection already keeps the names the
    # model was fitted with)
    X_unanswered = unanswered_data[[PREDICTOR1, PREDICTOR2]]
    probs = model.predict_proba(X_unanswered)[:, 1]
    uncertainties = calculate_uncertainty(probs)

    # check if remaining samples meet certainty criteria
    if np.all(uncertainties <= (1 - MODEL_CERTAINTY_CUTOFF)) and iteration >= MIN_ITERATIONS:
        print(f"All remaining samples meet certainty threshold {MODEL_CERTAINTY_CUTOFF} after {iteration} iterations.")
        break

    if CLEANSER_FREQUENCY > 0 and (iteration - INIT_RANDOM_SAMPLES + 1) % CLEANSER_FREQUENCY == 0:
        # select the most certain sample (lowest uncertainty) as the cleanser
        selected_uncertainty = uncertainties.min()
        print(f"Iteration {iteration}: CLEANSER - selecting most certain sample.")
    else:
        # select the most uncertain sample (highest uncertainty)
        selected_uncertainty = uncertainties.max()

    # ties are broken at random
    candidates = unanswered_data[uncertainties == selected_uncertainty]
    selected_sample = candidates.sample(1)

    # get virtual agent response
    filename = selected_sample[FILENAME_COL].values[0]
    answer = virtual_agent(filename, PARTICIPANT_TO_MODEL)

    # update dataframe
    selected_rows = data[FILENAME_COL] == filename
    data.loc[selected_rows, TARGET] = answer
    data.loc[selected_rows, 'answered'] = True

    iteration += 1
    # report the uncertainty of the sample actually chosen; the maximum is
    # not computed on cleanser iterations
    print(f"Iteration {iteration}: Selected {filename} with uncertainty {selected_uncertainty:.3f}")


# ======================
# UPDATED FINAL RESULTS
# ======================

runtime = time.time() - start_time
answered_count = data['answered'].sum()
total_samples = len(data)

print("\n=== Experiment Results ===")
print(f"Total runtime: {runtime:.2f} seconds")
print(f"Answered samples: {answered_count}/{total_samples}")

# load TRUE answers from participant's lookup table
participant_path = os.path.join(PARTICIPANT_CSV_DIR, f"{PARTICIPANT_TO_MODEL}.csv")
true_answers = pd.read_csv(participant_path)[[FILENAME_COL, 'answer_batch']]
true_answers = true_answers.rename(columns={'answer_batch': 'true_answer'})
# keep only the first row per filename - the same row virtual_agent answers
# from - so the left merge below cannot multiply rows of `data`
true_answers = true_answers.drop_duplicates(subset=FILENAME_COL, keep='first')

# merge true answers with our data; filenames missing from the lookup table
# end up with NaN in 'true_answer'
data = data.merge(true_answers, on=FILENAME_COL, how='left')
data['true_answer'] = data['true_answer'].map(LABEL_MAPPING)  # convert to numerical

if answered_count > 0:
    answered_mask = data['answered']
    features = data[[PREDICTOR1, PREDICTOR2]]

    # train final model on every labelled sample
    final_model = LogisticRegression()
    X_final = features[answered_mask]
    y_final = data.loc[answered_mask, TARGET]
    final_model.fit(X_final, y_final)

    # add predictions to dataframe
    data['prediction'] = final_model.predict(features)
    data['certainty'] = np.max(final_model.predict_proba(features), axis=1)

    # calculate performance metrics
    print("\n=== Model Performance ===")

    # rows whose true answer is NaN (filename absent from the lookup table,
    # or a label outside LABEL_MAPPING) cannot be scored and would make the
    # metric functions raise, so they are left out of the comparison
    has_truth = data['true_answer'].notna()
    unscorable = int((~has_truth).sum())
    if unscorable:
        print(f"NOTE: {unscorable} samples without a usable lookup-table answer are excluded from the metrics")

    # on all samples (model predictions vs ground truth)
    y_true_all = data.loc[has_truth, 'true_answer']
    y_pred_all = data.loc[has_truth, 'prediction']

    print("\n[All Samples vs Lookup Table]")
    if has_truth.any():
        print(f"Overall Accuracy:  {accuracy_score(y_true_all, y_pred_all):.3f}")
        print(f"Precision: {precision_score(y_true_all, y_pred_all):.3f}")
        print(f"Recall:    {recall_score(y_true_all, y_pred_all):.3f}")
        print(f"F1 Score:  {f1_score(y_true_all, y_pred_all):.3f}")
        print("Confusion Matrix (True vs Predicted):")
        print(confusion_matrix(y_true_all, y_pred_all))
    else:
        print("No samples with a usable lookup-table answer - metrics skipped")

    # unanswered sample statistics
    if answered_count < total_samples:
        unanswered_mask = ~data['answered']
        print("\n[Unanswered Samples]")
        print(f"Model Certainty Mean: {data[unanswered_mask]['certainty'].mean():.3f}")
        print(f"Model Certainty Min:  {data[unanswered_mask]['certainty'].min():.3f}")
        # accuracy is only defined over unanswered rows that have a true answer
        scorable_unanswered = unanswered_mask & has_truth
        if scorable_unanswered.any():
            unanswered_accuracy = accuracy_score(
                data.loc[scorable_unanswered, 'true_answer'],
                data.loc[scorable_unanswered, 'prediction'],
            )
            print(f"Prediction Accuracy:  {unanswered_accuracy:.3f}")

    # generate visualization now that we have a model
    answered_data = data[data['answered']]
    unanswered_data = data[~data['answered']]
    plot_results(answered_data, unanswered_data, final_model)

else:
    print("No samples answered - no model trained")

# save processed data with predictions; an empty PROCESSED_PATH disables it
if PROCESSED_PATH:
    # use the PROCESSED_PATH constant - no lowercase `processed_path` exists
    data.to_csv(PROCESSED_PATH, index=False)
    print(f"\nProcessed data with predictions saved to {PROCESSED_PATH}")
else:
    print("\nProcessed data with predictions not saved - PROCESSED_PATH is empty")