# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from playsound import playsound
import os
import time

# ======================
# CONSTANTS
# ======================

# Column names used throughout the script.
PREDICTOR1 = "voicing"       # first feature column fed to the classifier
PREDICTOR2 = "duration"      # second feature column fed to the classifier
TARGET = "answer"            # column that stores the participant's label
FILENAME_COL = "filename"    # column holding the audio file name of each row

# Typed participant response -> numeric class label stored in TARGET.
LABEL_MAPPING = dict(s=0, z=1)

# Input table, folder the audio files are looked up in, and output table.
DATA_PATH = "data/data.csv"
AUDIO_FOLDER = "data/audio"
PROCESSED_PATH = "data_processed.csv"

# Number of randomly chosen samples to label before active learning starts.
INIT_RANDOM_SAMPLES = 10
# The iteration counter must reach this value before the certainty-based
# stop of the active-learning loop is allowed to trigger.
MIN_ITERATIONS = 20
# Active learning stops once every unanswered sample has an uncertainty of
# at most (1 - MODEL_CERTAINTY_CUTOFF).
MODEL_CERTAINTY_CUTOFF = 0.95


# ======================
# DATA LOADING
# ======================

# Read the sample table; every later step reads and updates this frame.
data = pd.read_csv(DATA_PATH)

# Bookkeeping columns and the value each is created with when the CSV does
# not already provide it. The order is kept so new columns are appended in
# the same sequence ('answered' first, then TARGET).
for _column, _default in (('answered', False), (TARGET, np.nan)):
    if _column not in data.columns:
        data[_column] = _default


# ======================
# HELPER FUNCTIONS
# ======================

def get_human_response(filename):
    """Play one audio file and ask the participant to classify it.

    The file is looked up inside AUDIO_FOLDER. The sound is played after the
    participant presses Enter, and the prompt/play cycle repeats until the
    typed response is one of the keys of LABEL_MAPPING.

    Args:
        filename: Name of the audio file, relative to AUDIO_FOLDER.

    Returns:
        The numeric label LABEL_MAPPING assigns to the typed response, or
        None when the file does not exist or playback raises an error.
    """
    audio_path = os.path.join(AUDIO_FOLDER, filename)
    if not os.path.exists(audio_path):
        print(f"Missing file: {audio_path}. Skipping.")
        return None

    label = None
    while label is None:
        input(f"\nReady to hear the sound '{filename}'? Press Enter to play...")
        try:
            playsound(audio_path)
        except Exception as err:
            # Playback problems are reported and the sample is given up on.
            print(f"Error playing sound: {err}")
            return None

        typed = input("Enter your response ('s' or 'z'): ").strip().lower()
        label = LABEL_MAPPING.get(typed)
        if label is None:
            # Unknown key: the loop replays the sound and asks again.
            print("Invalid input. Please enter 's' or 'z'.")

    return label

def calculate_uncertainty(probs):
    """Return the uncertainty of binary class probabilities.

    The confidence of a prediction is the larger of ``probs`` and
    ``1 - probs``; the uncertainty is one minus that confidence, so it is
    0 for probabilities of 0 or 1 and peaks at 0.5 for a probability of 0.5.

    Args:
        probs: Probability (scalar or NumPy array) of one of the two classes.

    Returns:
        Uncertainty value(s) with the same shape as ``probs``.
    """
    confidence = np.maximum(probs, 1 - probs)
    return 1 - confidence

def plot_results(answered_data, unanswered_data, model):
    """Plot the samples on top of the model's predicted-probability surface.

    Answered samples are coloured by their label, unanswered samples are
    drawn in gray, and a filled contour shows ``model.predict_proba`` over a
    grid spanning the feature ranges of the module-level ``data`` frame.

    Args:
        answered_data: DataFrame of labelled rows with the PREDICTOR1,
            PREDICTOR2 and TARGET columns. May be empty.
        unanswered_data: DataFrame of unlabelled rows with the PREDICTOR1
            and PREDICTOR2 columns. May be empty.
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is plotted.
    """
    plt.figure(figsize=(10, 6))

    # Stays None when there is nothing labelled to draw, so the colorbar
    # below can fall back to the contour plot instead of raising NameError.
    scatter = None
    has_legend_entries = False

    if not answered_data.empty:
        # Labels stored as 's'/'z' strings are converted to 0/1 on a copy so
        # the caller's frame is left untouched.
        if answered_data[TARGET].dtype == 'object':
            answered_data = answered_data.copy()
            answered_data[TARGET] = answered_data[TARGET].map(LABEL_MAPPING)

        scatter = plt.scatter(
            answered_data[PREDICTOR1],
            answered_data[PREDICTOR2],
            c=answered_data[TARGET],
            cmap='coolwarm',
            edgecolors='k',
            label='Answered',
            vmin=0, vmax=1
        )
        has_legend_entries = True

    if not unanswered_data.empty:
        plt.scatter(
            unanswered_data[PREDICTOR1],
            unanswered_data[PREDICTOR2],
            c='gray', alpha=0.5, label='Unanswered'
        )
        has_legend_entries = True

    # The grid covers the full data set (not just the rows passed in), padded
    # by one unit on every side.
    x_min, x_max = data[PREDICTOR1].min() - 1, data[PREDICTOR1].max() + 1
    y_min, y_max = data[PREDICTOR2].min() - 1, data[PREDICTOR2].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    # A DataFrame with the training column names is used for the grid so the
    # model receives the same feature names it was fitted with.
    grid_points = pd.DataFrame(
        np.c_[xx.ravel(), yy.ravel()],
        columns=[PREDICTOR1, PREDICTOR2]
    )
    Z = model.predict_proba(grid_points)[:, 1].reshape(xx.shape)

    surface = plt.contourf(xx, yy, Z, alpha=0.3, levels=20, cmap='coolwarm')

    # Prefer the (opaque) scatter as the colorbar source, as before; use the
    # probability surface when no answered samples were plotted.
    colorbar_source = scatter if scatter is not None else surface
    plt.colorbar(colorbar_source, label='Predicted Probability')

    plt.xlabel(PREDICTOR1)
    plt.ylabel(PREDICTOR2)
    plt.title('Human Experiment Results')
    # Skip the legend when nothing carries a label.
    if has_legend_entries:
        plt.legend()
    plt.show()


# ======================
# EXPERIMENT EXECUTION
# ======================

# Wall-clock start of the experiment; the elapsed time is reported as the
# runtime once all labelling has finished.
start_time = time.time()

# Announce the first phase: labelling randomly chosen samples.
print("Starting initial random sampling...")

def collect_sample():
    """Ask the participant to label one randomly chosen unanswered sample.

    Unanswered rows of the module-level ``data`` frame are tried in random
    order. A row whose audio file is missing or cannot be played is skipped
    and the next one is tried, so that a single bad file does not look like
    "no samples left" to the caller. The first response obtained is stored
    in TARGET and the row is flagged as answered.

    Returns:
        bool: True when a label was recorded; False when there are no
        unanswered rows or none of them produced a response.
    """
    unanswered = data[~data['answered']]
    if unanswered.empty:
        return False

    # sample(frac=1) yields every unanswered row exactly once, shuffled.
    for filename in unanswered.sample(frac=1)[FILENAME_COL]:
        answer = get_human_response(filename)
        if answer is None:
            continue

        matching_rows = data[FILENAME_COL] == filename
        data.loc[matching_rows, TARGET] = answer
        data.loc[matching_rows, 'answered'] = True
        return True

    return False

# Phase 1: label INIT_RANDOM_SAMPLES randomly chosen samples, stopping early
# if collect_sample() reports that nothing more could be collected.
samples_collected = 0
while samples_collected < INIT_RANDOM_SAMPLES and collect_sample():
    samples_collected += 1

answered_data = data[data['answered']]
unique_classes = answered_data[TARGET].dropna().unique()

# The classifier needs examples of both classes: keep sampling at random
# until a second class shows up or the unanswered pool is exhausted.
while len(unique_classes) < 2 and not data[~data['answered']].empty:
    if not collect_sample():
        break
    answered_data = data[data['answered']]
    unique_classes = answered_data[TARGET].dropna().unique()

if len(unique_classes) < 2:
    print("WARNING: Only one class after initial sampling. Exiting.")
    # exit() is a helper the site module adds for interactive sessions and is
    # not meant for programs; raising SystemExit is the portable way to stop
    # (same exit status as the bare exit() call it replaces).
    raise SystemExit

print("\nStarting active learning phase...")
# Start counting from the labels actually collected so far: the initial phase
# may have gathered more (to find a second class) or fewer (pool exhausted)
# than INIT_RANDOM_SAMPLES.
iteration = int(data['answered'].sum())

# Files that could not be presented (missing or unplayable). They are left
# out of the candidate pool; otherwise the unchanged model would pick the
# same most-uncertain sample again and the loop would never advance.
skipped_files = set()

# Active learning loop
while True:
    # Refit the model on everything labelled so far.
    answered_data = data[data['answered']]
    X_train = answered_data[[PREDICTOR1, PREDICTOR2]]
    y_train = answered_data[TARGET]

    model = LogisticRegression()
    model.fit(X_train, y_train)

    unanswered_data = data[~data['answered'] & ~data[FILENAME_COL].isin(skipped_files)]
    if unanswered_data.empty:
        break

    X_unanswered = unanswered_data[[PREDICTOR1, PREDICTOR2]]
    probs = model.predict_proba(X_unanswered)[:, 1]
    uncertainties = calculate_uncertainty(probs)

    # Stop AL phase but continue data collection
    if np.all(uncertainties <= (1 - MODEL_CERTAINTY_CUTOFF)) and iteration >= MIN_ITERATIONS:
        print(f"\nCertainty threshold reached after {iteration} iterations.")
        print("Continuing to label remaining data for evaluation...\n")
        break

    # Query the sample the model is least sure about; ties are broken at
    # random.
    max_uncertainty = uncertainties.max()
    candidates = unanswered_data[uncertainties == max_uncertainty]
    selected_sample = candidates.sample(1)
    filename = selected_sample[FILENAME_COL].values[0]
    answer = get_human_response(filename)

    if answer is None:
        skipped_files.add(filename)
        continue

    data.loc[data[FILENAME_COL] == filename, TARGET] = answer
    data.loc[data[FILENAME_COL] == filename, 'answered'] = True
    iteration += 1
    print(f"Iteration {iteration}: Selected {filename} with uncertainty {max_uncertainty:.3f}")

# Label remaining samples. The filenames are selected once, before the loop
# starts, so the updates made to `data` inside it do not affect iteration.
remaining_files = data.loc[~data['answered'], FILENAME_COL]
for filename in remaining_files:
    answer = get_human_response(filename)
    if answer is None:
        # No response (missing file or playback error): leave it unanswered.
        continue
    row_mask = data[FILENAME_COL] == filename
    data.loc[row_mask, TARGET] = answer
    data.loc[row_mask, 'answered'] = True

# Final Model Training & Evaluation
runtime = time.time() - start_time
answered_count = data['answered'].sum()
total_samples = len(data)

print("\n=== Experiment Complete ===")
print(f"Runtime: {runtime:.2f}s")
print(f"Answered: {answered_count}/{total_samples}")

# Rows whose audio was missing or unplayable are still unanswered and carry
# a NaN label, which scikit-learn refuses to fit or score. Train and evaluate
# on the answered rows only; predictions are still produced for every row.
answered_mask = data['answered']
labelled = data[answered_mask]

final_model = LogisticRegression()
X_final = data[[PREDICTOR1, PREDICTOR2]]
y_final = labelled[TARGET]
final_model.fit(labelled[[PREDICTOR1, PREDICTOR2]], y_final)

data['prediction'] = final_model.predict(X_final)
data['certainty'] = final_model.predict_proba(X_final).max(axis=1)

# Predictions restricted to the rows that have a participant answer.
y_pred = data.loc[answered_mask, 'prediction']

print("\n=== Model Performance (on participant answers) ===")
print(f"Accuracy:  {accuracy_score(y_final, y_pred):.3f}")
print(f"Precision: {precision_score(y_final, y_pred):.3f}")
print(f"Recall:    {recall_score(y_final, y_pred):.3f}")
print(f"F1 Score:  {f1_score(y_final, y_pred):.3f}")
print("Confusion Matrix:")
print(confusion_matrix(y_final, y_pred))

# Plot and save. Unanswered rows (if any) are passed along so they show up
# in gray; when everything was answered this frame is empty.
plot_results(labelled, data[~answered_mask], final_model)

if PROCESSED_PATH:
    data.to_csv(PROCESSED_PATH, index=False)
    print(f"\nData saved to {PROCESSED_PATH}")