In [1]:
import numpy as np
def bootstrap_ci(y_test, y_pred, calculate_metric, confidence=0.9, n_resamples=1000, sample_fraction=0.95, random_state=None):
    """
    Compute bootstrap confidence interval for a metric function.
    Parameters:
    - y_test: array-like, true labels.
    - y_pred: array-like, predicted labels.
    - calculate_metric: function, computes the metric given y_test and y_pred.
    - confidence: float, confidence level (default 0.9).
    - n_resamples: int, number of bootstrap resamples (default 100).
    - sample_fraction: float, fraction of data to sample in each resample (default 0.95).
    - random_state: int or None, random seed for reproducibility.
    Returns:
    - (lower_bound, upper_bound): tuple of floats, confidence interval bounds.
    """
    rng = np.random.default_rng(random_state)
    n = len(y_test)
    sample_size = int(n * sample_fraction)
    metrics = []
    for _ in range(n_resamples):
        indices = rng.choice(n, size=sample_size, replace=True)
        metric = calculate_metric(np.array(y_test)[indices], np.array(y_pred)[indices])
        metrics.append(metric)
    alpha = (1 - confidence) / 2
    lower = np.percentile(metrics, 100 * alpha)
    upper = np.percentile(metrics, 100 * (1 - alpha))
    return {"metric": round(calculate_metric(y_test, y_pred), 4), "confidence_interval": (round(lower, 4), round(upper, 4))}

# def calculate_metric(y_true, y_pred):
#     # Example: Mean Absolute Error
#     return np.mean(np.abs(np.array(y_true) - np.array(y_pred)))
# y_test = [1, 2, 3, 4, 5]
# y_pred = [1.1, 1.9, 3.2, 3.8, 5.1]

# print(bootstrap_ci(y_test, y_pred, calculate_metric, confidence=0.9))

# def accuracy_metric(y_true, y_pred):
#     return np.mean(y_true == y_pred)
# bootstrap_ci(true_labels, preds, accuracy_metric)

## Retrain with some edits 

In [3]:
import pandas as pd
import torch
from transformers import LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your dataset and drop rows with missing responses
df = pd.read_csv("./mrbench_v3_devset_train_data.csv").dropna(subset=["response"])

# Convert string labels to integer indices
# Create mapping dictionaries for encoding and later decoding
label_list = sorted(df["label"].unique())
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Replace string labels with integer labels in the DataFrame
# Split by conversation_id
conversation_ids = df["conversation_id"].unique()
train_ids, test_ids = train_test_split(conversation_ids, test_size=0.2, random_state=42)


train_df = df[df["conversation_id"].isin(train_ids)].reset_index(drop=True)
test_df = df[df["conversation_id"].isin(test_ids)].reset_index(drop=True)


train_texts = [f"{c} </s> <s> {r} </s>" for c, r in zip(train_df["conversation_history"].tolist(), train_df["response"].tolist())]
test_texts = [f"{c} </s> <s> {r} </s>" for c, r in zip(train_df["conversation_history"].tolist(), test_df["response"].tolist())]

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_df["label"].tolist()})
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_df["label"].tolist()})

# Initialize tokenizer & model
model_name = "allenai/longformer-base-4096"
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(label_list),
    # attention_probs_dropout_prob=0.2,
    # hidden_dropout_prob=0.2
)
model.to(device)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=1024
    )

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Compute metrics function for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro"),
    }

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluates at the end of each epoch
    save_strategy="epoch",          # saves checkpoint at each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    # dropout=0.3
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
torch.cuda.empty_cache()
trainer.train()

# Evaluate the model
predictions = trainer.predict(tokenized_test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

# Generate detailed classification report (using integer labels)
print("Detailed Classification Report (Integer Labels):")
print(classification_report(true_labels, preds, digits=4))

# Generate classification report with target names (map back to original string labels)
target_names = [id_to_label[i] for i in range(len(label_list))]
print("Detailed Classification Report (String Labels):")
print(classification_report(true_labels, preds, target_names=target_names, digits=4))

# --- Save the Fine-tuned Model ---
# Before saving, update the model configuration with label mappings so that id2label and label2id
# are stored in the saved config.json. This is necessary because categorical labels are not handled by default.
model.config.id2label = id_to_label
model.config.label2id = label_to_id

# Save the model and tokenizer; the config will be saved automatically
save_directory = "./saved_model"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Using device: cuda


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1652 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

  trainer = Trainer(


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [9]:
import json
with open("./mrbench_v3_testset.json", "r") as f:
    test_set_json = json.load(f)

data = []
for entry in test_set_json:
    for key in entry["tutor_responses"]:
        data.append([entry["conversation_id"], entry["conversation_history"], key, entry["tutor_responses"][key]["response"]])
df = pd.DataFrame(data, columns=["conversation_id", "conversation_history", "tutor_id", "tutor_response"])

with open("./saved_model/config.json", "r") as f:
    id2label = json.load(f)["id2label"]

# Now make Hugging Face Dataset objects manually
test_texts = [f"{c} </s> <s> {r} </s>" for c, r in zip(df["conversation_history"].tolist(), df["tutor_response"].tolist())]
test_dataset = Dataset.from_dict({"text": test_texts, "label": [0] * len(test_texts)})
print(test_texts[:2])

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=1024
    )

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Run predictions
predictions = trainer.predict(tokenized_test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# Map prediction indices to labels
pred_labels = [id2label[str(label)] for label in preds]

# Add predictions to the DataFrame for inspection
df["predicted_label"] = pred_labels

# df.groupby(["conversation_id", "conversation_history"]).apply(lambda x: )

with open("./saved_model/predictions.json", "w") as f:
    json.dump(df.groupby(["conversation_id", "conversation_history"], sort=False).apply(
        lambda group: {
            "conversation_id": group.name[0],
            "conversation_history": group.name[1],
            "tutor_responses": {
                f"Tutor_{i+1}": {
                    "response": row["tutor_response"],
                    "annotation": {"Tutor_Identification": row["predicted_label"]}
                }
                for i, row in enumerate(group.to_dict("records"))
            }
        }
    ).tolist(), f)

["Tutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Tyson decided to make muffaletta sandwiches for the big game.  Each sandwich required 1 pound each of meat and cheese and would serve 4 people.  There would be 20 people in total watching the game.  The meat cost $7.00 per pound and the cheese cost $3.00 per pound.  How much money would he spend on the meat and cheese to make enough sandwiches to serve 20 people? \n Student: To serve 20 people, Tyson needs to make 20/4 = 5 sandwiches.\nEach sandwich requires 1+1 = 2 pounds of meat and cheese.\nFor 5 sandwiches, he needs a total of 2 x 5 = 10 pounds of meat and cheese.\nThe cost of 10 pounds of meat is 10 x $7.00 = $70.\nThe cost of 10 pounds of cheese is 10 x $3.00 = $30.\nThe total cost of meat and cheese is $70 + $30 = $100.\n 100 \n Tutor: do you want to talk me through your solution \n Student: Yes I think my answer is correct. I used 10 pounds of meat and 10 pounds of cheese so I

Map: 100%|██████████| 1547/1547 [00:04<00:00, 347.03 examples/s]


  json.dump(df.groupby(["conversation_id", "conversation_history"], sort=False).apply(


## Inference on the unlabeled test set and submission (first run)

In [None]:
import json
import pandas as pd
import torch
from transformers import LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

with open("./ACL/mrbench_v3_testset.json", "r") as f:
    test_set_json = json.load(f)

data = []
for entry in test_set_json:
    for key in entry["tutor_responses"]:
        data.append([entry["conversation_id"], entry["conversation_history"], key, entry["tutor_responses"][key]])
df = pd.DataFrame(data, columns=["conversation_id", "conversation_history", "tutor_id", "tutor_response"])

    # {
    #     "conversation_id": "221-362eb11a-f190-42a6-b2a4-985fafdcfa9e",
    #     "conversation_history": "Tutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Tyson decided to make muffaletta sandwiches for the big game.  Each sandwich required 1 pound each of meat and cheese and would serve 4 people.  There would be 20 people in total watching the game.  The meat cost $7.00 per pound and the cheese cost $3.00 per pound.  How much money would he spend on the meat and cheese to make enough sandwiches to serve 20 people? \n Student: To serve 20 people, Tyson needs to make 20/4 = 5 sandwiches.\nEach sandwich requires 1+1 = 2 pounds of meat and cheese.\nFor 5 sandwiches, he needs a total of 2 x 5 = 10 pounds of meat and cheese.\nThe cost of 10 pounds of meat is 10 x $7.00 = $70.\nThe cost of 10 pounds of cheese is 10 x $3.00 = $30.\nThe total cost of meat and cheese is $70 + $30 = $100.\n 100 \n Tutor: How many pounds of meat are needed for each sandwich? \n Student: Each sandwich requires 1 pound of meat and 1 pound of cheese. \n Tutor: What is the cost of 1 pound of meat? \n Student: The cost of 1 pound of meat is $7.00.",
    #     "tutor_responses": {
    #         "Sonnet": {
    #             "response": "Great, you've correctly identified the cost of the meat, now let's focus on calculating the total cost of meat for all the sandwiches needed.",
    #             "annotation": {
    #                 "Mistake_Identification": "Yes",
    #                 "Mistake_Location": "Yes",
    #                 "Providing_Guidance": "Yes",
    #                 "Actionability": "Yes"
    #             }
    #         },
    #         "Llama318B": {
    #             "response": "Now that we know the cost of 1 pound of meat is $7.00, we can use this information to find the correct total cost for the meat and cheese needed for the muffaletta sandwiches.",
    #             "annotation": {
    #                 "Mistake_Identification": "Yes",
    #                 "Mistake_Location": "To some extent",
    #                 "Providing_Guidance": "To some extent",
    #                 "Actionability": "To some extent"
    #             }
    #         },
    #         "Llama31405B": {
    #             "response": "You're close, but I notice that you calculated the cost of 10 pounds of meat, when actually 5 pounds of meat are needed for 5 sandwiches, since each sandwich requires 1 pound of meat; can you recalculate the cost of the meat?",
    #             "annota


# Convert string labels to integer indices
# Create mapping dictionaries for encoding and later decoding

model_name = "./ACL/saved_model"
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerForSequenceClassification.from_pretrained(model_name, num_labels=9)
model.to("cuda")

with open("./ACL/saved_model/config.json", "r") as f:
    id2label = json.load(f)["id2label"]

# Now make Hugging Face Dataset objects manually
test_texts = [f"Conversation: {c}\n\nResponse: {r}" for c, r in zip(df["conversation_history"].tolist(), df["tutor_response"].tolist())]
test_dataset = Dataset.from_dict({"text": test_texts, "label": [0] * len(test_texts)})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=4096
    )

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])



# Initialize Trainer
trainer = Trainer(model=model, args=TrainingArguments(output_dir="./results_eval", per_device_eval_batch_size=2, report_to="none"))

# Run predictions
predictions = trainer.predict(tokenized_test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# Map prediction indices to labels
pred_labels = [id2label[str(label)] for label in preds]

# Add predictions to the DataFrame for inspection
df["predicted_label"] = pred_labels

# df.groupby(["conversation_id", "conversation_history"]).apply(lambda x: )

with open("./ACL/predictions.json", "w") as f:
    json.dump(df.groupby(["conversation_id", "conversation_history"], sort=False).apply(
        lambda group: {
            "conversation_id": group.name[0],
            "conversation_history": group.name[1],
            "tutor_responses": {
                f"Tutor_{i+1}": {
                    "response": row["tutor_response"]["response"],
                    "annotation": {"Tutor_Identification": row["predicted_label"]}
                }
                for i, row in enumerate(group.to_dict("records"))
            }
        }
    ).tolist(), f)