In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [2]:
import json
with open('train.json', 'r') as file:
    data = json.load(file)

In [5]:
data[0]['tutor_responses']['Sonnet']

{'response': "Great, you've correctly identified the cost of the meat, now let's focus on calculating the total cost of meat for all the sandwiches needed.",
 'annotation': {'Mistake_Identification': 'Yes',
  'Mistake_Location': 'Yes',
  'Providing_Guidance': 'Yes',
  'Actionability': 'Yes'}}

In [6]:
data_train = []
for i in range(0,len(data)):
    for j in data[i]['tutor_responses']:
        data_train.append({"conversation_history": data[i]['conversation_history'],
        "response": data[i]['tutor_responses'][j]['response'],
         "annotation":{"Mistake_Location":data[i]['tutor_responses'][j]['annotation']['Mistake_Location']}})

In [11]:
# Sample data (replace with your full dataset)
"""data = [
    {
        "conversation_history": "Tutor: Please explain... Student: ...",
        "annotation": {"Mistake_Location": "Yes"}
    },
    {
        "conversation_history": "Tutor: Can you try again... Student: ...",
        "annotation": {"Mistake_Location": "No"}
    },
    # Add more examples...
]
"""
# Convert to DataFrame
df = pd.DataFrame(data_train)
label_map = {
    "No": 0,
    "Yes": 1,
    "To some extent": 2  # or whatever the third label is
}

df["label"] = df["annotation"].apply(lambda x: label_map.get(x["Mistake_Location"], -1))



In [12]:
df

Unnamed: 0,conversation_history,response,annotation,label
0,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",{'Mistake_Location': 'Yes'},1
1,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,{'Mistake_Location': 'To some extent'},2
2,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",{'Mistake_Location': 'Yes'},1
3,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",{'Mistake_Location': 'Yes'},1
4,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,{'Mistake_Location': 'Yes'},1
...,...,...,...,...
2062,Tutor: We can compare the ratios more easily i...,"That's close, but the correct decimal value of...",{'Mistake_Location': 'Yes'},1
2063,Tutor: We can compare the ratios more easily i...,"Tutor: I see where you're going, but when we c...",{'Mistake_Location': 'Yes'},1
2064,Tutor: We can compare the ratios more easily i...,"I appreciate your attempt, but let's work toge...",{'Mistake_Location': 'Yes'},1
2065,Tutor: We can compare the ratios more easily i...,Let's try that again. To convert 2/3 to a deci...,{'Mistake_Location': 'Yes'},1


In [13]:
# Combine the input columns
def combine_inputs(row):
    return f"CONVERSATION: {row['conversation_history']} RESPONSE: {row['response']}"

df["text"] = df.apply(combine_inputs, axis=1)

# Train/test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)


In [15]:
train_texts[0]

"CONVERSATION: Tutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Every month, Madeline has to buy food, treats, and medicine for her dog. Food costs $25 per week. Treats cost $20 per month. Medicine costs $100 per month. How much money does Madeline spend on her dog per year if there are 4 weeks in a month?\xa0\n\xa0Student: In a month, Madeline spends 4 x $25 = $100 on food for her dog.\nIn a year, Madeline spends 12 x $100 = $1200 on medicine for her dog.\nIn a year, Madeline spends 12 x $20 = $240 on treats for her dog.\nTherefore, Madeline spends a total of $100 + $1200 + $240 = $1540 on her dog per year.\n\xa01540\xa0\n\xa0Tutor: Stephanie, please look at your workings out for the food amount. is your answer relevant to cost per year or month?\xa0\n\xa0Student: Oh, I see. My answer is relevant to cost per month, not per year. So, I should multiply the cost per month by 12 to get the cost per year.\xa0\n\xa0Tutor: brilliant, what w

In [18]:
# Tokenize
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Dataset class
class MistakeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = MistakeDataset(train_encodings, train_labels)
val_dataset = MistakeDataset(val_encodings, val_labels)

# Load model for 3-class classification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3, from_tf=True)

# Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train
trainer.train()


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


OSError: roberta-base does not appear to have a file named pytorch_model.bin but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.

In [19]:

# Combine the conversation and response columns
def combine_inputs(row):
    return f"CONVERSATION: {row['conversation_history']} RESPONSE: {row['response']}"

df["text"] = df.apply(combine_inputs, axis=1)

# Train/test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# Load tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)

# Dataset class
class MistakeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = MistakeDataset(train_encodings, train_labels)
val_dataset = MistakeDataset(val_encodings, val_labels)

# Load the model for 3-class classification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Check if multiple GPUs are available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# If multiple GPUs are available, use DataParallel
if torch.cuda.device_count() > 1:
    print(f"Training on {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)

model.to(device)

# Define metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate after each epoch
    per_device_train_batch_size=2,  # Decrease batch size for memory
    per_device_eval_batch_size=2,   # Decrease batch size for memory
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",  # Log directory
    logging_steps=10,
    save_strategy="epoch",  # Save model at each epoch
    fp16=True,  # Use mixed precision to save memory and speed up training
    no_cuda=False,  # Make sure it runs on GPU
    dataloader_num_workers=4,  # Increase for faster data loading if using a multi-core CPU
    report_to="none",  # Disable reporting to platforms like wandb if not needed
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# Save the model
model.save_pretrained("./mistake_detection_model")
tokenizer.save_pretrained("./mistake_detection_model")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


OSError: bert-base-uncased does not appear to have a file named pytorch_model.bin but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.