In [1]:
# pip install --upgrade torch transformers accelerate bitsandbytes

import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSequenceClassification  # Gemma3ForCausalLM

import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ExponentialLR

device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

torch.cuda.empty_cache()

In [2]:
import json
with open('train.json', 'r') as file:
    data = json.load(file)

In [3]:
import json
with open('test.json', 'r') as file:
    data1 = json.load(file)

In [4]:
data_train = []
for i in range(0,len(data)):
    for j in data[i]['tutor_responses']:
        data_train.append({"conversation_history": data[i]['conversation_history'],
        "response": data[i]['tutor_responses'][j]['response'], "annotation":{"Mistake_Location":data[i]['tutor_responses'][j]['annotation']['Mistake_Location']}})

In [5]:
data_train1 = []
for i in range(0,len(data1)):
    for j in data1[i]['tutor_responses']:
        data_train1.append({"conversation_history": data1[i]['conversation_history'],
        "response": data1[i]['tutor_responses'][j]['response'],
         "annotation":{"Mistake_Location":data1[i]['tutor_responses'][j]['annotation']['Mistake_Location']}})

In [6]:
train_df = pd.DataFrame(data_train)
label_map = {
    "No": 0,
    "Yes": 1,
    "To some extent": 2  # or whatever the third label is
}

train_df["label"] = train_df["annotation"].apply(lambda x: label_map.get(x["Mistake_Location"], -1))


In [7]:
test_df = pd.DataFrame(data_train1)
test_df["label"] = test_df["annotation"].apply(lambda x: label_map.get(x["Mistake_Location"], -1))


In [8]:
test_df

Unnamed: 0,conversation_history,response,annotation,label
0,Student: 6 \n Tutor: Excellent answer! \n Tuto...,"It sounds like you're thinking of a ""polygon,""...",{'Mistake_Location': 'No'},0
1,Student: 6 \n Tutor: Excellent answer! \n Tuto...,"That's a good try! Actually, the name of the s...",{'Mistake_Location': 'Yes'},1
2,Student: 6 \n Tutor: Excellent answer! \n Tuto...,"Great effort! It's actually spelled ""polygon,""...",{'Mistake_Location': 'No'},0
3,Student: 6 \n Tutor: Excellent answer! \n Tuto...,"That's a great try, but the correct spelling i...",{'Mistake_Location': 'No'},0
4,Student: 6 \n Tutor: Excellent answer! \n Tuto...,That is a good try.,{'Mistake_Location': 'No'},0
...,...,...,...,...
404,"Tutor: Hi, could you please provide a step-by-...","I appreciate your reasoning, but let's take a ...",{'Mistake_Location': 'Yes'},1
405,"Tutor: Hi, could you please provide a step-by-...","That's true, but if there are 50 this week, ho...",{'Mistake_Location': 'No'},0
406,"Tutor: Hi, could you please provide a step-by-...",That's a great idea to try to figure out the n...,{'Mistake_Location': 'Yes'},1
407,"Tutor: Hi, could you please provide a step-by-...","I see what you're trying to do, but the questi...",{'Mistake_Location': 'Yes'},1


In [9]:
train_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
train_df, test_df = train_test_split(train_df, test_size=0.15, random_state=42)

# df1 = train_df[train_df["actionability"] == "Yes"].sample(n=450)
# df2 = train_df[train_df["actionability"] == "No"].sample(n=400)
# df3 = train_df[train_df["actionability"] == "To some extent"].sample(n=328)
# train_df = pd.concat([df1, df2, df3], axis=0)

train_df.shape

(2104, 4)

In [10]:
train_df.label.value_counts()

1    1303
0     611
2     190
Name: label, dtype: int64

In [11]:
# test_df.conversation[9].split("\n")[-1]
# test_df["response"] = test_df["conversation"].apply(lambda x: x.split("\nTutor: ")[-1])
# train_df["response"] = train_df["conversation"].apply(lambda x: x.split("\nTutor: ")[-1])

In [12]:
def combine_inputs(row):
    return f"CONVERSATION: {row['conversation_history']} RESPONSE: {row['response']}"

train_df["text"] = train_df.apply(combine_inputs, axis=1)
test_df["text"] = test_df.apply(combine_inputs, axis=1)

In [13]:
class ConvoData(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        self.tokenized_data = []

        kwargs = {
            "max_length": 768,
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": "pt"
        }

        for idx in range(len(df)):
            label = df.iloc[idx]["label"]
            resp = df.iloc[idx]["text"]
            tokens = tokenizer(resp, **kwargs)
            self.tokenized_data.append((tokens, label))

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, item):
        return self.tokenized_data[item]

In [14]:
model_id = "microsoft/Phi-4-mini-instruct"
# model_id = "unsloth/Meta-Llama-3.1-8B-Instruct"
# model_id = "unsloth/Qwen2.5-3B-Instruct"
# model_id = "unsloth/phi-4-bnb-4bit"
# model_id = "Qwen/Qwen2.5-7B-Instruct"

quantization_config = BitsAndBytesConfig(load_in_16bit=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=3,
    quantization_config=quantization_config,
    device_map="cuda:3",
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

model.config.pad_token_id = model.config.eos_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-4-mini-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# import matplotlib.pyplot as plt

# ls = train_df["response"].apply(lambda x: len(tokenizer(x).input_ids))
# plt.plot(range(len(ls)), ls)
# plt.show()

#### Training

In [15]:
MODEL_ABB = "phi4_mini_it"
LOG_PATH = "logs/training_log_phi4_less_data.txt"

In [16]:
# from torch.utils.data import WeightedRandomSampler

# labels = np.array(train_df.label)
# class_sample_counts = np.bincount(labels)  # count per class
# class_weights = 1. / class_sample_counts

# sample_weights = class_weights[labels]  # gives weight for each sample

# sampler = WeightedRandomSampler(
#     weights=torch.tensor(sample_weights, dtype=torch.double),
#     num_samples=len(sample_weights),  # total samples per epoch
#     replacement=True
# )

train_data = ConvoData(train_df, tokenizer)

train_dataloader = DataLoader(
    train_data,
    batch_size=4,
    # sampler=sampler
)

# tokenizer.batch_decode(train_data[1][0]["input_ids"])

In [17]:
from sklearn.metrics import f1_score

def eval_on_test(epoch, df, test_flag, write=True):
    model.eval()

    test_data = ConvoData(df, tokenizer)

    test_dataloader = DataLoader(
        test_data,
        batch_size=2,
        shuffle=False
    )

    true = []
    preds = []

    for batch_id, batch in enumerate(test_dataloader):
        input_ids = batch[0]['input_ids'].squeeze(dim=1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(dim=1).to(device)

        with torch.no_grad():
            out = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=False
            )
            out = F.log_softmax(out.logits, dim=-1)

            true += batch[1].tolist()
            preds += out.argmax(dim=-1).tolist()

    if test_flag:
        text = "On test Set:"
    else:
        text = "On sampled train Set:"

    acc = sum([i == j for i, j in zip(preds, true)]) / len(test_data)
    f1 = f1_score(true, preds, average='macro')
    print(text)
    print("Accuracy:", acc)
    print("F1 score:", f1, "\n")
    
    if write:
        with open(LOG_PATH, "a") as file:
            w = f"""
            {text}
            Epoch: {epoch + 1}
            Accuracy: {acc},
            F1 score: {f1}\n
            """
            file.write(w)

    return f1

In [19]:
epochs = 5
accumulation_steps = 1

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = ExponentialLR(optimizer, gamma=0.9)

loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch_id, batch in enumerate(train_dataloader):
        input_ids = batch[0]['input_ids'].squeeze(dim=1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(dim=1).to(device)

        out = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False
        )

        loss = loss_fn(out.logits, batch[1].to(device))
        loss.backward()

        if (batch_id + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if (batch_id + 1) % 25 == 0:
            print(loss.item())

        total_loss += loss.item()

    # If the number of batches isn't divisible by accumulation_steps,
    # do one final step after the loop ends
    if (batch_id + 1) % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

    scheduler.step()

    avg_loss = total_loss / len(train_dataloader)

    print(f"\nEpoch {epoch + 1}")
    print(f"Average Training Loss: {avg_loss:.4f}")

    eval_on_test(epoch, train_df.sample(n=400), test_flag=False)
    f1 = eval_on_test(epoch, test_df, test_flag=True)

    if f1 >= 0.2:
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        }
        torch.save(checkpoint, f"epoch_{epoch + 1}_{MODEL_ABB}_{f1}.pth")

0.6953125
0.29296875
0.94921875
0.2734375
1.0
0.96875
0.72265625
1.609375
1.546875
0.443359375
0.5234375
0.7421875
0.5234375
1.4921875
1.03125
0.466796875
0.984375
0.6171875
0.7890625
0.6328125

Epoch 1
Average Training Loss: 0.8451
On sampled train Set:
Accuracy: 0.6975
F1 score: 0.4554284676833696 

On test Set:
Accuracy: 0.6478494623655914
F1 score: 0.40561237093028996 

0.52734375
0.796875
0.37890625
0.7890625
1.1796875
0.62109375
1.59375
1.4921875
0.5078125
0.423828125
0.6796875
0.435546875
1.46875
0.91796875
0.4375
0.40234375
0.953125
0.50390625
0.765625
0.62109375

Epoch 2
Average Training Loss: 0.7786
On sampled train Set:
Accuracy: 0.745
F1 score: 0.48176507957925035 

On test Set:
Accuracy: 0.6532258064516129
F1 score: 0.41614477237197406 

0.42578125
0.26171875
0.703125
0.41796875
0.73046875
1.171875
1.5
1.4375
0.50390625
0.375
0.51171875
0.41796875
1.328125
0.9609375
0.38671875
0.609375
1.015625
0.578125
0.65234375
0.59765625

Epoch 3
Average Training Loss: 0.7480
On sample

In [17]:
# model.eval()

# test_data = ConvoData(train_df.sample(n=400), tokenizer)

# test_dataloader = DataLoader(
#     test_data,
#     batch_size=4,
#     shuffle=False
# )

# true = []
# preds = []

# for batch_id, batch in enumerate(test_dataloader):
#     input_ids = batch[0]['input_ids'].squeeze(dim=1).to(device)
#     attention_mask = batch[0]['attention_mask'].squeeze(dim=1).to(device)

#     out = model(
#         input_ids=input_ids,
#         attention_mask=attention_mask,
#         output_hidden_states=False
#     )
#     out = F.log_softmax(out.logits, dim=-1)

#     true += batch[1].tolist()
#     preds += out.argmax(dim=-1).tolist()

# acc = sum([i == j for i, j in zip(preds, true)]) / len(test_data)
# f1 = f1_score(true, preds, average='macro')
# print("Accuracy:", acc)
# print("F1 score:", f1)

Accuracy: 0.8525
F1 score: 0.8449993807406342


#### Test

In [20]:
checkpoint = torch.load(f"phi4/epoch_4_phi4_mini_it_5777.pth")

model.load_state_dict(checkpoint['model_state_dict'], strict=False)

print("Ok")

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [23]:
test_df

Unnamed: 0,conversation,actionability,label
1737,"Tutor: Hi, could you please provide a step-by-...",Yes,0
246,"Tutor: Hi, could you please provide a step-by-...",Yes,0
1565,Tutor: We need to subtract total windows from ...,Yes,0
432,"Tutor: Hi, could you please provide a step-by-...",Yes,0
56,"Tutor: Hi, could you please provide a step-by-...",No,1
...,...,...,...
1351,"Tutor: Hi, could you please provide a step-by-...",No,1
1316,"Tutor: Hi, could you please provide a step-by-...",To some extent,2
888,Tutor: Go ahead and solve this question. \n St...,No,1
1083,Tutor: We can compare the ratios more easily i...,No,1


In [43]:
model.eval()

test_data = ConvoData(test_df, tokenizer)

test_dataloader = DataLoader(
    test_data,
    batch_size=4,
    shuffle=False
)

true = []
preds = []

for batch_id, batch in enumerate(test_dataloader):
    input_ids = batch[0]['input_ids'].squeeze(dim=1).to(device)
    attention_mask = batch[0]['attention_mask'].squeeze(dim=1).to(device)

    out = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=False
    )
    out = F.log_softmax(out.logits, dim=-1)

    true += batch[1].tolist()
    preds += out.argmax(dim=-1).tolist()


from sklearn.metrics import f1_score

print(sum([i == j for i, j in zip(preds, true)]) / len(test_data))
print("F1 score:", f1_score(true, preds, average='macro'))

0.0
F1 score: 0.0


In [14]:
# epoch_1_275_phi4_intrm_2.pth    :  0.65463
# epoch_1_275_phi4_intrm.pth      :  0.6727
# epoch_1_phi4_4b_ns_all.pth      :  0.6815
# epoch_2_phi4_intrm_2_174.pth    :

In [35]:
train_df.label.value_counts()

0    1190
1     710
2     328
Name: label, dtype: int64

### Subs

In [5]:
# quantization_config = BitsAndBytesConfig(load_in_16bit=True)

# model = AutoModelForSequenceClassification.from_pretrained(
#     "./ckpt_epoch20_qwen25",
#     num_labels=3,
#     quantization_config=quantization_config,
#     device_map="cuda:3",
#     torch_dtype=torch.bfloat16
# )

In [39]:
model.eval()

test_df = pd.read_csv("real_test.csv")
test_df["label"] = -1

test_data = ConvoData(test_df, tokenizer)

test_dataloader = DataLoader(
    test_data,
    batch_size=4,
    shuffle=False
)

true = []
preds = []

for batch_id, batch in enumerate(test_dataloader):
    input_ids = batch[0]['input_ids'].squeeze(dim=1).to(device)
    attention_mask = batch[0]['attention_mask'].squeeze(dim=1).to(device)

    out = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=False
    )
    out = F.log_softmax(out.logits, dim=-1)

    # true += batch[1].tolist()
    preds += out.argmax(dim=-1).tolist()

test_df["preds"] = preds
test_df.to_csv("sub_bea.csv", index=False)

In [40]:
from collections import Counter

Counter(preds)

Counter({0: 717, 1: 425, 2: 405})

In [30]:
405 / 1547

0.26179702650290887

In [17]:
len(preds)

1547

In [41]:
import json

sub = pd.read_csv("sub_bea.csv")

sub_grouped = sub.groupby(by=["conversation_id", "conversation_history"]).agg(list)

label_dict = {
    "Yes": 0,
    "No": 1,
    "To some extent": 2
}

label_dict_rev = {v: k for k, v in label_dict.items()}

final = []

for row in sub_grouped.iterrows():
    d = {}
    id = row[0][0]
    hist = row[0][1]
    resp = {}
    for t in range(len(row[1]["tutor"])):
        resp[row[1]["tutor"][t]] = {
            "response": row[1]["response"][t],
            "annotation": {
                "Actionability": label_dict_rev[row[1]["preds"][t]]
            }
        }

    d["conversation_id"] = id
    d["conversation_history"] = hist
    d["tutor_responses"] = resp

    final.append(d)

json.dump(final, open("phi_4_ns_all.json", "w"))

In [20]:
len(train_dataloader)

259

In [21]:
batch_id

386

### why?

In [11]:
import json

model.eval()

test_df = pd.read_csv("test_sanity_bea.csv")

test_data = ConvoData(test_df, tokenizer)

test_dataloader = DataLoader(
    test_data,
    batch_size=4,
    shuffle=False
)

true = []
preds = []

for batch_id, batch in enumerate(test_dataloader):
    input_ids = batch[0]['input_ids'].squeeze(dim=1).to(device)
    attention_mask = batch[0]['attention_mask'].squeeze(dim=1).to(device)

    out = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=False
    )
    out = F.log_softmax(out.logits, dim=-1)

    # true += batch[1].tolist()
    preds += out.argmax(dim=-1).tolist()

test_df["preds"] = preds

sub_grouped = test_df.groupby(by=["conversation_id", "conversation_history"]).agg(list)

label_dict = {
    "Yes": 0,
    "No": 1,
    "To some extent": 2
}

label_dict_rev = {v: k for k, v in label_dict.items()}

final = []

for row in sub_grouped.iterrows():
    d = {}
    id = row[0][0]
    hist = row[0][1]
    resp = {}
    for t in range(len(row[1]["tutor"])):
        resp[row[1]["tutor"][t]] = {
            "response": row[1]["response"][t],
            "annotation": {
                "Actionability": label_dict_rev[row[1]["preds"][t]]
            }
        }

    d["conversation_id"] = id
    d["conversation_history"] = hist
    d["tutor_responses"] = resp

    final.append(d)

json.dump(final, open("test_sanity.json", "w"))

In [16]:
sub_grouped.iloc[5]

tutor       [Llama31405B, Phi3, Expert, Gemini, GPT4, Llam...
response    [It looks like everything makes sense up until...
preds                                [2, 0, 0, 1, 0, 0, 0, 0]
Name: (1490-cc82c3fe-3e61-4189-acda-2138219edee1, Tutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Kenny is selling his Pokemon cards to buy a ticket to an amusement park, which costs $100. He has a collection of cards and plans to sell them for $1.5 each. He keeps 1/3 of them and gets to go to the amusement park with $50 in spending cash. How many cards did he start with? \n Student: Let x be the total number of cards Kenny started with.\nHe kept 1/3 of them, so he sold 2/3 of them, which is 2/3x.\nHe sold them for $1.5 each, so he earned 1.5(2/3x) = 3x/2 dollars.\nHe wants to end up with $50 spending cash, so he needs to earn $100 + $50 = $150 in total.\nTherefore, 3x/2 = $150.\nSolving for x, we get x = 100.\nKenny started with 100 cards.\n 100 \n T

In [18]:
test_df

Unnamed: 0,conversation_id,conversation_history,tutor,response,preds
0,296362576,Student: 6 \n Tutor: Excellent answer! \n Tuto...,Mistral,"It sounds like you're thinking of a ""polygon,""...",0
1,296362576,Student: 6 \n Tutor: Excellent answer! \n Tuto...,Phi3,"That's a good try! Actually, the name of the s...",0
2,296362576,Student: 6 \n Tutor: Excellent answer! \n Tuto...,Sonnet,"Great effort! It's actually spelled ""polygon,""...",0
3,296362576,Student: 6 \n Tutor: Excellent answer! \n Tuto...,GPT4,"That's a great try, but the correct spelling i...",0
4,296362576,Student: 6 \n Tutor: Excellent answer! \n Tuto...,Novice,That is a good try.,2
...,...,...,...,...,...
404,1990-60fb8cf6-06e1-4267-80c8-862885d63d9e,"Tutor: Hi, could you please provide a step-by-...",Sonnet,"I appreciate your reasoning, but let's take a ...",1
405,1990-60fb8cf6-06e1-4267-80c8-862885d63d9e,"Tutor: Hi, could you please provide a step-by-...",Expert,"That's true, but if there are 50 this week, ho...",1
406,1990-60fb8cf6-06e1-4267-80c8-862885d63d9e,"Tutor: Hi, could you please provide a step-by-...",Llama31405B,That's a great idea to try to figure out the n...,0
407,1990-60fb8cf6-06e1-4267-80c8-862885d63d9e,"Tutor: Hi, could you please provide a step-by-...",Llama318B,"I see what you're trying to do, but the questi...",1


In [21]:
test_df = pd.read_csv("test_bea.csv")
test_df.head(5)

Unnamed: 0,conversation,actionability
0,Student: ok i got the answer \n Tutor: You hav...,Yes
1,"Tutor: Hi, could you please provide a step-by-...",No
2,"Tutor: Hi, could you please provide a step-by-...",Yes
3,"Tutor: Hi, could you please provide a step-by...",Yes
4,Tutor: Consider the first digit in the given n...,Yes


In [None]:
model.eval()

test_data = ConvoData(test_df, tokenizer)

test_dataloader = DataLoader(
    test_data,
    batch_size=4,
    shuffle=False
)

true = []
preds = []

for batch_id, batch in enumerate(test_dataloader):
    input_ids = batch[0]['input_ids'].squeeze(dim=1).to(device)
    attention_mask = batch[0]['attention_mask'].squeeze(dim=1).to(device)

    out = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=False
    )
    out = F.log_softmax(out.logits, dim=-1)

    true += batch[1].tolist()
    preds += out.argmax(dim=-1).tolist()

print(sum([i == j for i, j in zip(preds, true)]) / len(test_data))

f1 = f1_score(true, preds, average='macro')
print("F1 score:", f1)