In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load tokenizer and model
model_name = "google/mathbert-base-uncased"  # Replace with correct model if needed
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Adjust num_labels!

# Load your custom dataset
def load_dataset_from_csv(train_path, test_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return Dataset.from_pandas(train_df), Dataset.from_pandas(test_df)

train_dataset, eval_dataset = load_dataset_from_csv(
    "train_dev1.csv", "dev1.csv"
)

# Tokenization function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

# Set PyTorch format
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


OSError: google/mathbert-base-uncased is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./mathbert_model",
    num_train_epochs=4,
    per_device_train_batch_size=8,        # Per GPU
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
    dataloader_num_workers=4,
    report_to="none",  # Disable W&B/TensorBoard unless needed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train and save
trainer.train()
trainer.save_model("./mathbert_trained_model")
tokenizer.save_pretrained("./mathbert_trained_model")

print("✅ Training complete. Model saved.")
