In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
)
from datasets import Dataset, load_dataset
import evaluate
import os
from sklearn.metrics import classification_report, confusion_matrix
import wandb
import torch

In [5]:
# Optional: pin a specific GPU before anything is moved to the device,
# e.g. torch.cuda.set_device(1)
# Displays whether PyTorch can see a usable CUDA device.
torch.cuda.is_available()

  return torch._C._cuda_getDeviceCount() > 0


False

In [None]:
# ---- Settings ----
BASE_MODEL_NAME = "Alibaba-NLP/gte-large-en-v1.5"  # checkpoint id handed to from_pretrained
DATASET_PATH = "./classified_pdfs_50k.csv"  # CSV that must contain a 'classification' column
CHECKPOINT_DIR = "./checkpoints"  # root folder for training outputs
THRESHOLD = 100  # a class needs at least this many rows to be kept

# ---- Read the CSV and discard rows that belong to rare classes ----
df = pd.read_csv(DATASET_PATH)
value_counts = df['classification'].value_counts()
is_frequent = value_counts >= THRESHOLD
classes_to_keep = value_counts.loc[is_frequent].index.tolist()
keep_rows = df['classification'].isin(classes_to_keep)
df = df.loc[keep_rows]

# ---- Class name <-> integer id lookups ----
# Ids follow the order of classes_to_keep (value_counts order, most frequent first).
class_to_label = dict(zip(classes_to_keep, range(len(classes_to_keep))))
label_to_class = dict(enumerate(classes_to_keep))

In [None]:
# Targets are the integer ids of each row's class; inputs are the 'url' strings.
labels = df['classification'].map(class_to_label).tolist()
texts = df['url'].tolist()

# Hold out 10% of the rows for validation, stratified by label, with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)

# Wrap each split in a `datasets.Dataset` with "text" and "label" columns.
train_dataset = Dataset.from_dict(dict(text=train_texts, label=train_labels))
val_dataset = Dataset.from_dict(dict(text=val_texts, label=val_labels))

# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples for sequence classification.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(..., batched=True)``;
            only ``examples["text"]`` is read.

    Returns:
        The tokenizer output for the batch. Each example is truncated to at
        most 512 tokens and is left unpadded.
    """
    # Deliberately no padding here: padding="max_length" would inflate every
    # example to 512 tokens and store/process mostly pad tokens. The
    # DataCollatorWithPadding used by the Trainer pads each batch to the
    # length of its longest member instead.
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Apply the tokenizer to both splits, batch by batch (train first, then validation).
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the base checkpoint with a classification head sized to the kept classes.
#
# NOTE(security): trust_remote_code=True executes Python code downloaded from
# the model repository. The GTE v1.5 checkpoints ship their own modelling code
# on the Hub, so the flag is needed for a non-interactive load; only use it
# with repositories you trust (ideally with a pinned `revision`).
#
# id2label / label2id store the human-readable class names in the model
# config, so a saved checkpoint can report class names without the separate
# label-mapping file.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_NAME,
    num_labels=len(class_to_label),
    id2label={idx: str(cls) for idx, cls in label_to_class.items()},
    label2id={str(cls): idx for cls, idx in class_to_label.items()},
    trust_remote_code=True,
)

In [None]:
# Metric objects are loaded once and reused; compute_metrics runs at every
# evaluation step, and evaluate.load() is too costly to repeat each time.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called ``name``, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute validation metrics for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is an array of class
            scores with classes on axis 1, or a tuple whose first element is
            that array; ``labels`` holds the reference class ids.

    Returns:
        dict with ``"f1_macro"``, ``"precision_macro"`` and ``"accuracy"``.
        F1 and precision are macro-averaged over classes.
    """
    logits, labels = eval_pred
    # Some models return extra outputs alongside the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)

    accuracy = _get_metric("accuracy").compute(
        predictions=preds, references=labels
    )["accuracy"]
    f1 = _get_metric("f1").compute(
        predictions=preds, references=labels, average="macro"
    )["f1"]
    precision = _get_metric("precision").compute(
        predictions=preds, references=labels, average="macro"
    )["precision"]

    # Mirror the headline numbers under un-prefixed keys in the active W&B run.
    # Skipped when no run is active, because wandb.log() would raise then.
    if wandb.run is not None:
        wandb.log({
            "accuracy": accuracy,
            "f1_macro": f1,
        })

    return {
        "f1_macro": f1,
        "precision_macro": precision,
        "accuracy": accuracy,
    }

In [None]:
# Freeze the backbone: every parameter under model.base_model stops receiving
# gradients, so only parameters outside it remain trainable.
for backbone_param in model.base_model.parameters():
    backbone_param.requires_grad_(False)

In [None]:
# Sanity check: print the name of every parameter that still requires gradients.
trainable_names = [
    param_name
    for param_name, tensor in model.named_parameters()
    if tensor.requires_grad
]
for param_name in trainable_names:
    print(f"Trainable layer: {param_name}")

In [None]:
# Evaluation, checkpointing and logging all run on the same 100-step schedule.
step_schedule = dict(
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=100,
)

# When training ends, reload the checkpoint with the highest "accuracy".
best_checkpoint_policy = dict(
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

training_args = TrainingArguments(
    output_dir=CHECKPOINT_DIR,
    seed=42,
    num_train_epochs=5,
    learning_rate=3e-4,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=256,
    report_to="wandb",
    **step_schedule,
    **best_checkpoint_policy,
)

# Assemble everything the Trainer needs: model, hyper-parameters, both data
# splits, the tokenizer, the batch collator and the metric callback.
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_inputs)

In [None]:
# Start a Weights & Biases run in the "pdf-classification" project before
# training begins, so logged metrics have a run to attach to.
wandb.init(project="pdf-classification")

In [None]:
# Run the training loop configured on the Trainer.
trainer.train()

# Write the resulting model to the "final" sub-folder of the checkpoint directory.
final_model_dir = os.path.join(CHECKPOINT_DIR, "final")
trainer.save_model(final_model_dir)

In [None]:
# Close the active Weights & Biases run now that training and saving are done.
wandb.finish()

In [None]:
# Save the label mappings
import json

# Create the target folder if needed, so this cell also works when it is run
# on its own (e.g. before any checkpoint has been written).
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
mappings_path = os.path.join(CHECKPOINT_DIR, 'label_mappings.json')

# JSON object keys are always strings, so the integer ids of label_to_class
# are stored as "0", "1", ...; the conversion is done explicitly here to make
# that visible. Convert them back with int(key) after loading the file.
label_mappings = {
    'class_to_label': class_to_label,
    'label_to_class': {str(idx): cls for idx, cls in label_to_class.items()},
}
with open(mappings_path, 'w') as f:
    json.dump(label_mappings, f)

print("Training completed. Model and label mappings saved.")