In [1]:
!pip install transformers[torch]==4.38.0 datasets pandas scikit-learn tabulate accelerate==0.27.2

Collecting transformers==4.38.0 (from transformers[torch]==4.38.0)
[0m  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m893.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pandas
  Downloading pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting accelerate==0.27.2
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Collecting filelock (from transformers==4.38.0->transformers[torch]==4.3

In [1]:
!pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.3/68.3 kB[0m [31m798.8 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: kagglehub
Successfully installed kagglehub-0.3.13


In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ivanmitriakhin/arxiv-titles-abstracts-and-tags")
path2 = kagglehub.dataset_download("spsayakpaul/arxiv-paper-abstracts")


print("Path to dataset files:", path)
print("Path to dataset files:", path2)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/ivanmitriakhin/arxiv-titles-abstracts-and-tags?dataset_version_number=5...


100%|██████████| 860M/860M [00:33<00:00, 27.2MB/s] 

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/spsayakpaul/arxiv-paper-abstracts?dataset_version_number=2...


100%|██████████| 44.6M/44.6M [00:02<00:00, 15.6MB/s]

Extracting files...





Path to dataset files: /home/raymond/.cache/kagglehub/datasets/ivanmitriakhin/arxiv-titles-abstracts-and-tags/versions/5
Path to dataset files: /home/raymond/.cache/kagglehub/datasets/spsayakpaul/arxiv-paper-abstracts/versions/2


In [1]:
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import re
import transformers
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 1. Configuration ---
file_path = 'ivanmitriakhin/arxiv-titles-abstracts-and-tags/versions/5/arxiv_data_grouped.csv'
title_column = 'titles'
abstract_column = 'abstracts'
MODEL_CHECKPOINT = 'allenai/scibert_scivocab_uncased'
output_model_dir = './fine-tuned-scibert-multilabel'

In [3]:
# --- 2. Data Loading and Cleaning ---
print(f"Loading data from: {file_path}")
df = pd.read_csv(file_path)

def clean_text(text):
    if not isinstance(text, str): 
        return ""
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

print("Cleaning text data...")
df[title_column] = df[title_column].apply(clean_text)
df[abstract_column] = df[abstract_column].apply(clean_text)

Loading data from: ivanmitriakhin/arxiv-titles-abstracts-and-tags/versions/5/arxiv_data_grouped.csv
Cleaning text data...


In [4]:
# --- 3. Data Preprocessing ---
first_label_index = df.columns.get_loc(abstract_column) + 1
label_columns = df.columns[first_label_index:].tolist()
df['text'] = df[title_column] + " [SEP] " + df[abstract_column]

# Convert labels to float32 numpy arrays (this ensures proper dtype)
df['labels'] = df[label_columns].values.astype(np.float32).tolist()
df_clean = df[['text', 'labels']]

print(f"Dataset shape: {df_clean.shape}")
print(f"Number of label classes: {len(label_columns)}")
print(f"Sample labels: {df_clean['labels'].iloc[0]}")

Dataset shape: (536914, 2)
Number of label classes: 8
Sample labels: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]


In [5]:
# --- 4. Tokenization ---
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)


Loading tokenizer...




In [6]:
# --- 5. Dataset Creation and Processing ---
print("Creating dataset...")
full_dataset = Dataset.from_pandas(df_clean)

print("Tokenizing dataset...")
tokenized_dataset = full_dataset.map(tokenize_function, batched=True)

# Remove text column (no longer needed)
tokenized_dataset = tokenized_dataset.remove_columns(['text'])

# Set format to torch tensors (this handles the float32 conversion automatically)
print("Setting dataset format to PyTorch tensors...")
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Verify the conversion worked
sample = tokenized_dataset[0]
print(f"Labels type: {type(sample['labels'])}")
print(f"Labels dtype: {sample['labels'].dtype}")
print(f"Sample labels: {sample['labels']}")

Creating dataset...
Tokenizing dataset...


Map: 100%|██████████| 536914/536914 [02:02<00:00, 4391.74 examples/s]


Setting dataset format to PyTorch tensors...
Labels type: <class 'torch.Tensor'>
Labels dtype: torch.float32
Sample labels: tensor([1., 0., 0., 0., 0., 0., 0., 1.])


In [7]:
# --- 6. Train/Test Split ---
print("Splitting dataset...")
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Splitting dataset...
Train dataset size: 429531
Eval dataset size: 107383


In [8]:
# --- 7. Model Loading ---
print("Loading model...")
num_labels = len(label_columns)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    problem_type="multi_label_classification",
    num_labels=num_labels
)

Loading model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# --- 8. Metrics Function ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    # Convert to probabilities
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(logits))
    
    # Convert to predictions (threshold = 0.5)
    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= 0.5)] = 1
    
    # Calculate metrics
    f1_micro = f1_score(labels, predictions, average='micro')
    f1_macro = f1_score(labels, predictions, average='macro')
    roc_auc = roc_auc_score(labels, predictions, average='micro')
    accuracy = accuracy_score(labels, predictions)
    
    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

model_dir = "./fine-tuned-scibert-multilabel"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

trainer = Trainer(model=model, tokenizer=tokenizer)

# # Run evaluation again
# results = trainer.evaluate(eval_dataset)
# print(results)


In [13]:
preds = trainer.predict(eval_dataset)
print(preds.metrics)


{'test_loss': 0.09524441510438919, 'test_runtime': 3475.7635, 'test_samples_per_second': 30.895, 'test_steps_per_second': 3.862}
