In [1]:
import json
import pandas as pd
from collections import Counter
import os

# Step 1: Load the arXiv dataset
input_path = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
data = []
print("Loading arXiv JSON lines dataset...")
with open(input_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))
print(f"Total papers loaded: {len(data)}")

# Step 2: Convert to DataFrame
df = pd.DataFrame(data)
print("Available columns:")
print(df.columns.tolist())

# Step 3: Count papers per category
category_counts = Counter()
for cats in df['categories']:
    for cat in cats.split():
        category_counts[cat] += 1

category_df = pd.DataFrame(category_counts.items(), columns=['Category', 'Count'])
category_df = category_df.sort_values('Count', ascending=False)

# Step 4: Filter categories with >5,000 papers
popular_categories = category_df[category_df['Count'] > 5000]['Category'].tolist()
print(f"\nCategories with > 5,000 papers: {len(popular_categories)}")
print(category_df[category_df['Category'].isin(popular_categories)].head(20))

# Step 5: Filter papers that have at least one popular category
def has_popular_category(category_str):
    return any(cat in popular_categories for cat in category_str.split())

filtered_df = df[df['categories'].apply(has_popular_category)]
print(f"\nFiltered papers: {len(filtered_df)}")

# Step 6: Keep only relevant columns and enrich text
filtered_df = filtered_df[['title', 'abstract', 'categories']]
filtered_df = filtered_df.dropna()
filtered_df['title_abstract'] = filtered_df['title'] + " " + filtered_df['abstract']

# Step 7: Save outputs
output_dir = '/kaggle/working'
os.makedirs(output_dir, exist_ok=True)

filtered_csv_path = os.path.join(output_dir, 'filtered_arxiv_papers.csv')
stats_csv_path = os.path.join(output_dir, 'category_statistics.csv')

filtered_df.to_csv(filtered_csv_path, index=False)
category_df.to_csv(stats_csv_path, index=False)

# Step 8: Print summary
print("\n Processing Complete")
print(f"Original total papers: {len(df)}")
print(f"Filtered papers: {len(filtered_df)}")
print(f"Filtered dataset saved to: {filtered_csv_path}")
print(f"Category statistics saved to: {stats_csv_path}")
print("\nSample rows:")
print(filtered_df.head())

Loading arXiv JSON lines dataset...
Total papers loaded: 2725401
Available columns:
['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed']

Categories with > 5,000 papers: 128
               Category   Count
96                cs.LG  215117
0                hep-ph  187176
13               hep-th  173367
27             quant-ph  161245
114               cs.CV  153832
42                cs.AI  124171
7                 gr-qc  113340
9              astro-ph  105380
8     cond-mat.mtrl-sci   99338
6     cond-mat.mes-hall   95166
34              math.MP   83839
33              math-ph   83839
126               cs.CL   82766
20      cond-mat.str-el   77579
21   cond-mat.stat-mech   76742
136         astro-ph.CO   71425
1               math.CO   71057
110             stat.ML   70559
144         astro-ph.GA   69725
66              math.AP   67040

Filtered papers: 2697904

 Processing

In [2]:
import pandas as pd
import os
import csv

# File paths
file_path = '/kaggle/working/filtered_arxiv_papers.csv'
stats_path = '/kaggle/working/category_statistics.csv'

# Check file
print(f"Checking for {file_path}...")
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Load dataset
print("Loading filtered_arxiv_papers.csv...")
df = pd.read_csv(file_path)
print(f"Loaded {len(df)} rows")
print(f"Columns: {df.columns.tolist()}")

# Enrich text
if 'title_abstract' not in df.columns:
    print("Enriching text column with title + abstract...")
    df['title_abstract'] = df['title'].astype(str) + " " + df['abstract'].astype(str)

# Load and filter category stats for math.* only
print("Loading category statistics...")
stats_df = pd.read_csv(stats_path)
stats_df = stats_df[stats_df['Category'].str.startswith('math.')]
stats_df = stats_df.sort_values('Count', ascending=False)

top_categories = stats_df['Category'].tolist()
print(f"\nUsing ALL {len(top_categories)} math categories:")
print(top_categories)

Checking for /kaggle/working/filtered_arxiv_papers.csv...
Loading filtered_arxiv_papers.csv...
Loaded 2697904 rows
Columns: ['title', 'abstract', 'categories', 'title_abstract']
Loading category statistics...

Using ALL 32 math categories:
['math.MP', 'math.CO', 'math.AP', 'math.PR', 'math.AG', 'math.OC', 'math.IT', 'math.NT', 'math.DG', 'math.NA', 'math.DS', 'math.FA', 'math.RT', 'math.ST', 'math.GT', 'math.GR', 'math.CA', 'math.QA', 'math.RA', 'math.CV', 'math.AT', 'math.LO', 'math.AC', 'math.OA', 'math.MG', 'math.SP', 'math.SG', 'math.CT', 'math.KT', 'math.GN', 'math.GM', 'math.HO']


In [3]:
# Map labels
label_map = {cat: idx for idx, cat in enumerate(top_categories)}

# Extract primary math category
def extract_primary_category(cat_str):
    if pd.isna(cat_str):
        return None
    for cat in cat_str.split():
        if cat.startswith('math.') and cat in label_map:
            return cat
    return None

# Apply extraction
print("Mapping categories to labels...")
df['category'] = df['categories'].apply(extract_primary_category)
df['label'] = df['category'].map(label_map)

# Filter valid rows
df = df[df['label'].notnull()].copy()
print(df)

Mapping categories to labels...
                                                     title  \
1                 Sparsity-certifying Graph Decompositions   
3        A determinant of Stirling cycle numbers counts...   
4        From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...   
9        Partial cubes: structures, characterizations, ...   
10       Computing genus 2 Hilbert-Siegel modular forms...   
...                                                    ...   
2697800  Yang-Baxter Algebra for the n-Harmonic Oscilla...   
2697813  Integrable deformations of oscillator chains f...   
2697814  A note on real forms of the complex N=4 supers...   
2697828  Real forms of the complex twisted N=2 supersym...   
2697833  Vector NLS hierarchy solitons revisited: dress...   

                                                  abstract  \
1          We describe a new algorithm, the $(k,\ell)$-...   
3          We show that a determinant of Stirling cycle...   
4          In this paper we show how 

In [4]:
import pandas as pd
import os
import csv

# File paths
file_path = '/kaggle/working/filtered_arxiv_papers.csv'
stats_path = '/kaggle/working/category_statistics.csv'
output_path = '/kaggle/working/math_dataset_labeled.csv'
label_map_path = '/kaggle/working/label_map.csv'

# Check file existence
print(f"Checking for {file_path}...")
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Load the dataset
print("Loading filtered_arxiv_papers.csv...")
df = pd.read_csv(file_path)
print(f"Loaded {len(df)} rows")
print(f"Columns: {df.columns.tolist()}")

# Enrich title_abstract if not present
if 'title_abstract' not in df.columns:
    print("Creating 'title_abstract' from title + abstract...")
    df['title'] = df['title'].fillna('')
    df['abstract'] = df['abstract'].fillna('')
    df['title_abstract'] = df['title'] + " " + df['abstract']

# Load category stats and filter math.* categories
print("Loading and filtering category statistics...")
stats_df = pd.read_csv(stats_path)
stats_df = stats_df[stats_df['Category'].str.startswith('math.')]
stats_df = stats_df.sort_values('Count', ascending=False)

top_categories = stats_df['Category'].tolist()
print(f"\nUsing ALL {len(top_categories)} math categories:")
print(top_categories)

# Create label mapping
label_map = {cat: idx for idx, cat in enumerate(top_categories)}

# Function to extract primary math category
def extract_primary_category(cat_str):
    if pd.isna(cat_str):
        return None
    for cat in cat_str.split():
        if cat.startswith('math.') and cat in label_map:
            return cat
    return None

# Apply category extraction and label mapping
print("Mapping categories to labels...")
df['category'] = df['categories'].apply(extract_primary_category)
df['label'] = df['category'].map(label_map)

# Filter out rows without math category
df = df[df['label'].notnull()].copy()

# Save label map to CSV
pd.Series(label_map).to_csv(label_map_path, header=['Label'], index_label='Category')
print(f"Saved label mapping to {label_map_path}")

# Drop unneeded columns
df = df.drop(columns=['title','abstract'])

# Save final labeled dataset
df.to_csv(output_path, index=False)
print(f"Saved cleaned dataset to {output_path}")

# Reload and print category distribution
print("\n--- Category Distribution ---")
df_loaded = pd.read_csv(output_path)
category_counts = df_loaded['category'].value_counts()

for category, count in category_counts.items():
    label = df_loaded[df_loaded['category'] == category]['label'].iloc[0]
    print(f"Category: {category} | Label: {label} | Papers: {count}")

print("\nTotal unique categories:", df_loaded['category'].nunique())
print("Total number of papers:", len(df_loaded))

Checking for /kaggle/working/filtered_arxiv_papers.csv...
Loading filtered_arxiv_papers.csv...
Loaded 2697904 rows
Columns: ['title', 'abstract', 'categories', 'title_abstract']
Loading and filtering category statistics...

Using ALL 32 math categories:
['math.MP', 'math.CO', 'math.AP', 'math.PR', 'math.AG', 'math.OC', 'math.IT', 'math.NT', 'math.DG', 'math.NA', 'math.DS', 'math.FA', 'math.RT', 'math.ST', 'math.GT', 'math.GR', 'math.CA', 'math.QA', 'math.RA', 'math.CV', 'math.AT', 'math.LO', 'math.AC', 'math.OA', 'math.MG', 'math.SP', 'math.SG', 'math.CT', 'math.KT', 'math.GN', 'math.GM', 'math.HO']
Mapping categories to labels...
Saved label mapping to /kaggle/working/label_map.csv
Saved cleaned dataset to /kaggle/working/math_dataset_labeled.csv

--- Category Distribution ---
Category: math.AP | Label: 2.0 | Papers: 56030
Category: math.CO | Label: 1.0 | Papers: 55123
Category: math.MP | Label: 0.0 | Papers: 51286
Category: math.OC | Label: 5.0 | Papers: 46708
Category: math.IT | Lab

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Load the dataset
df = pd.read_csv('/kaggle/working/math_dataset_labeled.csv')

# Check the columns to ensure proper structure
print(df.columns)

# Balance dataset
rows_per_category = 3000
print(f"\nBalancing: {rows_per_category} rows per category...")

# Placeholder for filtered data and skipped categories

filtered_list = []
skipped = []

# Get the top categories (those that have the most papers)
top_categories = df['category'].value_counts().index.tolist()

# Iterate over the top categories to balance
for cat in top_categories:
    cat_df = df[df['category'] == cat]
    available = len(cat_df)
    if available >= rows_per_category:
        filtered_list.append(cat_df.sample(n=rows_per_category, random_state=42))
    else:
        skipped.append((cat, available))
        print(f"⚠ Skipping {cat}: only {available} papers (needs ≥ {rows_per_category})")

# Combine all balanced samples into a new DataFrame
if not filtered_list:
    raise ValueError("No categories had enough papers to sample from!")

balanced_df = pd.concat(filtered_list).reset_index(drop=True)

# Print summary of balancing
print(f"\nFinal balanced dataset size: {len(balanced_df)}")
print(f"Included categories: {balanced_df['category'].nunique()}")
print(f"Skipped categories: {len(skipped)}")

# Update label map for actually used categories
used_categories = sorted(balanced_df['category'].unique())
label_map = {cat: i for i, cat in enumerate(used_categories)}
balanced_df['label'] = balanced_df['category'].map(label_map)

# Report label distribution
print("\nLabel mapping:")
for cat, idx in label_map.items():
    count = (balanced_df['category'] == cat).sum()
    print(f" Category: {cat:<10} | Label: {idx:<2} | Papers: {count}")

print(f"\nTotal categories used: {len(label_map)}")
print(f"Total papers: {len(balanced_df)}")

# Split into train/val/test (80/10/10)
print("\nSplitting dataset...")
train_df, temp_df = train_test_split(
    balanced_df, test_size=0.2, stratify=balanced_df['label'], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42
)

# Save the splits
output_dir = '/kaggle/working/'
os.makedirs(output_dir, exist_ok=True)

train_df.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
val_df.to_csv(os.path.join(output_dir, 'val.csv'), index=False)
test_df.to_csv(os.path.join(output_dir, 'test.csv'), index=False)

# Final summary
print(f"\nSaved train/val/test splits:")
print(f" Train size: {len(train_df)}")
print(f" Val size:   {len(val_df)}")
print(f" Test size:  {len(test_df)}")


Index(['categories', 'title_abstract', 'category', 'label'], dtype='object')

Balancing: 3000 rows per category...
⚠ Skipping math.KT: only 2624 papers (needs ≥ 3000)
⚠ Skipping math.HO: only 1430 papers (needs ≥ 3000)
⚠ Skipping math.GM: only 287 papers (needs ≥ 3000)

Final balanced dataset size: 87000
Included categories: 29
Skipped categories: 3

Label mapping:
 Category: math.AC    | Label: 0  | Papers: 3000
 Category: math.AG    | Label: 1  | Papers: 3000
 Category: math.AP    | Label: 2  | Papers: 3000
 Category: math.AT    | Label: 3  | Papers: 3000
 Category: math.CA    | Label: 4  | Papers: 3000
 Category: math.CO    | Label: 5  | Papers: 3000
 Category: math.CT    | Label: 6  | Papers: 3000
 Category: math.CV    | Label: 7  | Papers: 3000
 Category: math.DG    | Label: 8  | Papers: 3000
 Category: math.DS    | Label: 9  | Papers: 3000
 Category: math.FA    | Label: 10 | Papers: 3000
 Category: math.GN    | Label: 11 | Papers: 3000
 Category: math.GR    | Label: 12 | Papers: 

In [6]:
# Step 2: Tokenize dataset for Kaggle
import pandas as pd
from transformers import LongformerTokenizer
import torch
import pickle
import logging
import os

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/step2_log.txt'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Define input/output directories
input_dir = '/kaggle/working/'  # Kaggle uploaded files are here
output_dir = '/kaggle/working/'  # Save outputs here

# Find train.csv and val.csv automatically
print("Searching for train.csv and val.csv in /kaggle/input/...")
logger.info("Searching for train.csv and val.csv in /kaggle/input/...")
train_path = None
val_path = None

for root, dirs, files in os.walk(input_dir):
    for file in files:
        if file == 'train.csv':
            train_path = os.path.join(root, file)
        if file == 'val.csv':
            val_path = os.path.join(root, file)
    if train_path and val_path:
        break

if not train_path or not val_path:
    logger.error("train.csv or val.csv not found in /kaggle/input/.")
    raise FileNotFoundError("train.csv or val.csv not found. Please upload them.")

print(f"Found train.csv at {train_path}")
print(f"Found val.csv at {val_path}")
logger.info(f"Found train.csv at {train_path}")
logger.info(f"Found val.csv at {val_path}")

# Load tokenizer
print("Loading Longformer tokenizer...")
logger.info("Loading Longformer tokenizer...")
try:
    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
except Exception as e:
    logger.error(f"Error loading tokenizer: {e}")
    raise

# Load datasets
print("Loading datasets...")
logger.info("Loading datasets...")
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# Tokenization function
def tokenize_data(df, max_length=4096):
    texts = df['title_abstract'].tolist()  # Assuming 'text' column
    labels = df['label'].tolist()
    encodings = tokenizer(
        texts, 
        truncation=True, 
        padding=True, 
        max_length=max_length, 
        return_tensors='pt'
    )
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    }

# Tokenize in batches
batch_size = 100
train_tokenized = []
val_tokenized = []

print("Tokenizing training data...")
logger.info("Tokenizing training data...")
for i in range(0, len(train_df), batch_size):
    batch_df = train_df[i:i+batch_size]
    tokenized_batch = tokenize_data(batch_df)
    train_tokenized.append(tokenized_batch)
    print(f"Tokenized train batch {i//batch_size + 1}/{len(train_df)//batch_size + 1}")
    logger.info(f"Tokenized train batch {i//batch_size + 1}/{len(train_df)//batch_size + 1}")

print("Tokenizing validation data...")
logger.info("Tokenizing validation data...")
for i in range(0, len(val_df), batch_size):
    batch_df = val_df[i:i+batch_size]
    tokenized_batch = tokenize_data(batch_df)
    val_tokenized.append(tokenized_batch)
    print(f"Tokenized val batch {i//batch_size + 1}/{len(val_df)//batch_size + 1}")
    logger.info(f"Tokenized val batch {i//batch_size + 1}/{len(val_df)//batch_size + 1}")

# Save tokenized datasets
print(f"Saving tokenized datasets to {output_dir}...")
logger.info(f"Saving tokenized datasets to {output_dir}...")
with open(os.path.join(output_dir, 'train_tokenized.pkl'), 'wb') as f:
    pickle.dump(train_tokenized, f)
with open(os.path.join(output_dir, 'val_tokenized.pkl'), 'wb') as f:
    pickle.dump(val_tokenized, f)

print(f"Train tokenized: {len(train_tokenized)} batches, Val tokenized: {len(val_tokenized)} batches")
logger.info(f"Train tokenized: {len(train_tokenized)} batches, Val tokenized: {len(val_tokenized)} batches")
print("Step 2 complete.")
logger.info("Step 2 complete.")


Searching for train.csv and val.csv in /kaggle/input/...
Found train.csv at /kaggle/working/train.csv
Found val.csv at /kaggle/working/val.csv
Loading Longformer tokenizer...


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Loading datasets...
Tokenizing training data...
Tokenized train batch 1/697
Tokenized train batch 2/697
Tokenized train batch 3/697
Tokenized train batch 4/697
Tokenized train batch 5/697
Tokenized train batch 6/697
Tokenized train batch 7/697
Tokenized train batch 8/697
Tokenized train batch 9/697
Tokenized train batch 10/697
Tokenized train batch 11/697
Tokenized train batch 12/697
Tokenized train batch 13/697
Tokenized train batch 14/697
Tokenized train batch 15/697
Tokenized train batch 16/697
Tokenized train batch 17/697
Tokenized train batch 18/697
Tokenized train batch 19/697
Tokenized train batch 20/697
Tokenized train batch 21/697
Tokenized train batch 22/697
Tokenized train batch 23/697
Tokenized train batch 24/697
Tokenized train batch 25/697
Tokenized train batch 26/697
Tokenized train batch 27/697
Tokenized train batch 28/697
Tokenized train batch 29/697
Tokenized train batch 30/697
Tokenized train batch 31/697
Tokenized train batch 32/697
Tokenized train batch 33/697
Toke

In [11]:
import pandas as pd
import numpy as np
import os
import pickle
import torch
import gc
import glob
import re
import shutil
import logging
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# Setup
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/training_log.txt'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# File paths
input_dir = '/kaggle/working/'
train_tokenized_path = os.path.join(input_dir, 'train_tokenized.pkl')
val_tokenized_path = os.path.join(input_dir, 'val_tokenized.pkl')
results_dir = os.path.join(input_dir, 'results')
os.makedirs(results_dir, exist_ok=True)

# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Load tokenized data
with open(train_tokenized_path, 'rb') as f:
    train_tokenized = pickle.load(f)
with open(val_tokenized_path, 'rb') as f:
    val_tokenized = pickle.load(f)

# Convert tokenized data to Huggingface Dataset
def convert_to_dataset(tokenized_data):
    input_ids, attention_mask, labels = [], [], []
    for batch in tokenized_data:
        input_ids.extend(batch['input_ids'])
        attention_mask.extend(batch['attention_mask'])
        batch_labels = batch['labels'].tolist() if isinstance(batch['labels'], torch.Tensor) else batch['labels']
        labels.extend(batch_labels)
    dataset = Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    })
    return dataset.map(lambda x: {'labels': int(x['labels'])})

# Prepare datasets
train_dataset = convert_to_dataset(train_tokenized)
val_dataset = convert_to_dataset(val_tokenized)
del train_tokenized, val_tokenized
gc.collect()
torch.cuda.empty_cache()

# Class weights
train_labels = np.array(train_dataset['labels'])
num_labels = len(np.unique(train_labels))
class_weights_tensor = torch.tensor(
    compute_class_weight(class_weight='balanced', classes=np.arange(num_labels), y=train_labels),
    dtype=torch.float
)

# Load model
checkpoint_dirs = glob.glob(os.path.join(results_dir, 'checkpoint-*'))
checkpoint_path = max(checkpoint_dirs, key=os.path.getmtime) if checkpoint_dirs else None
model = BertForSequenceClassification.from_pretrained(
    checkpoint_path if checkpoint_path else 'bert-base-uncased',
    num_labels=num_labels
)
model.to(torch.device('cuda'))

# Custom trainer with class weights
class WeightedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training arguments
training_args = TrainingArguments(
    output_dir=results_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir=os.path.join(input_dir, 'logs'),
    logging_steps=10,
    save_steps=15000,
    eval_strategy='epoch',
    eval_steps='epoch',
    save_total_limit=2,
    fp16=True,
    report_to='none'
)

# Train
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: {'accuracy': (np.argmax(pred.predictions, axis=1) == pred.label_ids).mean()},
    data_collator=DataCollatorWithPadding(tokenizer),
    class_weights=class_weights_tensor
)

logger.info("Starting training...")
trainer.train(resume_from_checkpoint=checkpoint_path if checkpoint_path else None)

# Save final model
final_model_path = os.path.join(input_dir, 'final_model')
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

# Evaluate
metrics = trainer.evaluate()
pd.DataFrame([metrics]).to_csv(os.path.join(input_dir, "final_eval_metrics.csv"), index=False)
logger.info(f"Training completed. Model saved to {final_model_path}. Metrics saved.")


loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/tokenizer_config.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,


Map:   0%|          | 0/69600 [00:00<?, ? examples/s]

Map:   0%|          | 0/8700 [00:00<?, ? examples/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
