In [1]:
import json
import pandas as pd
from collections import Counter
import random
import os

# Step 1: Set the correct path to the JSON file in the Kaggle input section
json_file_path = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'  # This is the file path to arxiv-metadata-oai-snapshot.json

# Verify the file exists
if not os.path.exists(json_file_path):
    print(f"[PRINT] JSON file not found at {json_file_path}")
    # List the contents of the parent directory to help debug
    parent_dir = '/kaggle/input'
    print(f"[PRINT] Available datasets in {parent_dir}: {os.listdir(parent_dir)}")
    raise FileNotFoundError(f"JSON file not found at {json_file_path}")

# Step 2: Load the JSON file
print("[PRINT] Loading JSON file")
try:
    with open(json_file_path, 'r') as f:
        # Assuming JSONL format (one JSON object per line)
        data = [json.loads(line) for line in f]
    print(f"[PRINT] Loaded {len(data)} papers from JSON file")
except Exception as e:
    print(f"[PRINT] Error loading JSON file: {e}")
    raise

# Step 3: Filter for Computer Science (cs) papers and extract primary category
cs_papers = []
for paper in data:
    if 'categories' not in paper or 'title' not in paper or 'abstract' not in paper:
        continue
    # Split categories (e.g., "cs.CL math.ST" -> ["cs.CL", "math.ST"])
    categories = paper['categories'].split()
    # Consider the primary category (first one)
    primary_category = categories[0]
    # Check if the primary category starts with "cs."
    if primary_category.startswith('cs.'):
        paper['primary_category'] = primary_category
        cs_papers.append(paper)

print(f"[PRINT] Found {len(cs_papers)} papers in Computer Science (cs) category")

# Step 4: Identify the top 10 cs subcategories
# Count papers per primary category
category_counts = Counter(paper['primary_category'] for paper in cs_papers)
print("[PRINT] Category distribution (subcategory: count):")
for category, count in category_counts.most_common():
    print(f"{category}: {count}")

# Select the top 10 subcategories
top_10_categories = [category for category, count in category_counts.most_common(10)]
print(f"\n[PRINT] Top 10 cs subcategories: {top_10_categories}")

# Step 5: Sample 3,000 papers per top 10 subcategory
papers_by_category = {category: [] for category in top_10_categories}
for paper in cs_papers:
    if paper['primary_category'] in top_10_categories:
        papers_by_category[paper['primary_category']].append(paper)

# Sample 3,000 papers from each category (or all if fewer than 3,000)
selected_papers = []
for category in top_10_categories:
    papers = papers_by_category[category]
    if len(papers) >= 3000:
        sampled_papers = random.sample(papers, 3000)
    else:
        sampled_papers = papers  # Take all if fewer than 3,000
    print(f"[PRINT] Sampled {len(sampled_papers)} papers for category {category}")
    selected_papers.extend(sampled_papers)

# Step 6: Create a DataFrame with combined title and abstract
df_data = []
for paper in selected_papers:
    title = paper.get('title', '').replace('\n', ' ').strip()
    abstract = paper.get('abstract', '').replace('\n', ' ').strip()
    text = f"{title} {abstract}".strip()
    df_data.append({
        'text': text,
        'category': paper['primary_category'],
        'label': top_10_categories.index(paper['primary_category'])  # Assign label (0-9)
    })

df = pd.DataFrame(df_data)
print(f"[PRINT] Created DataFrame with {len(df)} papers")
print(f"[PRINT] Columns in DataFrame: {list(df.columns)}")
print("\n[PRINT] First 5 rows of DataFrame:")
print(df.head())

# Verify the distribution
print("\n[PRINT] Final distribution (category: count):")
print(df['category'].value_counts())

# Step 7: Save the dataset to a CSV file
output_csv_path = '/kaggle/working/cs_top10_3000_each.csv'
df.to_csv(output_csv_path, index=False)
print(f"[PRINT] Dataset saved to {output_csv_path}")

[PRINT] Loading JSON file
[PRINT] Loaded 2730173 papers from JSON file
[PRINT] Found 616771 papers in Computer Science (cs) category
[PRINT] Category distribution (subcategory: count):
cs.CV: 118416
cs.LG: 104062
cs.CL: 62762
cs.IT: 37070
cs.RO: 29800
cs.AI: 26490
cs.CR: 26054
cs.NI: 17193
cs.DS: 16497
cs.SE: 15887
cs.DC: 14644
cs.HC: 14096
cs.CY: 12269
cs.IR: 11968
cs.LO: 10934
cs.SI: 10785
cs.GT: 8158
cs.SD: 7567
cs.NE: 7325
cs.DB: 6662
cs.CC: 5394
cs.PL: 5211
cs.SY: 4963
cs.DM: 4821
cs.CE: 4489
cs.CG: 4445
cs.AR: 3801
cs.DL: 3646
cs.MA: 3103
cs.FL: 3061
cs.GR: 3000
cs.MM: 2407
cs.ET: 2363
cs.OH: 2075
cs.SC: 1319
cs.PF: 1224
cs.MS: 1089
cs.NA: 1057
cs.OS: 549
cs.GL: 115

[PRINT] Top 10 cs subcategories: ['cs.CV', 'cs.LG', 'cs.CL', 'cs.IT', 'cs.RO', 'cs.AI', 'cs.CR', 'cs.NI', 'cs.DS', 'cs.SE']
[PRINT] Sampled 3000 papers for category cs.CV
[PRINT] Sampled 3000 papers for category cs.LG
[PRINT] Sampled 3000 papers for category cs.CL
[PRINT] Sampled 3000 papers for category cs.IT
[PRINT

In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# File path for the dataset
file_path = '/kaggle/working/cs_top10_3000_each.csv'

print(f"Checking for {file_path}...")
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}. Please ensure cs_top10_3000_each.csv exists.")

# Load dataset
print("Loading cs_top10_3000_each.csv...")
try:
    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} rows")
    print(f"Columns: {df.columns.tolist()}")
except Exception as e:
    print(f"Error loading cs_top10_3000_each.csv: {e}")
    raise

# Verify the dataset
print("\n[PRINT] Category distribution (before split):")
print(df['category'].value_counts())

# Split dataset while maintaining balance across categories
print("Splitting dataset...")
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,  # 20% for val + test
    stratify=df['label'],  # Stratify by label to maintain balance
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,  # Split the 20% into 10% val and 10% test
    stratify=temp_df['label'],
    random_state=42
)

# Verify the splits
print(f"\nTrain size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")
print("\n[PRINT] Train set distribution (category: count):")
print(train_df['category'].value_counts())
print("\n[PRINT] Validation set distribution (category: count):")
print(val_df['category'].value_counts())
print("\n[PRINT] Test set distribution (category: count):")
print(test_df['category'].value_counts())

# Save splits
output_dir = '/kaggle/working/'
print(f"Saving train.csv, val.csv, test.csv to {output_dir}...")
train_df.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
val_df.to_csv(os.path.join(output_dir, 'val.csv'), index=False)
test_df.to_csv(os.path.join(output_dir, 'test.csv'), index=False)

print("Step 1 complete.")

Checking for /kaggle/working/cs_top10_3000_each.csv...
Loading cs_top10_3000_each.csv...
Loaded 30000 rows
Columns: ['text', 'category', 'label']

[PRINT] Category distribution (before split):
category
cs.CV    3000
cs.LG    3000
cs.CL    3000
cs.IT    3000
cs.RO    3000
cs.AI    3000
cs.CR    3000
cs.NI    3000
cs.DS    3000
cs.SE    3000
Name: count, dtype: int64
Splitting dataset...

Train size: 24000, Val size: 3000, Test size: 3000

[PRINT] Train set distribution (category: count):
category
cs.CR    2400
cs.AI    2400
cs.CL    2400
cs.DS    2400
cs.RO    2400
cs.LG    2400
cs.NI    2400
cs.IT    2400
cs.SE    2400
cs.CV    2400
Name: count, dtype: int64

[PRINT] Validation set distribution (category: count):
category
cs.DS    300
cs.NI    300
cs.CR    300
cs.RO    300
cs.AI    300
cs.CV    300
cs.IT    300
cs.LG    300
cs.CL    300
cs.SE    300
Name: count, dtype: int64

[PRINT] Test set distribution (category: count):
category
cs.IT    300
cs.RO    300
cs.DS    300
cs.AI    300
c

In [3]:
import pandas as pd
from transformers import LongformerTokenizerFast
import torch
import pickle
import logging
import os

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/step2_log.txt'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)  # Fixed _name_ to __name__

# Define input/output directories
input_dir = '/kaggle/working/'
output_dir = '/kaggle/working/'

# Verify train.csv and val.csv exist
train_path = os.path.join(input_dir, 'train.csv')
val_path = os.path.join(input_dir, 'val.csv')

print(f"Checking for train.csv at {train_path}...")
print(f"Checking for val.csv at {val_path}...")
if not os.path.exists(train_path) or not os.path.exists(val_path):
    logger.error("train.csv or val.csv not found in /kaggle/working/.")
    raise FileNotFoundError("train.csv or val.csv not found. Please run Step 1 first.")

# Load datasets
print("Loading datasets...")
logger.info("Loading datasets...")
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
print(f"Loaded {len(train_df)} rows in train set")
print(f"Loaded {len(val_df)} rows in val set")

# Load tokenizer
print("Loading Longformer tokenizer...")
logger.info("Loading Longformer tokenizer...")
try:
    tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
    tokenizer.model_max_length = 1024  # Adjust to match your previous setup
except Exception as e:
    logger.error(f"Error loading tokenizer: {e}")
    raise

# Tokenization function
def tokenize_data(df, max_length=1024):  # Adjusted max_length to 1024
    texts = df['text'].tolist()
    labels = df['label'].tolist()
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    }

# Tokenize in batches
batch_size = 100
train_tokenized = []
val_tokenized = []

print("Tokenizing training data...")
logger.info("Tokenizing training data...")
for i in range(0, len(train_df), batch_size):
    batch_df = train_df[i:i + batch_size]
    tokenized_batch = tokenize_data(batch_df)
    train_tokenized.append(tokenized_batch)
    print(f"Tokenized train batch {i//batch_size + 1}/{len(train_df)//batch_size + 1}")
    logger.info(f"Tokenized train batch {i//batch_size + 1}/{len(train_df)//batch_size + 1}")

print("Tokenizing validation data...")
logger.info("Tokenizing validation data...")
for i in range(0, len(val_df), batch_size):
    batch_df = val_df[i:i + batch_size]
    tokenized_batch = tokenize_data(batch_df)
    val_tokenized.append(tokenized_batch)
    print(f"Tokenized val batch {i//batch_size + 1}/{len(val_df)//batch_size + 1}")
    logger.info(f"Tokenized val batch {i//batch_size + 1}/{len(val_df)//batch_size + 1}")

# Save tokenized datasets
print(f"Saving tokenized datasets to {output_dir}...")
logger.info(f"Saving tokenized datasets to {output_dir}...")
with open(os.path.join(output_dir, 'train_tokenized.pkl'), 'wb') as f:
    pickle.dump(train_tokenized, f)
with open(os.path.join(output_dir, 'val_tokenized.pkl'), 'wb') as f:
    pickle.dump(val_tokenized, f)

print(f"Train tokenized: {len(train_tokenized)} batches, Val tokenized: {len(val_tokenized)} batches")
logger.info(f"Train tokenized: {len(train_tokenized)} batches, Val tokenized: {len(val_tokenized)} batches")
print("Step 2 complete.")
logger.info("Step 2 complete.")

Checking for train.csv at /kaggle/working/train.csv...
Checking for val.csv at /kaggle/working/val.csv...
Loading datasets...
Loaded 24000 rows in train set
Loaded 3000 rows in val set
Loading Longformer tokenizer...


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Tokenizing training data...
Tokenized train batch 1/241
Tokenized train batch 2/241
Tokenized train batch 3/241
Tokenized train batch 4/241
Tokenized train batch 5/241
Tokenized train batch 6/241
Tokenized train batch 7/241
Tokenized train batch 8/241
Tokenized train batch 9/241
Tokenized train batch 10/241
Tokenized train batch 11/241
Tokenized train batch 12/241
Tokenized train batch 13/241
Tokenized train batch 14/241
Tokenized train batch 15/241
Tokenized train batch 16/241
Tokenized train batch 17/241
Tokenized train batch 18/241
Tokenized train batch 19/241
Tokenized train batch 20/241
Tokenized train batch 21/241
Tokenized train batch 22/241
Tokenized train batch 23/241
Tokenized train batch 24/241
Tokenized train batch 25/241
Tokenized train batch 26/241
Tokenized train batch 27/241
Tokenized train batch 28/241
Tokenized train batch 29/241
Tokenized train batch 30/241
Tokenized train batch 31/241
Tokenized train batch 32/241
Tokenized train batch 33/241
Tokenized train batch 34

In [4]:
### import os
import glob
import re
import torch
import numpy as np
import pickle
import gc
import logging
import time
import pandas as pd
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support
from transformers import (
    LongformerForSequenceClassification,
    LongformerTokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# Setup
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.backends.cuda.matmul.allow_tf32 = True

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/training_log.txt'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

input_dir = '/kaggle/working/'
train_tokenized_path = os.path.join(input_dir, 'train_tokenized.pkl')
val_tokenized_path = os.path.join(input_dir, 'val_tokenized.pkl')
results_dir = os.path.join(input_dir, 'results')
os.makedirs(results_dir, exist_ok=True)

# Checkpoint
def get_latest_checkpoint(results_dir):
    checkpoint_dirs = glob.glob(os.path.join(results_dir, 'checkpoint-*'))
    if not checkpoint_dirs:
        return None
    checkpoint_nums = [int(re.search(r'checkpoint-(\d+)', d).group(1)) for d in checkpoint_dirs]
    return os.path.join(results_dir, f'checkpoint-{max(checkpoint_nums)}')

checkpoint_path = get_latest_checkpoint(results_dir)
print(f"Checkpoint: {checkpoint_path}" if checkpoint_path else "No checkpoints found.")

# Load Data
with open(train_tokenized_path, 'rb') as f:
    train_tokenized = pickle.load(f)
with open(val_tokenized_path, 'rb') as f:
    val_tokenized = pickle.load(f)

def flatten_batches(batched_data):
    """Flatten list of batches into a flat list of dicts with same length input tensors."""
    flat_data = []
    for batch in batched_data:
        for i in range(len(batch['input_ids'])):
            item = {
                'input_ids': batch['input_ids'][i][:1024],
                'attention_mask': batch['attention_mask'][i][:1024],
                'labels': int(batch['labels'][i])
            }
            flat_data.append(item)
    return flat_data

train_dataset = Dataset.from_list(flatten_batches(train_tokenized))
val_dataset = Dataset.from_list(flatten_batches(val_tokenized))

# Inspect the data
print("[PRINT] Sample from train_dataset:")
for i in range(3):
    sample = train_dataset[i]
    text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
    label = sample['labels']
    category = ['cs.CV', 'cs.LG', 'cs.CL', 'cs.IT', 'cs.RO', 'cs.AI', 'cs.CR', 'cs.NI', 'cs.DS', 'cs.SE'][label]
    print(f"Sample {i+1}: Label={label} ({category}), Text (first 100 chars): {text[:100]}...")

print("\n[PRINT] Label distribution in train_dataset:")
train_labels = [sample['labels'] for sample in train_dataset]
print(pd.Series(train_labels).value_counts())

print("\n[PRINT] Label distribution in val_dataset:")
val_labels = [sample['labels'] for sample in val_dataset]
print(pd.Series(val_labels).value_counts())

# Analyze token lengths
print("\n[PRINT] Analyzing token lengths...")
token_lengths = [sum(sample['attention_mask']) for sample in train_dataset]
print(f"Token length stats: Mean={np.mean(token_lengths):.1f}, Median={np.median(token_lengths):.1f}, Max={max(token_lengths)}, Min={min(token_lengths)}")
print(f"Percentage of samples truncated (length >= 1024): {100 * sum(1 for length in token_lengths if length >= 1024) / len(token_lengths):.2f}%")

del train_tokenized, val_tokenized
gc.collect()
torch.cuda.empty_cache()

# Tokenizer & Model
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
tokenizer.model_max_length = 1024

if checkpoint_path:
    model = LongformerForSequenceClassification.from_pretrained(
        checkpoint_path, num_labels=10, ignore_mismatched_sizes=True
    )
else:
    model = LongformerForSequenceClassification.from_pretrained(
        'allenai/longformer-base-4096', num_labels=10
    )

model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': (preds == labels).mean(),
        'precision_weighted': precision,
        'recall_weighted': recall,
        'f1_weighted': f1
    }

# Training Arguments
training_args = TrainingArguments(
    output_dir=results_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Increased for effective batch size of 16
    warmup_steps=200,  # Reduced warmup steps
    learning_rate=5e-5,  # Lowered learning rate
    weight_decay=0.01,
    max_grad_norm=1.0,  # Added gradient clipping
    logging_dir=os.path.join(input_dir, 'logs'),
    logging_steps=10,
    logging_first_step=True,
    eval_strategy='steps',  # Evaluate more frequently
    eval_steps=500,
    save_strategy='epoch',
    load_best_model_at_end=False,
    fp16=True,
    report_to='none',
    log_level="info",
    disable_tqdm=False
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer, padding=True),
)

# Training
print("Starting training...")
start_time = time.time()
trainer.train(resume_from_checkpoint=checkpoint_path)
end_time = time.time()

# Save Final Model
final_model_path = os.path.join(input_dir, 'final_model')
trainer.save_model(final_model_path)
print(f"Model saved to {final_model_path}")
print(f"Training completed in {(end_time - start_time)/60:.2f} minutes.")

# Save Final Evaluation Metrics
metrics = trainer.evaluate()
pd.DataFrame([metrics]).to_csv(os.path.join(input_dir, "final_eval_metrics.csv"), index=False)
print("Metrics saved to final_eval_metrics.csv")

# Cleanup
del model, trainer
torch.cuda.empty_cache()
gc.collect()

2025-05-12 19:42:38.447490: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747078958.649406      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747078958.703359      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


No checkpoints found.
[PRINT] Sample from train_dataset:
Sample 1: Label=6 (cs.CR), Text (first 100 chars): Near-Optimal Blacklisting Many applications involve agents sharing a resource, such as networks or s...
Sample 2: Label=5 (cs.AI), Text (first 100 chars): An Evolutionary Squeaky Wheel Optimisation Approach to Personnel   Scheduling The quest for robust h...
Sample 3: Label=5 (cs.AI), Text (first 100 chars): Exploring Flexible Scenario Generation in Godot Simulator Cyber-physical systems (CPS) combine cyber...

[PRINT] Label distribution in train_dataset:
6    2400
5    2400
2    2400
8    2400
4    2400
1    2400
7    2400
3    2400
9    2400
0    2400
Name: count, dtype: int64

[PRINT] Label distribution in val_dataset:
8    300
7    300
6    300
4    300
5    300
0    300
3    300
1    300
2    300
9    300
Name: count, dtype: int64

[PRINT] Analyzing token lengths...
Token length stats: Mean=244.5, Median=240.0, Max=682, Min=16
Percentage of samples truncated (length >= 1024)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using auto half precision backend
***** Running training *****
  Num examples = 24,000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 4,500
  Number of trainable parameters = 148,667,146
Initializing global attention on CLS token...
Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Starting training...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

Step,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted
500,0.892,0.819998,0.747,0.786221,0.747,0.729812
1000,0.7068,0.677535,0.784667,0.792944,0.784667,0.781199
1500,0.4481,0.589717,0.811,0.81278,0.811,0.806278
2000,0.4214,0.564824,0.818333,0.819993,0.818333,0.815506
2500,0.3441,0.570783,0.818667,0.818882,0.818667,0.814667
3000,0.4265,0.528338,0.825333,0.826319,0.825333,0.823077
3500,0.3112,0.595178,0.829667,0.827963,0.829667,0.826954
4000,0.3337,0.579614,0.828333,0.825809,0.828333,0.8251
4500,0.3177,0.557813,0.834,0.831747,0.834,0.831872



***** Running Evaluation *****
  Num examples = 3000
  Batch size = 4

***** Running Evaluation *****
  Num examples = 3000
  Batch size = 4

***** Running Evaluation *****
  Num examples = 3000
  Batch size = 4
Saving model checkpoint to /kaggle/working/results/checkpoint-1500
Configuration saved in /kaggle/working/results/checkpoint-1500/config.json
Model weights saved in /kaggle/working/results/checkpoint-1500/model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in /kaggle/working/results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in /kaggle/working/results/checkpoint-1500/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 3000
  Batch size = 4

***** Running Evaluation *****
  Num examples = 3000
  Batch size = 4

***** Running Evaluation *****
  Num examples = 3000
  Batch size = 4
Saving model checkpoint to /kaggle/working/results/checkpoint-3000
Configurat

Model saved to /kaggle/working/final_model
Training completed in 274.14 minutes.


Metrics saved to final_eval_metrics.csv


5744