In [1]:
import pandas as pd
import csv
import os

# Path to your CSV
file_path = '/kaggle/input/physics/physics_papers.csv'

# Load CSV
print("Loading dataset...")
df = pd.read_csv(file_path, quoting=csv.QUOTE_ALL, on_bad_lines='warn', engine='python')

# Extract physics.* subcategories
physics_prefix = 'physics.'

def extract_physics_subcategories(categories_series):
    physics_subcats = set()
    for entry in categories_series.dropna():
        for cat in entry.split():
            if cat.startswith(physics_prefix):
                physics_subcats.add(cat[len(physics_prefix):])
    return sorted(physics_subcats)

# Apply
print("Extracting physics subcategories...")
physics_subcategories = extract_physics_subcategories(df['categories'])

# Display results
print(f"Found {len(physics_subcategories)} physics subcategories:")
for cat in physics_subcategories:
    print(cat)


Loading dataset...
Extracting physics subcategories...
Found 22 physics subcategories:
acc-ph
ao-ph
app-ph
atm-clus
atom-ph
bio-ph
chem-ph
class-ph
comp-ph
data-an
ed-ph
flu-dyn
gen-ph
geo-ph
hist-ph
ins-det
med-ph
optics
plasm-ph
pop-ph
soc-ph
space-ph


In [2]:
import pandas as pd
import csv
import os
import json

# Path to your CSV
file_path = '/kaggle/input/physics/physics_papers.csv'

# Load CSV
print("Loading dataset...")
df = pd.read_csv(file_path, quoting=csv.QUOTE_ALL, on_bad_lines='warn', engine='python')

# Enrich text column with title + abstract
df['text'] = df['title'].astype(str) + " " + df['abstract'].astype(str)

# Extract physics.* subcategories
physics_prefix = 'physics.'

def extract_physics_subcategories(categories_series):
    physics_subcats = set()
    for entry in categories_series.dropna():
        for cat in entry.split():
            if cat.startswith(physics_prefix):
                physics_subcats.add(cat[len(physics_prefix):])
    return sorted(physics_subcats)

# Apply
print("Extracting physics subcategories...")
physics_subcategories = extract_physics_subcategories(df['categories'])

# Display all subcategories
print(f"Found {len(physics_subcategories)} physics subcategories:")
for cat in physics_subcategories:
    print(cat)

# Extract primary physics category from 'categories' column
def extract_primary_category(cat_string):
    if pd.isna(cat_string):
        return None
    for cat in cat_string.split():
        if cat.startswith(physics_prefix):
            return cat[len(physics_prefix):]
    return None

# Create 'category' column based on the 'categories' column
print("Creating 'category' column...")
df['category'] = df['categories'].apply(extract_primary_category)

# Count categories (no minimum row threshold)
category_counts = df['category'].value_counts()
valid_categories = category_counts.index.tolist()

print(f"\nUsing {len(valid_categories)} physics subcategories:")
print(valid_categories)

# Filter to valid categories only
df = df[df['category'].isin(valid_categories)].copy()

# Assign integer labels
label_map = {cat: i for i, cat in enumerate(valid_categories)}
df['label'] = df['category'].map(label_map)

# Select up to 10,000 rows per category
max_rows_per_category = 10000
print(f"\nSampling up to {max_rows_per_category} rows per subcategory...")
balanced_df = pd.concat([
    df[df['category'] == cat].sample(n=min(len(df[df['category'] == cat]), max_rows_per_category), random_state=42)
    for cat in valid_categories
], ignore_index=True)

print(f"\nFinal dataset size: {len(balanced_df)}")
print("Label distribution:")
print(balanced_df['label'].value_counts())

# Save results
output_dir = '/kaggle/working/'

# Saving the processed dataset
balanced_df.to_csv(os.path.join(output_dir, 'balanced_physics_papers.csv'), index=False)

# Save label mapping
label_map_path = os.path.join(output_dir, 'label_map.json')
with open(label_map_path, 'w') as f:
    json.dump(label_map, f, indent=2)

print(f"\nSaved balanced dataset to: {output_dir}/balanced_physics_papers.csv")
print(f"Saved label map to: {label_map_path}")


Loading dataset...
Extracting physics subcategories...
Found 22 physics subcategories:
acc-ph
ao-ph
app-ph
atm-clus
atom-ph
bio-ph
chem-ph
class-ph
comp-ph
data-an
ed-ph
flu-dyn
gen-ph
geo-ph
hist-ph
ins-det
med-ph
optics
plasm-ph
pop-ph
soc-ph
space-ph
Creating 'category' column...

Using 22 physics subcategories:
['optics', 'flu-dyn', 'soc-ph', 'atom-ph', 'chem-ph', 'ins-det', 'comp-ph', 'app-ph', 'plasm-ph', 'bio-ph', 'gen-ph', 'data-an', 'class-ph', 'acc-ph', 'med-ph', 'ao-ph', 'geo-ph', 'hist-ph', 'space-ph', 'ed-ph', 'atm-clus', 'pop-ph']

Sampling up to 10000 rows per subcategory...

Final dataset size: 165688
Label distribution:
label
0     10000
1     10000
2     10000
3     10000
4     10000
5     10000
6     10000
7     10000
8     10000
9     10000
10     9291
11     7705
12     7077
13     7002
14     6551
15     5760
16     5653
17     4724
18     4601
19     3620
20     1970
21     1734
Name: count, dtype: int64

Saved balanced dataset to: /kaggle/working//balanced_physi

In [3]:
import pandas as pd
import csv
import os
import json
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from transformers import LongformerTokenizerFast
from datasets import Dataset
import logging
from collections import Counter
import torch

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('/kaggle/working/preprocessing_log.txt'), logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

# Load CSV
file_path = '/kaggle/input/physics/physics_papers.csv'
print("Loading dataset...")
df = pd.read_csv(file_path, quoting=csv.QUOTE_ALL, on_bad_lines='warn', engine='python')

# Combine title and abstract
df['title_abstract'] = df['title'].astype(str) + " " + df['abstract'].astype(str)

# Extract physics.* subcategories
physics_prefix = 'physics.'

def extract_physics_subcategories(categories_series):
    physics_subcats = set()
    for entry in categories_series.dropna():
        for cat in entry.split():
            if cat.startswith(physics_prefix):
                physics_subcats.add(cat[len(physics_prefix):])
    return sorted(physics_subcats)

print("Extracting physics subcategories...")
physics_subcategories = extract_physics_subcategories(df['categories'])

print(f"Found {len(physics_subcategories)} physics subcategories:")
for cat in physics_subcategories:
    print(cat)

# Extract primary physics category
def extract_primary_category(cat_string):
    if pd.isna(cat_string):
        return None
    for cat in cat_string.split():
        if cat.startswith(physics_prefix):
            return cat[len(physics_prefix):]
    return None

df['category'] = df['categories'].apply(extract_primary_category)

# Filter to physics subcategories
df = df[df['category'].isin(physics_subcategories)].copy()

# Label mapping
label_map = {cat: i for i, cat in enumerate(physics_subcategories)}
df['label'] = df['category'].map(label_map)

# Balance data: up to 10,000 per category
max_rows_per_category = 10000
print(f"\nSampling up to {max_rows_per_category} rows per subcategory...")
balanced_df = pd.concat([
    df[df['category'] == cat].sample(n=min(len(df[df['category'] == cat]), max_rows_per_category), random_state=42)
    for cat in physics_subcategories
], ignore_index=True)

print(f"\nFinal dataset size: {len(balanced_df)}")
print("Label distribution:")
print(balanced_df['label'].value_counts())

# Save label map
output_dir = '/kaggle/working/'
label_map_path = os.path.join(output_dir, 'label_map.json')
with open(label_map_path, 'w') as f:
    json.dump(label_map, f, indent=2)

# Train/Val/Test split
train_df, temp_df = train_test_split(balanced_df, test_size=0.3, stratify=balanced_df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

# Save test set for evaluation
test_df.to_csv(os.path.join(output_dir, 'test_physics_papers.csv'), index=False)

# Tokenizer setup
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')

def tokenize_batch(texts, batch_size=32):
    tokenized = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        encodings = tokenizer(
            batch_texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=1024,
            return_tensors='pt'
        )
        tokenized.append({
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
        })
    return tokenized

# Tokenize
print("Tokenizing train and val sets...")
train_texts = train_df['title_abstract']
val_texts = val_df['title_abstract']
train_labels = train_df['label']
val_labels = val_df['label']

train_tokenized = tokenize_batch(train_texts)
val_tokenized = tokenize_batch(val_texts)

# Add labels
for batch, labels in zip(train_tokenized, [train_labels[i:i+32] for i in range(0, len(train_labels), 32)]):
    batch['labels'] = torch.tensor(labels.values, dtype=torch.long)
for batch, labels in zip(val_tokenized, [val_labels[i:i+32] for i in range(0, len(val_labels), 32)]):
    batch['labels'] = torch.tensor(labels.values, dtype=torch.long)

# Save tokenized data
with open(os.path.join(output_dir, 'traintokenized.pkl'), 'wb') as f:
    pickle.dump(train_tokenized, f)
with open(os.path.join(output_dir, 'valtokenized.pkl'), 'wb') as f:
    pickle.dump(val_tokenized, f)

print("Preprocessing complete. Tokenized datasets saved.")


Loading dataset...
Extracting physics subcategories...
Found 22 physics subcategories:
acc-ph
ao-ph
app-ph
atm-clus
atom-ph
bio-ph
chem-ph
class-ph
comp-ph
data-an
ed-ph
flu-dyn
gen-ph
geo-ph
hist-ph
ins-det
med-ph
optics
plasm-ph
pop-ph
soc-ph
space-ph

Sampling up to 10000 rows per subcategory...

Final dataset size: 165688
Label distribution:
label
4     10000
2     10000
5     10000
11    10000
8     10000
6     10000
18    10000
20    10000
17    10000
15    10000
12     9291
9      7705
7      7077
0      7002
16     6551
1      5760
13     5653
14     4724
21     4601
10     3620
3      1970
19     1734
Name: count, dtype: int64
Train size: 115981, Val size: 24853, Test size: 24854


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Tokenizing train and val sets...
Preprocessing complete. Tokenized datasets saved.


In [None]:
import pandas as pd
import csv
import os
import json
import numpy as np
import pickle
import torch
import gc
import glob
import re
import shutil
import logging
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    LongformerTokenizerFast,
    LongformerForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# Setup
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/training_log.txt'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# File paths
input_dir = '/kaggle/working/'
train_tokenized_path = os.path.join(input_dir, 'traintokenized.pkl')
val_tokenized_path = os.path.join(input_dir, 'valtokenized.pkl')
results_dir = os.path.join(input_dir, 'results')
os.makedirs(results_dir, exist_ok=True)

# Tokenizer
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
tokenizer.model_max_length = 512

# Load tokenized data
with open(train_tokenized_path, 'rb') as f:
    train_tokenized = pickle.load(f)
with open(val_tokenized_path, 'rb') as f:
    val_tokenized = pickle.load(f)

# Adaptive attention masks
def generate_adaptive_attention_masks(input_ids, tokenizer, important_tokens=['<s>']):
    attention_masks = []
    important_token_ids = tokenizer.convert_tokens_to_ids(important_tokens)
    for seq in input_ids:
        mask = torch.zeros(len(seq), dtype=torch.long)
        for token_id in important_token_ids:
            mask |= (torch.tensor(seq) == token_id)
        attention_masks.append(mask.tolist())
    return attention_masks

# Convert tokenized data to Huggingface Dataset
def convert_to_dataset(tokenized_data):
    input_ids, attention_mask, labels = [], [], []

    for batch in tokenized_data:
        input_ids.extend(batch['input_ids'])
        attention_mask.extend(batch['attention_mask'])
        batch_labels = batch['labels'].tolist() if isinstance(batch['labels'], torch.Tensor) else batch['labels']
        labels.extend(batch_labels)

    global_attention_mask = generate_adaptive_attention_masks(input_ids, tokenizer)

    dataset = Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'global_attention_mask': global_attention_mask,
        'labels': labels
    })
    return dataset.map(lambda x: {'labels': int(x['labels'])})

# Prepare datasets
train_dataset = convert_to_dataset(train_tokenized)
val_dataset = convert_to_dataset(val_tokenized)
del train_tokenized, val_tokenized
gc.collect()
torch.cuda.empty_cache()

# Class weights
train_labels = np.array(train_dataset['labels'])
num_labels = len(np.unique(train_labels))
class_weights_tensor = torch.tensor(
    compute_class_weight(class_weight='balanced', classes=np.arange(num_labels), y=train_labels),
    dtype=torch.float
)

# Load model
checkpoint_dirs = glob.glob(os.path.join(results_dir, 'checkpoint-*'))
checkpoint_path = max(checkpoint_dirs, key=os.path.getmtime) if checkpoint_dirs else None
model = LongformerForSequenceClassification.from_pretrained(
    checkpoint_path if checkpoint_path else 'allenai/longformer-base-4096',
    num_labels=num_labels,
    attention_window=256,
    ignore_mismatched_sizes=True
)
model.gradient_checkpointing_enable()
model.to(torch.device('cuda'))

# Custom trainer with class weights
class WeightedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training arguments
training_args = TrainingArguments(
    output_dir=results_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir=os.path.join(input_dir, 'logs'),
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    fp16=True,
    report_to='none'
)

# Train
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: {'accuracy': (np.argmax(pred.predictions, axis=1) == pred.label_ids).mean()},
    data_collator=DataCollatorWithPadding(tokenizer),
    class_weights=class_weights_tensor
)

logger.info("Starting training...")
trainer.train(resume_from_checkpoint=checkpoint_path if checkpoint_path else None)

# Save final model
final_model_path = os.path.join(input_dir, 'final_model')
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

# Evaluate
metrics = trainer.evaluate()
pd.DataFrame([metrics]).to_csv(os.path.join(input_dir, "final_eval_metrics.csv"), index=False)
logger.info(f"Training completed. Model saved to {final_model_path}. Metrics saved.")


2025-05-09 09:44:53.730936: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746783893.916051      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746783893.967810      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  mask |= (torch.tensor(seq) == token_id)


Map:   0%|          | 0/115981 [00:00<?, ? examples/s]

Map:   0%|          | 0/24853 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
