In [None]:
import json
import pandas as pd
from collections import Counter
import os

# Step 1: Read the arXiv metadata snapshot, parsing each line as its own JSON record
input_path = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
print("Loading arXiv JSON lines dataset...")
with open(input_path, 'r') as snapshot:
    # One parsed object per input line, collected in file order.
    data = [json.loads(record) for record in snapshot]
print(f"Total papers loaded: {len(data)}")

# Step 2: Build a DataFrame from the parsed records and show its columns
df = pd.DataFrame(data)
print("Available columns:")
print(list(df.columns))

# Step 3: Tally papers per category.
# Each 'categories' value is split on whitespace, so a paper adds one count
# to every category code it lists.
category_counts = Counter(
    cat
    for cats in df['categories']
    for cat in cats.split()
)

category_df = (
    pd.DataFrame(category_counts.items(), columns=['Category', 'Count'])
    .sort_values('Count', ascending=False)
)

# Step 4: Keep the categories whose count is strictly greater than 5,000
is_popular = category_df['Count'] > 5000
popular_categories = category_df.loc[is_popular, 'Category'].tolist()
print(f"\nCategories with > 5,000 papers: {len(popular_categories)}")
print(category_df[category_df['Category'].isin(popular_categories)].head(20))

# Step 5: Filter papers that have at least one popular category
# The membership test below runs once per listed category of every paper, so
# look categories up in a set instead of scanning the popular_categories list.
popular_category_set = frozenset(popular_categories)

def has_popular_category(category_str):
    """Return True if any category listed in `category_str` is popular.

    Args:
        category_str: whitespace-separated category codes (one paper's
            'categories' value).

    Returns:
        bool: True when at least one code is in `popular_category_set`,
        False otherwise (including for an empty string).
    """
    return any(cat in popular_category_set for cat in category_str.split())

filtered_df = df[df['categories'].apply(has_popular_category)]
print(f"\nFiltered papers: {len(filtered_df)}")

# Step 6: Narrow to the text/category columns, drop rows with any missing
# value, and add the combined "title abstract" text column
filtered_df = (
    filtered_df[['title', 'abstract', 'categories']]
    .dropna()
    .assign(title_abstract=lambda frame: frame['title'] + " " + frame['abstract'])
)

# Step 7: Write both tables to the working directory
output_dir = '/kaggle/working'
os.makedirs(output_dir, exist_ok=True)

filtered_csv_path = os.path.join(output_dir, 'filtered_arxiv_papers.csv')
stats_csv_path = os.path.join(output_dir, 'category_statistics.csv')

for table, destination in ((filtered_df, filtered_csv_path), (category_df, stats_csv_path)):
    table.to_csv(destination, index=False)

# Step 8: Report what was produced
print("\n Processing Complete")
print(f"Original total papers: {len(df)}")
print(f"Filtered papers: {len(filtered_df)}")
print(f"Filtered dataset saved to: {filtered_csv_path}")
print(f"Category statistics saved to: {stats_csv_path}")
print("\nSample rows:")
print(filtered_df.head())

Loading arXiv JSON lines dataset...
Total papers loaded: 2730173
Available columns:
['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed']

Categories with > 5,000 papers: 128
               Category   Count
96                cs.LG  215824
0                hep-ph  187339
13               hep-th  173525
27             quant-ph  161515
114               cs.CV  154368
42                cs.AI  124844
7                 gr-qc  113475
9              astro-ph  105380
8     cond-mat.mtrl-sci   99501
6     cond-mat.mes-hall   95273
34              math.MP   83934
33              math-ph   83934
126               cs.CL   83033
20      cond-mat.str-el   77684
21   cond-mat.stat-mech   76808
136         astro-ph.CO   71512
1               math.CO   71159
110             stat.ML   70661
144         astro-ph.GA   69844
66              math.AP   67149

Filtered papers: 2702634


In [None]:
import pandas as pd
import os
import csv

# File paths
file_path = '/kaggle/working/filtered_arxiv_papers.csv'
stats_path = '/kaggle/working/category_statistics.csv'

# Fail early with a clear message if the filtered papers CSV is missing
print(f"Checking for {file_path}...")
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Load dataset
print("Loading filtered_arxiv_papers.csv...")
df = pd.read_csv(file_path)
print(f"Loaded {len(df)} rows")
print(f"Columns: {df.columns.tolist()}")

# Enrich text (only when the CSV does not already carry the combined column)
if 'title_abstract' not in df.columns:
    print("Enriching text column with title + abstract...")
    # Fill missing values BEFORE casting to str: astype(str) on its own turns
    # NaN into the literal text "nan", which would leak into the combined text.
    title_text = df['title'].fillna('').astype(str)
    abstract_text = df['abstract'].fillna('').astype(str)
    df['title_abstract'] = title_text + " " + abstract_text

# Load and filter category stats for cs.* only
print("Loading category statistics...")
stats_df = pd.read_csv(stats_path)
stats_df = stats_df[stats_df['Category'].str.startswith('cs.')]
stats_df = stats_df.sort_values('Count', ascending=False)

# Every cs.* category from the statistics file, most frequent first
top_categories = stats_df['Category'].tolist()
print(f"\nUsing ALL {len(top_categories)} cs categories:")
print(top_categories)


In [None]:
import pandas as pd
import os
import csv

# Input and output locations for this stage
file_path = '/kaggle/working/filtered_arxiv_papers.csv'
stats_path = '/kaggle/working/category_statistics.csv'
output_path = '/kaggle/working/cs_top10_dataset_labeled_balanced.csv'
label_map_path = '/kaggle/working/label_map_cs_top10.csv'

# Stop immediately if the filtered papers CSV is not there
print(f"Checking for {file_path}...")
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Read the filtered papers
print("Loading filtered_arxiv_papers.csv...")
df = pd.read_csv(file_path)
print(f"Loaded {len(df)} rows")
print(f"Columns: {df.columns.tolist()}")

# Build the combined text column only when the CSV does not already have it
if 'title_abstract' not in df.columns:
    print("Creating 'title_abstract' from title + abstract...")
    for text_col in ('title', 'abstract'):
        # Missing values become empty strings so the concatenation stays textual.
        df[text_col] = df[text_col].fillna('')
    df['title_abstract'] = df['title'] + " " + df['abstract']

# Category statistics: keep cs.* rows, order by count, take the ten largest
print("Loading and filtering category statistics...")
stats_df = pd.read_csv(stats_path)
is_cs_category = stats_df['Category'].str.startswith('cs.')
stats_df = stats_df[is_cs_category].sort_values('Count', ascending=False)
top_categories = stats_df['Category'].head(10).tolist()

print("\nUsing Top 10 cs categories:")
print(top_categories)

# Category -> integer label, numbered in the order of top_categories
label_map = dict(zip(top_categories, range(len(top_categories))))

# Pick the category used as the classification target for one paper
def extract_primary_category(cat_str):
    """Return the first listed category that is a labelled cs.* category.

    `cat_str` is split on whitespace and scanned left to right; the first
    token that both starts with 'cs.' and is a key of the module-level
    `label_map` is returned.

    Returns:
        The matching category string, or None when `cat_str` is missing
        (per `pd.isna`) or no token qualifies.
    """
    if pd.isna(cat_str):
        return None
    candidates = (
        token
        for token in cat_str.split()
        if token.startswith('cs.') and token in label_map
    )
    return next(candidates, None)

# Apply category extraction and label mapping
print("Mapping categories to labels...")
df['category'] = df['categories'].apply(extract_primary_category)
df['label'] = df['category'].map(label_map)

# Filter to keep only rows from top 10 cs categories
# (rows with no matching category have a null label after the map above)
df = df[df['label'].notnull()].copy()

# Balance: keep 3000 rows per category
print("\nBalancing to 3000 papers per category...")
balanced_dfs = []
skipped = []
kept_categories = []  # categories that made it into the balanced set, in label order

for cat in top_categories:
    cat_df = df[df['category'] == cat]
    if len(cat_df) >= 3000:
        # Fixed seed so the same 3000 rows are drawn on every run.
        balanced_dfs.append(cat_df.sample(n=3000, random_state=42))
        kept_categories.append(cat)
    else:
        skipped.append((cat, len(cat_df)))
        print(f"⚠ Skipped {cat} — only {len(cat_df)} papers (needs 3000)")

if not balanced_dfs:
    raise ValueError("No category has enough samples!")

df_balanced = pd.concat(balanced_dfs).reset_index(drop=True)

# Re-number labels over the categories that were actually kept. If a category
# is skipped, the original numbering has a gap, and the training cell assumes
# labels are exactly 0..num_labels-1 (it builds classes with
# np.arange(num_labels)). When nothing is skipped this mapping is identical to
# the original one. The cast stores labels as integers rather than the floats
# produced by mapping a column that contained missing categories.
label_map = {cat: idx for idx, cat in enumerate(kept_categories)}
df_balanced['label'] = df_balanced['category'].map(label_map).astype(int)

# Save label map to CSV (only categories present in the balanced dataset)
pd.Series(label_map).to_csv(label_map_path, header=['Label'], index_label='Category')
print(f"Saved label mapping to {label_map_path}")

# The separate title/abstract columns are not written to the output file
df_balanced = df_balanced.drop(['title', 'abstract'], axis='columns')

# Write the balanced dataset
df_balanced.to_csv(output_path, index=False)
print(f"Saved cleaned and balanced dataset to {output_path}")

# Read the file back and report papers per category
print("\n--- Category Distribution ---")
df_loaded = pd.read_csv(output_path)
category_counts = df_loaded['category'].value_counts()

# Label of the first row of each category, looked up once instead of per iteration
first_label_by_category = (
    df_loaded.drop_duplicates(subset='category').set_index('category')['label']
)

for category, count in category_counts.items():
    label = first_label_by_category[category]
    print(f"Category: {category} | Label: {label} | Papers: {count}")

print("\nTotal unique categories:", df_loaded['category'].nunique())
print("Total number of papers:", df_loaded.shape[0])


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Read the balanced, labelled dataset
input_path = '/kaggle/working/cs_top10_dataset_labeled_balanced.csv'
df = pd.read_csv(input_path)

print(f"Loaded {len(df)} rows from balanced dataset")
print(df['label'].value_counts())

# Two stratified splits with the same seed: hold out 20% of the rows,
# then halve the held-out part into validation and test (80/10/10 overall).
split_seed = 42
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=split_seed,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=split_seed,
)

# Write each split to its own CSV
output_dir = '/kaggle/working/'
for file_name, split_df in (('train.csv', train_df), ('val.csv', val_df), ('test.csv', test_df)):
    split_df.to_csv(os.path.join(output_dir, file_name), index=False)

# Row counts per split
print("\nSaved splits:")
print(f" Train: {len(train_df)}")
print(f" Val:   {len(val_df)}")
print(f" Test:  {len(test_df)}")


In [None]:
import pandas as pd
from transformers import LongformerTokenizerFast
import torch
import pickle
import logging
import os
import re

# Logging: INFO and above goes both to a log file and to the cell output
log_format = '%(asctime)s - %(levelname)s - %(message)s'
log_handlers = [
    logging.FileHandler('/kaggle/working/cs_step2_longformer_log.txt'),
    logging.StreamHandler(),
]
logging.basicConfig(level=logging.INFO, format=log_format, handlers=log_handlers)
logger = logging.getLogger(__name__)

# Directories read from and written to by this cell
input_dir = '/kaggle/working/'
output_dir = '/kaggle/working/'

# Walk input_dir looking for train.csv and val.csv; stop once both are known
train_path, val_path = None, None
for folder, _subdirs, names in os.walk(input_dir):
    if 'train.csv' in names:
        train_path = os.path.join(folder, 'train.csv')
    if 'val.csv' in names:
        val_path = os.path.join(folder, 'val.csv')
    if train_path and val_path:
        break

if not (train_path and val_path):
    logger.error("Missing train.csv or val.csv.")
    raise FileNotFoundError("train.csv or val.csv not found in /kaggle/working/")

logger.info(f"Found train: {train_path}")
logger.info(f"Found val: {val_path}")

# Tokenizer for the pretrained Longformer checkpoint
logger.info("Loading Longformer tokenizer...")
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')

# Read both splits
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# Rows with no text get a placeholder string
for split_df in (train_df, val_df):
    split_df['title_abstract'] = split_df['title_abstract'].fillna("No content")

# Text normalisation applied before tokenization
def clean_text(text):
    """Return `text` with punctuation removed, whitespace collapsed, lowercased.

    Every character that is neither a word character (`\\w`, which includes
    underscore and digits) nor whitespace is replaced by a space; runs of
    whitespace then collapse to a single space, the ends are stripped, and
    the result is lowercased.
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip().lower()

logger.info("Cleaning special characters from text...")
# Same normalisation for both splits; values are cast to str before cleaning
for split_df in (train_df, val_df):
    split_df['title_abstract'] = split_df['title_abstract'].astype(str).apply(clean_text)

# Tokenization function for Longformer
def tokenize_data(df, max_length=4096, padding='longest'):
    """Tokenize the 'title_abstract' column of `df` and attach integer labels.

    Uses the module-level `tokenizer` (assumed to be a Hugging Face tokenizer
    callable accepting truncation/padding/max_length/return_tensors).

    Args:
        df: DataFrame with a 'title_abstract' text column and a 'label'
            column castable to int.
        max_length: upper bound on tokens per text; longer texts are truncated.
        padding: padding strategy forwarded to the tokenizer. The default
            'longest' pads only up to the longest text in this call, instead
            of padding every text to `max_length`. Pass 'max_length' to get
            fixed-width (`max_length`) rows.

    Returns:
        dict with 'input_ids' and 'attention_mask' (tensors as returned by
        the tokenizer with return_tensors='pt') and 'labels' (a tensor built
        from the integer labels).
    """
    texts = df['title_abstract'].tolist()
    labels = df['label'].astype(int).tolist()
    encodings = tokenizer(
        texts,
        truncation=True,
        # Padding everything to max_length=4096 inflates both the pickled
        # tensors and every training step; pad to the batch's longest text.
        padding=padding,
        max_length=max_length,
        return_tensors='pt'
    )
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': torch.tensor(labels)
    }

# Tokenize each split in slices of batch_size rows
batch_size = 100
train_tokenized, val_tokenized = [], []

logger.info("Tokenizing training data...")
train_batch_total = (len(train_df) - 1) // batch_size + 1
for batch_no, start in enumerate(range(0, len(train_df), batch_size), start=1):
    train_tokenized.append(tokenize_data(train_df[start:start + batch_size]))
    logger.info(f"Train batch {batch_no}/{train_batch_total}")

logger.info("Tokenizing validation data...")
val_batch_total = (len(val_df) - 1) // batch_size + 1
for batch_no, start in enumerate(range(0, len(val_df), batch_size), start=1):
    val_tokenized.append(tokenize_data(val_df[start:start + batch_size]))
    logger.info(f"Val batch {batch_no}/{val_batch_total}")

# Persist the per-batch dicts, one pickle file per split
for file_name, payload in (
    ('cs_train_tokenized.pkl', train_tokenized),
    ('cs_val_tokenized.pkl', val_tokenized),
):
    with open(os.path.join(output_dir, file_name), 'wb') as sink:
        pickle.dump(payload, sink)

logger.info(f"Saved: cs_train_tokenized.pkl ({len(train_tokenized)} batches), cs_val_tokenized.pkl ({len(val_tokenized)} batches)")


In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import torch
import gc
import glob
import logging
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    LongformerTokenizerFast,
    LongformerForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# Setup
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# force=True is required here: logging.basicConfig() does nothing when the
# root logger already has handlers, and the tokenization cell configures the
# root logger earlier in the same kernel. Without it the training log file
# below would never be attached and messages would keep going to the old file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/kaggle/working/cs_training_log_longformer.txt'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

# Locations of the tokenized inputs and of the training outputs
input_dir = '/kaggle/working/'
train_tokenized_path = os.path.join(input_dir, 'cs_train_tokenized.pkl')
val_tokenized_path = os.path.join(input_dir, 'cs_val_tokenized.pkl')
results_dir = os.path.join(input_dir, 'cs_results_longformer')
os.makedirs(results_dir, exist_ok=True)

# Tokenizer for the pretrained Longformer checkpoint
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')

# Read back the pickled batches.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on pickles this notebook wrote itself.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as source:
        return pickle.load(source)

train_tokenized = _read_pickle(train_tokenized_path)
val_tokenized = _read_pickle(val_tokenized_path)

# Flatten the per-batch dicts into one Hugging Face Dataset
def convert_to_dataset(tokenized_data):
    """Concatenate tokenized batches into a single `Dataset`.

    Args:
        tokenized_data: iterable of dicts with 'input_ids', 'attention_mask'
            and 'labels'. 'labels' may be a torch.Tensor (converted with
            `.tolist()`) or any other iterable of labels.

    Returns:
        The result of `Dataset.from_dict(...)` over the concatenated columns,
        mapped so that every 'labels' value is a Python int.
    """
    columns = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for chunk in tokenized_data:
        # Extending with a 2-D tensor appends one row per example.
        columns['input_ids'].extend(chunk['input_ids'])
        columns['attention_mask'].extend(chunk['attention_mask'])
        chunk_labels = chunk['labels']
        if isinstance(chunk_labels, torch.Tensor):
            chunk_labels = chunk_labels.tolist()
        columns['labels'].extend(chunk_labels)

    def _label_as_int(example):
        return {'labels': int(example['labels'])}

    return Dataset.from_dict(columns).map(_label_as_int)

# Build the datasets, then release the raw tokenized batches
train_dataset = convert_to_dataset(train_tokenized)
val_dataset = convert_to_dataset(val_tokenized)
del train_tokenized, val_tokenized
gc.collect()
torch.cuda.empty_cache()

# Per-class loss weights from the training labels.
# Classes are taken to be 0..num_labels-1, where num_labels is the number of
# distinct labels seen in the training set.
train_labels = np.array(train_dataset['labels'])
num_labels = len(np.unique(train_labels))
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(num_labels),
    y=train_labels,
)
class_weights_tensor = torch.tensor(balanced_weights, dtype=torch.float)

# Start from the most recently modified checkpoint in results_dir if there is
# one, otherwise from the pretrained base weights
checkpoint_dirs = glob.glob(os.path.join(results_dir, 'checkpoint-*'))
if checkpoint_dirs:
    checkpoint_path = max(checkpoint_dirs, key=os.path.getmtime)
else:
    checkpoint_path = None

model_source = checkpoint_path or 'allenai/longformer-base-4096'
model = LongformerForSequenceClassification.from_pretrained(
    model_source,
    num_labels=num_labels,
)
model.to(torch.device('cuda'))

# Custom trainer with class weights
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: 1-D tensor with one weight per class, or None for an
            unweighted cross-entropy loss. All other positional and keyword
            arguments are forwarded to `Trainer.__init__` unchanged.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally class-weighted) cross-entropy loss.

        `labels` is removed from `inputs` before the forward pass so the
        model does not compute its own loss. `num_items_in_batch` is accepted
        for signature compatibility and is not used.

        Returns:
            `loss`, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The constructor default is class_weights=None; calling .to() on it
        # would raise AttributeError, so fall back to an unweighted loss.
        if self.class_weights is None:
            weight = None
        else:
            weight = self.class_weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training configuration, collected in one mapping and passed as keyword arguments
training_config = {
    # where checkpoints and logs go
    'output_dir': results_dir,
    'logging_dir': os.path.join(input_dir, 'cs_logs_longformer'),
    'logging_steps': 10,
    'report_to': 'none',
    # schedule and batch sizes
    'num_train_epochs': 3,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'gradient_accumulation_steps': 2,
    # optimisation
    'warmup_steps': 500,
    'learning_rate': 1e-4,
    'weight_decay': 0.01,
    'fp16': True,
    # evaluate and checkpoint once per epoch, keeping at most two checkpoints
    'save_strategy': 'epoch',
    'eval_strategy': 'epoch',
    'save_total_limit': 2,
}
training_args = TrainingArguments(**training_config)

# Accuracy over the evaluation set: fraction of rows whose highest-scoring
# class equals the reference label
def _accuracy_metric(pred):
    predicted = np.argmax(pred.predictions, axis=1)
    return {'accuracy': (predicted == pred.label_ids).mean()}

# Assemble the trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_accuracy_metric,
    data_collator=DataCollatorWithPadding(tokenizer),
    class_weights=class_weights_tensor,
)

# Train, resuming from the located checkpoint when there is one
logger.info("Starting Longformer training for CS top-10...")
trainer.train(resume_from_checkpoint=checkpoint_path or None)

# Store the final model together with its tokenizer
final_model_path = os.path.join(input_dir, 'cs_final_model_longformer')
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

# Final evaluation, written out as a one-row CSV
metrics = trainer.evaluate()
metrics_csv_path = os.path.join(input_dir, "cs_final_eval_metrics_longformer.csv")
pd.DataFrame([metrics]).to_csv(metrics_csv_path, index=False)
logger.info(f"Training completed. Model saved to {final_model_path}. Metrics saved.")
