In [4]:
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder

print("[Step 1] Reading arXiv JSON in chunks...")

# Path to dataset (correct if added via Kaggle “Add Data” panel)
file_path = "/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json"

econ_data = pd.DataFrame()
chunk_num = 0

# Memory-efficient loading
for chunk in pd.read_json(file_path, lines=True, chunksize=100_000):
    chunk_num += 1
    print(f"Processing chunk {chunk_num}...")
    
    # Extract primary category (first tag)
    chunk['primary_category'] = chunk['categories'].str.split().str[0]
    
    # Filter for only econ.* categories
    econ_chunk = chunk[chunk['primary_category'].str.startswith('econ.')][['id', 'title', 'abstract', 'categories', 'primary_category']]
    
    # Append to main econ_data
    econ_data = pd.concat([econ_data, econ_chunk])
    
    # Free memory
    del chunk, econ_chunk
    gc.collect()

print(f" Found {len(econ_data)} economics papers.")


[Step 1] Reading arXiv JSON in chunks...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
 Found 8833 economics papers.


In [6]:
# Save for reuse
econ_data.to_csv('/kaggle/working/econ_filtered.csv', index=False)
print(" Saved to /kaggle/working/econ_filtered.csv")


 Saved to /kaggle/working/econ_filtered.csv


# Add Label Column Using LabelEncoder &  Sentence Splitting and Combine Text

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')

# Reload economics dataset
df = pd.read_csv('/kaggle/working/econ_filtered.csv')

# Recreate 'primary_category' column
df['primary_category'] = df['categories'].str.split().str[0]

# ✅ Encode label from primary_category
le = LabelEncoder()
df['label'] = le.fit_transform(df['primary_category'])

label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:")
print(label_mapping)

# Combine title + abstract into text
df['text'] = df['title'] + ". " + df['abstract']

# Tokenize text into list of sentences
df['sentences'] = df['text'].apply(sent_tokenize)

# ✅ Now select only what you need for LNLF
df = df[['sentences', 'label']]
df.head(20)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Label Mapping:
{'econ.EM': 0, 'econ.GN': 1, 'econ.TH': 2}


Unnamed: 0,sentences,label
0,[Fiscal stimulus as an optimal control problem...,1
1,[Identification and Estimation of Multidimensi...,1
2,[Comprehensive Time-Series Regression Models U...,1
3,[On Game-Theoretic Risk Management (Part One) ...,1
4,[On Game-Theoretic Risk Management (Part Two) ...,1
5,[The Mittag-Leffler Fitting of the Phillips Cu...,1
6,[Economic Development and Inequality: a comple...,1
7,[Banks as Tanks: A Continuous-Time Model of Fi...,1
8,[Unfolding the innovation system for the devel...,1
9,[Technology networks: the autocatalytic origin...,1


#  Install & Load Tokenizer

In [15]:
from transformers import LongformerTokenizerFast
tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

# Custom Dataset Class for Longformer-LNLF

In [29]:
from transformers import LongformerTokenizerFast
import torch
from torch.utils.data import Dataset

# Load tokenizer
tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")

# Parameters tuned for P100
MAX_SENT_LEN = 128
MAX_SENTS = 12

class EconLongformerDataset(Dataset):
    def __init__(self, df, tokenizer, max_sent_len=MAX_SENT_LEN, max_sents=MAX_SENTS):
        self.df = df
        self.tokenizer = tokenizer
        self.max_sent_len = max_sent_len
        self.max_sents = max_sents

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sentences = row['sentences'][:self.max_sents]
        label = row['label']

        input_ids = []
        attention_masks = []

        for sent in sentences:
            encoded = self.tokenizer(
                sent,
                padding='max_length',
                truncation=True,
                max_length=self.max_sent_len,
                return_tensors='pt'
            )
            input_ids.append(encoded['input_ids'].squeeze(0))
            attention_masks.append(encoded['attention_mask'].squeeze(0))

        # Pad sentence count to MAX_SENTS
        while len(input_ids) < self.max_sents:
            input_ids.append(torch.zeros(self.max_sent_len, dtype=torch.long))
            attention_masks.append(torch.zeros(self.max_sent_len, dtype=torch.long))

        return {
            'input_ids': torch.stack(input_ids),            # shape: [max_sents, max_sent_len]
            'attention_mask': torch.stack(attention_masks), # shape: [max_sents, max_sent_len]
            'label': torch.tensor(label, dtype=torch.long)
        }


# Create Dataset and DataLoader

In [31]:
from torch.utils.data import DataLoader, random_split

# Create dataset
dataset = EconLongformerDataset(df, tokenizer)

# Split for train/test
train_size = int(0.80 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Dataloaders (batch_size is small due to Longformer memory)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1)

print(" Dataset and DataLoaders ready.")
from torch.utils.data import DataLoader, random_split



 Dataset and DataLoaders ready.


# Define the Longformer-LNLF Model

In [33]:
import torch.nn as nn
from transformers import LongformerModel

class LongformerLNLFClassifier(nn.Module):
    def __init__(self, num_labels, base_model='allenai/longformer-base-4096',
                 max_sent_len=128, max_sents=12, hidden_size=768):
        super(LongformerLNLFClassifier, self).__init__()

        self.max_sents = max_sents
        self.max_sent_len = max_sent_len
        self.hidden_size = hidden_size

        # Longformer encoder
        self.longformer = LongformerModel.from_pretrained(base_model)
        self.longformer.gradient_checkpointing_enable()

        # Document-level transformer over sentence embeddings
        self.doc_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=4),
            num_layers=1
        )

        # Adaptive attention controller
        self.controller = nn.Sequential(
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids, attention_mask, labels=None):
        B, S, T = input_ids.size()  # [batch, max_sents, max_len]

        input_ids = input_ids.view(-1, T)           # [B*S, T]
        attention_mask = attention_mask.view(-1, T) # [B*S, T]

        # Sentence-level encoding
        outputs = self.longformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_tokens = outputs.last_hidden_state[:, 0, :]  # [B*S, H]
        cls_tokens = cls_tokens.view(B, S, self.hidden_size)

        # Document-level transformer
        doc_encoded = self.doc_encoder(cls_tokens)  # [B, S, H]

        # Adaptive weights
        weights = self.controller(doc_encoded).squeeze(-1)  # [B, S]
        soft_weights = torch.sigmoid(weights)               # [B, S]
        pooled = torch.sum(doc_encoded * soft_weights.unsqueeze(-1), dim=1) / \
                 (soft_weights.sum(dim=1, keepdim=True) + 1e-5)  # [B, H]

        logits = self.classifier(pooled)

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        else:
            return {"logits": logits}


In [27]:
import torch
device = torch.device("cpu")


In [37]:
import torch
from torch.optim import AdamW
from tqdm import tqdm

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model
model = LongformerLNLFClassifier(
    num_labels=len(df['label'].unique()),
    max_sent_len=64,
    max_sents=6
).to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 2  # You can increase to 3–5 for better results

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    print(f" Epoch {epoch+1} | Avg Loss: {total_loss / len(train_loader):.4f}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 79.12 MiB is free. Process 5052 has 15.81 GiB memory in use. Of the allocated memory 15.24 GiB is allocated by PyTorch, and 291.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [11]:
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')

# Combine title and abstract
df['text'] = df['title'] + ". " + df['abstract']

# Tokenize into sentences
df['sentences'] = df['text'].apply(sent_tokenize)

# Final dataframe for LNLF model
df = df[['sentences', 'label']]


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyError: "['label'] not in index"

In [10]:
!pip install -q nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd

# Load your saved economics dataset
df = pd.read_csv('/kaggle/working/econ_filtered.csv')
df.dropna(subset=['title', 'abstract'], inplace=True)

# Combine title + abstract
df['text'] = df['title'] + ". " + df['abstract']

# Split into sentences
df['sentences'] = df['text'].apply(lambda x: sent_tokenize(x))

# Keep label and sentence data
df = df[['sentences', 'label']]
print(" Sentence tokenization complete.")
print(df.head(10))


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyError: "['label'] not in index"

In [None]:
df.to_pickle('/kaggle/working/econ_sentences.pkl')


In [8]:
df = pd.read_csv('/kaggle/working/econ_filtered.csv')
print(df.columns)  # Confirm 'label' exists


Index(['id', 'title', 'abstract', 'categories', 'primary_category'], dtype='object')
