In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/shrenikborad/personal/worldbank/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/shrenikborad/personal/worldbank/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/shrenikborad/personal/worldbank/.venv/lib/python3.12/site-packages/ipykernel/kernelapp.py

In [4]:
# Load the raw records. Rows missing a title or an abstract are dropped because
# both fields are needed to build the model input text.
# NOTE(review): absolute, user-specific path — consider a configurable DATA_DIR.
df = pd.read_csv("/Users/shrenikborad/personal/worldbank/data/initial_data.csv")
# .copy() detaches the result from the parent frame so the column assignments
# below (and in later cells) write to an independent frame.
df = df.dropna(subset=['Title', 'Abstract']).copy()
df['text'] = "[TITLE] " + df['Title'] + " [ABSTRACT] " + df['Abstract']

# Normalise the method name into a label: strip + lowercase real strings; anything
# missing, non-string, or blank becomes "other". (The lambda already handles NaN,
# so no separate fillna step is needed.)
df['label'] = df['Method'].apply(
    lambda x: x.strip().lower() if isinstance(x, str) and x.strip() else "other")

# Keep only labels that occur at least twice.
counts = df['label'].value_counts()
valid_labels = counts[counts >= 2].index
print(f"Valid labels: {valid_labels}")
df = df[df['label'].isin(valid_labels)].copy()

Valid labels: Index(['randomised controlled trial', 'fixed effects (incl. did)', 'other',
       'statistical matching', 'instrumental variable estimation',
       'regression discontinuity design', 'interrupted time series analysis',
       'synthetic control'],
      dtype='object', name='label')


In [5]:
# Map each label string to an integer class id.
label_encoder = LabelEncoder()
encoded_ids = label_encoder.fit_transform(df['label'])
df['label_id'] = encoded_ids.astype(int)

# Label -> id lookup, built from the fitted encoder's classes (optional, for saving).
label_map = {
    name: code
    for name, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print("Label Map:", label_map)

Label Map: {'fixed effects (incl. did)': np.int64(0), 'instrumental variable estimation': np.int64(1), 'interrupted time series analysis': np.int64(2), 'other': np.int64(3), 'randomised controlled trial': np.int64(4), 'regression discontinuity design': np.int64(5), 'statistical matching': np.int64(6), 'synthetic control': np.int64(7)}


In [6]:
# First cut: hold out 20% of the rows (stratified on the class id) for
# validation + test; the remaining 80% is the training set.
# NOTE(review): labels were only required to occur >= 2 times upstream; a class
# that ends up with a single row in temp_df may make the second stratified
# split fail — confirm against the actual class counts.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label_id'],
    random_state=42,
)

# Second cut: divide the held-out rows evenly into validation and test,
# again stratified on the class id and with the same seed.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label_id'],
    random_state=42,
)

In [8]:
from datasets import Dataset


# Checkpoint to fine-tune.
MODEL_NAME = "allenai/scibert_scivocab_uncased"

# One output unit per class known to the fitted label encoder.
num_labels = len(label_encoder.classes_)
print(f"Number of labels: {num_labels}")

# NOTE: if this raises the `torch.load` ValueError (CVE-2025-32434), the
# installed torch is older than v2.6 — upgrade torch rather than changing
# this cell.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: A batch mapping that has a "text" entry holding the input
            strings (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch, with
        sequences truncated to at most 512 tokens.

    Padding is intentionally NOT applied here: the notebook hands a
    ``DataCollatorWithPadding`` to the Trainer, which pads each batch to its
    own longest sequence. Padding every example to 512 up front would make
    that collator a no-op and spend compute on padding tokens.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)


# Step 3: Convert to HuggingFace Datasets
def _prepare_split(frame):
    """Turn one split DataFrame into a tokenized Dataset with a "labels" column.

    Keeps only the 'text' and 'label_id' columns, tokenizes in batches with
    ``tokenize_function``, then renames 'label_id' to 'labels'.
    """
    split_ds = Dataset.from_pandas(frame[['text', 'label_id']])
    split_ds = split_ds.map(tokenize_function, batched=True)
    return split_ds.rename_column("label_id", "labels")


train_ds = _prepare_split(train_df)
val_ds = _prepare_split(val_df)
test_ds = _prepare_split(test_df)

def compute_metrics(eval_pred):
    """Compute evaluation metrics and print a per-class report.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict with "accuracy" and "f1_macro" (macro-averaged F1).

    Side effects:
        Prints a classification report covering every class known to the
        module-level ``label_encoder``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    target_names = label_encoder.classes_
    # Pin the report to the full set of class ids. Without `labels=`, the
    # report infers classes from the data and raises when a class is absent
    # from both `labels` and `preds` (the inferred class count then no longer
    # matches len(target_names)) — likely for rare classes in a small split.
    class_ids = np.arange(len(target_names))

    # Print classification report automatically
    print("\nClassification Report:")
    print(classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=target_names,
        zero_division=0,  # classes with no samples/predictions score 0 without warnings
    ))

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average='macro'),
    }


# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best validation macro-F1 when training ends.
training_args = TrainingArguments(
    output_dir="./dev-econ-classifier",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro"
)

# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement and carries the same tokenizer object.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

torch.cuda.empty_cache()

# Step 7: Train the model
trainer.train()

# Step 8: Evaluate on test set
results = trainer.evaluate(test_ds)
print("Test Results:", results)

# Optional: Save model
model.save_pretrained("method-classifier-wb")
tokenizer.save_pretrained("method-classifier-wb")


Number of labels: 8


ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434