In [5]:
!pip install pandas numpy sentence_transformers scikit-learn xgboost torch tqdm catboost

Collecting pandas
  Using cached pandas-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Collecting numpy
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting sentence_transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 KB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [33]:
import pandas as pd
import ast
import numpy as np
from sentence_transformers import SentenceTransformer

In [34]:
# Load data
df = pd.read_csv("abstracts_titles_wdomains.csv") 

# Convert stringified lists to real lists
df['title_domains'] = df['title_domains'].apply(ast.literal_eval)
df['abstract_domains'] = df['abstract_domains'].apply(ast.literal_eval)


In [10]:
specter = SentenceTransformer('allenai/specter')

# Get embeddings for all titles
title_embeddings = specter.encode(df['title'].tolist(), show_progress_bar=True)

No sentence-transformers model found with name allenai/specter. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Batches:   0%|          | 0/268 [00:00<?, ?it/s]

In [13]:
np.save("title_embeddings.npy",title_embeddings)

In [35]:
title_embeddings=np.load("title_embeddings.npy")

In [36]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

# One-hot encode title_domains and abstract_domains 
title_domain_features = mlb.fit_transform(df['title_domains'])
abstract_domain_targets = mlb.transform(df['abstract_domains'])

domain_classes = mlb.classes_


In [37]:
print(domain_classes)

['anatomy' 'brain_atlas' 'cognitive' 'disease' 'gene' 'medical_procedures'
 'metadata' 'molecule' 'phenotype' 'protein' 'taxonomy' 'treatment']


In [38]:
# Combine the semantic embedding with the one-hot domain features
import numpy as np

X = np.hstack([title_embeddings, title_domain_features])
y = abstract_domain_targets

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

model = MultiOutputClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
model.fit(X_train, y_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,n_jobs,

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
from sklearn.metrics import classification_report, f1_score

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=domain_classes))

# Micro F1 for overall performance
print("Micro F1 Score:", f1_score(y_test, y_pred, average='micro'))
print("Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))

                    precision    recall  f1-score   support

           anatomy       0.81      0.98      0.88      1311
       brain_atlas       1.00      0.07      0.13        70
         cognitive       0.75      0.39      0.51       656
           disease       0.97      1.00      0.99      1668
              gene       0.95      0.52      0.67       343
medical_procedures       0.82      0.99      0.90      1370
          metadata       1.00      0.08      0.16        83
          molecule       0.74      0.34      0.47       500
         phenotype       0.89      1.00      0.94      1493
           protein       0.91      0.17      0.28       233
          taxonomy       1.00      0.16      0.27        82
         treatment       0.73      0.88      0.80      1032

         micro avg       0.85      0.83      0.84      8841
         macro avg       0.88      0.55      0.58      8841
      weighted avg       0.85      0.83      0.81      8841
       samples avg       0.85      0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
import joblib

# Save model
joblib.dump(model, 'multilabel_rf_model.pkl')

# Later, load it like this:
# classifier = joblib.load('multilabel_rf_model.pkl')


['multilabel_rf_model.pkl']

In [10]:
from sklearn.metrics import accuracy_score
import numpy as np

def label_wise_accuracy(y_true, y_pred, label_names=None):
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values

    num_labels = y_true.shape[1]
    if label_names is None:
        label_names = [f'Label {i}' for i in range(num_labels)]

    print(f"{'Label':<25} | {'Accuracy':<8}")
    print("-" * 40)

    for i in range(num_labels):
        acc = accuracy_score(y_true[:, i], y_pred[:, i])
        print(f"{label_names[i]:<25} | {acc:.4f}")


In [11]:
label_names = mlb.classes_  
label_wise_accuracy(y_test, y_pred, label_names)

Label                     | Accuracy
----------------------------------------
anatomy                   | 0.8051
brain_atlas               | 0.9621
cognitive                 | 0.7165
disease                   | 0.9737
gene                      | 0.8979
medical_procedures        | 0.8215
metadata                  | 0.9557
molecule                  | 0.7730
phenotype                 | 0.8891
protein                   | 0.8845
taxonomy                  | 0.9597
treatment                 | 0.7310


In [16]:
# to predict the domains on new titles
def predict_domains(title, predicted_title_domains):
    title_vec = specter.encode([title])
    title_domain_vec = mlb.transform([predicted_title_domains])
    combined = np.hstack([title_vec, title_domain_vec])
    predicted = model.predict(combined)
    return [domain_classes[i] for i, v in enumerate(predicted[0]) if v == 1]

# Example
predict_domains("Bilateral Optic Neuroretinitis: Uncommon Complication of COVID-19", ['disease'])

#the actual abstract domains has ['phenotype', 'disease', 'molecule', 'anatomy'], so it has missed 'molecule'


['anatomy', 'disease', 'phenotype']

In [14]:
for i in range(5):
    abstr=str(df.loc[i, 'abstract_domains'])
    titl=str(df.loc[i, 'title_domains'])
    print(f"{titl} -> {abstr}")

['disease'] -> ['phenotype', 'disease', 'molecule', 'anatomy']
['medical_procedures', 'treatment', 'disease'] -> ['medical_procedures', 'treatment', 'phenotype', 'cognitive', 'disease']
['phenotype', 'disease'] -> ['metadata', 'medical_procedures', 'phenotype', 'cognitive', 'disease']
['disease', 'anatomy'] -> ['medical_procedures', 'treatment', 'phenotype', 'anatomy', 'disease']
['phenotype', 'disease', 'protein'] -> ['medical_procedures', 'treatment', 'molecule', 'phenotype', 'protein', 'anatomy', 'brain_atlas', 'disease']


In [50]:
!pip install imblearn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0m
Collecting sklearn-compat<1,>=0.1
  Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Collecting scikit-learn<2,>=1.3.2
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn, sklearn-compat, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.7.0
    Uninstalling scikit-learn-1.7.0:
      Successfully uninstalled scikit-learn-1.7.0
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 

**XGBoost model**

In [18]:
!pip install xgboost

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-3.0.2
[0m

In [19]:
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

model = MultiOutputClassifier(XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_jobs=-1,
    n_estimators=200,
    random_state=42
))

model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
from sklearn.metrics import classification_report, f1_score

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=domain_classes))

# Micro F1 for overall performance
print("Micro F1 Score:", f1_score(y_test, y_pred, average='micro'))
print("Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))

                    precision    recall  f1-score   support

           anatomy       0.82      0.94      0.88      1311
       brain_atlas       0.83      0.07      0.13        70
         cognitive       0.65      0.50      0.56       656
           disease       0.97      1.00      0.99      1668
              gene       0.85      0.62      0.72       343
medical_procedures       0.83      0.96      0.89      1370
          metadata       1.00      0.08      0.16        83
          molecule       0.68      0.46      0.55       500
         phenotype       0.89      0.99      0.94      1493
           protein       0.68      0.31      0.43       233
          taxonomy       0.93      0.16      0.27        82
         treatment       0.74      0.82      0.78      1032

         micro avg       0.84      0.84      0.84      8841
         macro avg       0.82      0.58      0.61      8841
      weighted avg       0.83      0.84      0.82      8841
       samples avg       0.84      0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [21]:
label_names = mlb.classes_  
label_wise_accuracy(y_test, y_pred, label_names)

Label                     | Accuracy
----------------------------------------
anatomy                   | 0.7981
brain_atlas               | 0.9615
cognitive                 | 0.7042
disease                   | 0.9743
gene                      | 0.9032
medical_procedures        | 0.8063
metadata                  | 0.9557
molecule                  | 0.7789
phenotype                 | 0.8839
protein                   | 0.8862
taxonomy                  | 0.9592
treatment                 | 0.7229


## Using optuna for hyperparameter tuning

In [23]:
!pip install optuna

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 KB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog
  Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 KB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
Collecting sqlalchemy>=1.4.2
  Downloading sqlalchemy-2.0.41-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting Mako
  Downloading mako-1.3.10-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
Collecting greenlet>=1
  Downloading greenlet-3.2.3-cp310-cp310-ma

In [26]:
import optuna
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

def objective(trial):
    # Define search space
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "n_jobs": -1,
        "tree_method": "hist",  # Speed up on large datasets
        "verbosity": 0
    }

    base_model = XGBClassifier(**param)
    model = MultiOutputClassifier(base_model)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return f1_score(y_test, y_pred, average='micro')  # You can also try 'macro'



In [28]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

[I 2025-07-04 09:53:03,142] A new study created in memory with name: no-name-29e7be82-5d86-4687-a0d9-2d465ea9af56
[I 2025-07-04 09:57:14,692] Trial 0 finished with value: 0.8414378714973111 and parameters: {'n_estimators': 314, 'max_depth': 7, 'learning_rate': 0.0911073807685825, 'subsample': 0.9100792698605935, 'colsample_bytree': 0.9946004865044926, 'reg_alpha': 2.6034600792274474, 'reg_lambda': 0.049162481791732424}. Best is trial 0 with value: 0.8414378714973111.
[I 2025-07-04 09:58:51,264] Trial 1 finished with value: 0.8391988231300215 and parameters: {'n_estimators': 105, 'max_depth': 7, 'learning_rate': 0.10179296962719302, 'subsample': 0.5528051806081837, 'colsample_bytree': 0.551862551557662, 'reg_alpha': 1.5468355316358617e-06, 'reg_lambda': 1.6100170250449865e-08}. Best is trial 0 with value: 0.8414378714973111.
[I 2025-07-04 10:02:13,663] Trial 2 finished with value: 0.8413263288708128 and parameters: {'n_estimators': 145, 'max_depth': 11, 'learning_rate': 0.09294029981634

In [29]:
print("Best Trial:")
print(study.best_trial.params)

# Train final model with best parameters
best_params = study.best_trial.params
final_model = MultiOutputClassifier(XGBClassifier(**best_params))
final_model.fit(X_train, y_train)


Best Trial:
{'n_estimators': 314, 'max_depth': 7, 'learning_rate': 0.0911073807685825, 'subsample': 0.9100792698605935, 'colsample_bytree': 0.9946004865044926, 'reg_alpha': 2.6034600792274474, 'reg_lambda': 0.049162481791732424}


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9946004865044926
,device,
,early_stopping_rounds,
,enable_categorical,False


In [30]:
y_pred = final_model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=domain_classes))

# Micro F1 for overall performance
print("Micro F1 Score:", f1_score(y_test, y_pred, average='micro'))
print("Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))

                    precision    recall  f1-score   support

           anatomy       0.82      0.95      0.88      1311
       brain_atlas       1.00      0.09      0.16        70
         cognitive       0.66      0.46      0.54       656
           disease       0.97      1.00      0.99      1668
              gene       0.88      0.62      0.73       343
medical_procedures       0.83      0.97      0.90      1370
          metadata       1.00      0.08      0.16        83
          molecule       0.68      0.44      0.53       500
         phenotype       0.89      0.99      0.94      1493
           protein       0.67      0.28      0.40       233
          taxonomy       1.00      0.17      0.29        82
         treatment       0.75      0.84      0.79      1032

         micro avg       0.84      0.84      0.84      8841
         macro avg       0.85      0.58      0.61      8841
      weighted avg       0.84      0.84      0.82      8841
       samples avg       0.85      0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [31]:
label_names = mlb.classes_  
label_wise_accuracy(y_test, y_pred, label_names)

Label                     | Accuracy
----------------------------------------
anatomy                   | 0.8063
brain_atlas               | 0.9627
cognitive                 | 0.7025
disease                   | 0.9737
gene                      | 0.9072
medical_procedures        | 0.8203
metadata                  | 0.9557
molecule                  | 0.7754
phenotype                 | 0.8880
protein                   | 0.8833
taxonomy                  | 0.9603
treatment                 | 0.7305


In [33]:
import joblib
joblib.dump(final_model, "xgb_multilabel_optuna.pkl")

['xgb_multilabel_optuna.pkl']

## Using Specter+MLP

In [32]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm


In [41]:
class PaperDataset(Dataset):
    def __init__(self, specter_embeds, title_domain_ohe, labels):
        self.inputs = torch.tensor(np.hstack([specter_embeds, title_domain_ohe]), dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]


In [38]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLPClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, output_dim),
            nn.Sigmoid()  # multi-label output
        )

    def forward(self, x):
        return self.model(x)


In [35]:
def train_model(model, train_loader, val_loader, epochs=10, lr=1e-4):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")


In [36]:
def evaluate(model, val_loader, label_names):
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    all_preds, all_labels = [], []

    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch = x_batch.to(device)
            preds = model(x_batch)
            all_preds.append(preds.cpu().numpy())
            all_labels.append(y_batch.numpy())

    y_pred = (np.vstack(all_preds) >= 0.5).astype(int)
    y_true = np.vstack(all_labels)

    print(classification_report(y_true, y_pred, target_names=label_names, zero_division=0))


In [46]:
X = np.hstack([title_embeddings, title_domain_features])
input_dim = X.shape[1]
output_dim = abstract_domain_targets.shape[1]

X_train, X_val, y_train, y_val = train_test_split(X, abstract_domain_targets, test_size=0.2, random_state=42)

train_dataset = PaperDataset(X_train[:, :768], X_train[:, 768:], y_train)
val_dataset = PaperDataset(X_val[:, :768], X_val[:, 768:], y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

model = MLPClassifier(input_dim, output_dim)
train_model(model, train_loader, val_loader, epochs=10, lr=1e-4)

evaluate(model, val_loader, label_names=domain_classes)


Epoch 1/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 602.21it/s]


Epoch 1 Loss: 0.4646


Epoch 2/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 629.26it/s]


Epoch 2 Loss: 0.3841


Epoch 3/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 634.63it/s]


Epoch 3 Loss: 0.3715


Epoch 4/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 627.84it/s]


Epoch 4 Loss: 0.3641


Epoch 5/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 631.37it/s]


Epoch 5 Loss: 0.3582


Epoch 6/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 625.81it/s]


Epoch 6 Loss: 0.3564


Epoch 7/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 624.03it/s]


Epoch 7 Loss: 0.3540


Epoch 8/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 619.54it/s]


Epoch 8 Loss: 0.3515


Epoch 9/10: 100%|███████████████████████████████████████| 108/108 [00:00<00:00, 615.71it/s]


Epoch 9 Loss: 0.3492


Epoch 10/10: 100%|██████████████████████████████████████| 108/108 [00:00<00:00, 615.71it/s]


Epoch 10 Loss: 0.3478
                    precision    recall  f1-score   support

           anatomy       0.79      0.96      0.87      1311
       brain_atlas       0.00      0.00      0.00        70
         cognitive       0.67      0.34      0.45       656
           disease       0.97      1.00      0.99      1668
              gene       0.85      0.56      0.68       343
medical_procedures       0.80      1.00      0.89      1370
          metadata       0.00      0.00      0.00        83
          molecule       0.60      0.42      0.49       500
         phenotype       0.87      1.00      0.93      1493
           protein       0.59      0.31      0.40       233
          taxonomy       0.00      0.00      0.00        82
         treatment       0.73      0.81      0.77      1032

         micro avg       0.82      0.83      0.83      8841
         macro avg       0.57      0.53      0.54      8841
      weighted avg       0.79      0.83      0.80      8841
       samples a

In [60]:
!pip install catboost lightgbm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
[0m

In [30]:
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

base_models_1 = [
    ('rf', RandomForestClassifier(n_estimators=30, max_depth=5, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=3)),
    ('logreg', LogisticRegression(max_iter=100, random_state=42))
]

base_models_2 = [
    ('adaboost', AdaBoostClassifier(n_estimators=30, learning_rate=0.5, random_state=42)),
    ('catboost', CatBoostClassifier(iterations=20, learning_rate=0.2, depth=3, verbose=0, random_state=42)),
    ('xgboost', XGBClassifier(n_estimators=30, learning_rate=0.2, max_depth=3, use_label_encoder=False, eval_metric='logloss', random_state=42))
]

stack_1 = MultiOutputClassifier(
    StackingClassifier(estimators=base_models_1, final_estimator=LogisticRegression(max_iter=1000), passthrough=False)
)

stack_2 = MultiOutputClassifier(
    StackingClassifier(estimators=base_models_2, final_estimator=LogisticRegression(max_iter=1000), passthrough=False)
)

print("Fitting stack 1...")
stack_1.fit(X_train, y_train)

print("Fitting stack 2...")
stack_2.fit(X_train, y_train)


Fitting stack 1...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Fitting stack 2...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


0,1,2
,estimator,StackingClass...ax_iter=1000))
,n_jobs,

0,1,2
,estimator,
,n_estimators,30
,learning_rate,0.5
,algorithm,'deprecated'
,random_state,42

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [41]:
def get_combined_probabilities(model, X):
    
    all_class1_probs = []

    for i, estimator in enumerate(model.estimators_):
        try:
            prob = estimator.predict_proba(X)  # (n_samples, 2) or (n_samples,)
        except Exception as e:
            print(f"Estimator {i} failed with error: {e}")
            continue

        # prob can be 1D if only one class was seen
        if prob.ndim == 1:
            # assume it predicted class 1 only: convert to (n_samples, 2)
            prob = np.stack([1 - prob, prob], axis=1)
        elif prob.shape[1] == 1:
            # only one class seen, add dummy 0-probability for the other class
            prob = np.concatenate([1 - prob, prob], axis=1)
        elif prob.shape[1] > 2:
            # multiclass (not expected in multilabel binary setup)
            raise ValueError("Only binary classification per label is supported!")

        class1_prob = prob[:, 1]  # take P(class=1)
        all_class1_probs.append(class1_prob)

    return np.stack(all_class1_probs, axis=1)  # shape: (n_samples, n_labels)

train_prob1 = get_combined_probabilities(stack_1, X_train)
train_prob2 = get_combined_probabilities(stack_2, X_train)
combined_train_features = np.hstack((train_prob1, train_prob2))

test_prob1 = get_combined_probabilities(stack_1, X_test)
test_prob2 = get_combined_probabilities(stack_2, X_test)
combined_test_features = np.hstack((test_prob1, test_prob2))

final_meta_model = MultiOutputClassifier(LogisticRegression(max_iter=100, random_state=42))
print("Fitting final meta model...")
final_meta_model.fit(combined_train_features, y_train)

final_preds = final_meta_model.predict(combined_test_features)

Fitting final meta model...


In [42]:
def evaluate_metrics(y_true, y_pred, target_names=None):
    print("\n=== Overall Metrics ===")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision (macro): {precision_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
    print(f"Recall (macro): {recall_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
    print(f"F1 Score (macro): {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")

    print("\n=== Class-wise Report ===")
    print(classification_report(y_true, y_pred, target_names=target_names, zero_division=0))

evaluate_metrics(y_test, final_preds, target_names=domain_classes)



=== Overall Metrics ===
Accuracy: 0.1593
Precision (macro): 0.6385
Recall (macro): 0.5602
F1 Score (macro): 0.5779

=== Class-wise Report ===
                    precision    recall  f1-score   support

           anatomy       0.82      0.90      0.85      1311
       brain_atlas       0.36      0.06      0.10        70
         cognitive       0.58      0.48      0.52       656
           disease       0.97      1.00      0.99      1668
              gene       0.80      0.63      0.71       343
medical_procedures       0.83      0.91      0.87      1370
          metadata       0.00      0.00      0.00        83
          molecule       0.62      0.49      0.54       500
         phenotype       0.89      0.97      0.93      1493
           protein       0.56      0.34      0.43       233
          taxonomy       0.48      0.15      0.22        82
         treatment       0.74      0.80      0.77      1032

         micro avg       0.82      0.82      0.82      8841
         macro 

In [43]:
import joblib

joblib.dump(stack_1, 'stack_1.pkl')
joblib.dump(stack_2, 'stack_2.pkl')
joblib.dump(final_meta_model, 'final_stacked_model.pkl')

['final_stacked_model.pkl']

## Fine-tuning Transformers for Multi-label classification

In [22]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [65]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import torch.nn as nn
from tqdm import tqdm
import ast


In [66]:
# df = pd.read_csv("your_file.csv")

# df['title_domains'] = df['title_domains'].apply(ast.literal_eval)
# df['abstract_domains'] = df['abstract_domains'].apply(ast.literal_eval)

# Convert title + title_domains into one input string
df['input_text'] = df.apply(lambda row: row['title'] + " | " + " ".join(row['title_domains']), axis=1)

# Remove rows with no target labels
df = df[df['abstract_domains'].map(len) > 0]

# Binarize labels
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df['abstract_domains'])
label_names = mlb.classes_


In [67]:
class PaperDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = torch.FloatTensor(self.labels[idx])
        encoding = self.tokenizer(text,
                                  truncation=True,
                                  padding='max_length',
                                  max_length=self.max_len,
                                  return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label}


In [68]:
class SpecterClassifier(nn.Module):
    def __init__(self, n_labels):
        super(SpecterClassifier, self).__init__()
        self.encoder = AutoModel.from_pretrained("allenai/specter")
        self.classifier = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, n_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS]
        logits = self.classifier(cls_output)
        return logits


In [69]:
import torch
import torch.nn as nn

class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.bce = nn.BCEWithLogitsLoss(reduction='none')  # no reduction here
        self.reduction = reduction

    def forward(self, logits, targets):
        bce_loss = self.bce(logits, targets)
        pt = torch.exp(-bce_loss) #pt is probability of true class
        focal_loss = (1 - pt) ** self.gamma * bce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss  

In [70]:
def train_model(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


In [71]:
from sklearn.metrics import classification_report

def evaluate(model, dataloader, device):
    model.eval()
    preds, true = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            logits = model(input_ids, attention_mask).sigmoid().cpu().numpy()
            preds.extend(logits)
            true.extend(labels)
    preds_bin = (np.array(preds) >= 0.5).astype(int)
    print(classification_report(true, preds_bin, target_names=label_names, zero_division=0))


Loss function -> BCEWithLogitsLoss

In [42]:
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_train, X_val, y_train, y_val = train_test_split(df['input_text'], Y, test_size=0.2, random_state=42)

train_ds = PaperDataset(X_train.tolist(), y_train, tokenizer)
val_ds = PaperDataset(X_val.tolist(), y_val, tokenizer)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=8)

model = SpecterClassifier(n_labels=Y.shape[1]).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(10):
    print(f"Epoch {epoch + 1}")
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    print(f"Train Loss: {train_loss:.4f}")
    evaluate(model, val_loader, device)


Epoch 1


100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.02it/s]


Train Loss: 0.3756
                    precision    recall  f1-score   support

           anatomy       0.82      0.94      0.88      1319
       brain_atlas       0.00      0.00      0.00        63
         cognitive       0.60      0.40      0.48       648
           disease       0.98      1.00      0.99      1661
              gene       0.83      0.56      0.67       304
medical_procedures       0.82      0.99      0.89      1381
          metadata       0.00      0.00      0.00        70
          molecule       0.73      0.35      0.47       512
         phenotype       0.88      1.00      0.93      1493
           protein       0.60      0.24      0.34       247
          taxonomy       0.00      0.00      0.00        74
         treatment       0.71      0.92      0.80      1070

         micro avg       0.83      0.84      0.83      8842
         macro avg       0.58      0.53      0.54      8842
      weighted avg       0.80      0.84      0.81      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.02it/s]


Train Loss: 0.3386
                    precision    recall  f1-score   support

           anatomy       0.83      0.92      0.87      1319
       brain_atlas       0.00      0.00      0.00        63
         cognitive       0.55      0.57      0.56       648
           disease       0.98      1.00      0.99      1661
              gene       0.80      0.62      0.70       304
medical_procedures       0.82      0.98      0.89      1381
          metadata       0.00      0.00      0.00        70
          molecule       0.69      0.48      0.56       512
         phenotype       0.88      1.00      0.93      1493
           protein       0.60      0.30      0.40       247
          taxonomy       0.37      0.18      0.24        74
         treatment       0.76      0.81      0.79      1070

         micro avg       0.82      0.85      0.83      8842
         macro avg       0.61      0.57      0.58      8842
      weighted avg       0.80      0.85      0.82      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.02it/s]


Train Loss: 0.3085
                    precision    recall  f1-score   support

           anatomy       0.85      0.85      0.85      1319
       brain_atlas       0.00      0.00      0.00        63
         cognitive       0.60      0.49      0.54       648
           disease       0.98      1.00      0.99      1661
              gene       0.85      0.64      0.73       304
medical_procedures       0.82      0.98      0.89      1381
          metadata       0.00      0.00      0.00        70
          molecule       0.62      0.62      0.62       512
         phenotype       0.88      1.00      0.93      1493
           protein       0.57      0.23      0.33       247
          taxonomy       0.70      0.09      0.17        74
         treatment       0.74      0.91      0.82      1070

         micro avg       0.83      0.85      0.84      8842
         macro avg       0.63      0.57      0.57      8842
      weighted avg       0.81      0.85      0.82      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.01it/s]


Train Loss: 0.2707
                    precision    recall  f1-score   support

           anatomy       0.85      0.87      0.86      1319
       brain_atlas       0.00      0.00      0.00        63
         cognitive       0.58      0.53      0.56       648
           disease       0.98      1.00      0.99      1661
              gene       0.77      0.70      0.73       304
medical_procedures       0.85      0.92      0.88      1381
          metadata       0.00      0.00      0.00        70
          molecule       0.68      0.53      0.59       512
         phenotype       0.88      0.99      0.93      1493
           protein       0.57      0.45      0.50       247
          taxonomy       0.58      0.19      0.29        74
         treatment       0.78      0.78      0.78      1070

         micro avg       0.84      0.83      0.83      8842
         macro avg       0.63      0.58      0.59      8842
      weighted avg       0.82      0.83      0.82      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.01it/s]


Train Loss: 0.2265
                    precision    recall  f1-score   support

           anatomy       0.83      0.92      0.88      1319
       brain_atlas       0.00      0.00      0.00        63
         cognitive       0.54      0.61      0.57       648
           disease       0.98      1.00      0.99      1661
              gene       0.80      0.67      0.73       304
medical_procedures       0.84      0.91      0.88      1381
          metadata       1.00      0.01      0.03        70
          molecule       0.66      0.56      0.61       512
         phenotype       0.89      0.99      0.94      1493
           protein       0.52      0.48      0.50       247
          taxonomy       0.55      0.16      0.25        74
         treatment       0.78      0.79      0.79      1070

         micro avg       0.82      0.85      0.84      8842
         macro avg       0.70      0.59      0.60      8842
      weighted avg       0.82      0.85      0.83      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.00it/s]


Train Loss: 0.1838
                    precision    recall  f1-score   support

           anatomy       0.85      0.89      0.87      1319
       brain_atlas       0.40      0.03      0.06        63
         cognitive       0.60      0.51      0.55       648
           disease       0.98      1.00      0.99      1661
              gene       0.74      0.69      0.72       304
medical_procedures       0.88      0.79      0.83      1381
          metadata       1.00      0.04      0.08        70
          molecule       0.60      0.69      0.64       512
         phenotype       0.90      0.97      0.93      1493
           protein       0.55      0.49      0.52       247
          taxonomy       0.55      0.24      0.34        74
         treatment       0.76      0.82      0.79      1070

         micro avg       0.83      0.83      0.83      8842
         macro avg       0.73      0.60      0.61      8842
      weighted avg       0.83      0.83      0.82      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.00it/s]


Train Loss: 0.1431
                    precision    recall  f1-score   support

           anatomy       0.86      0.87      0.86      1319
       brain_atlas       0.44      0.06      0.11        63
         cognitive       0.56      0.51      0.54       648
           disease       0.98      1.00      0.99      1661
              gene       0.72      0.76      0.74       304
medical_procedures       0.87      0.87      0.87      1381
          metadata       0.38      0.04      0.08        70
          molecule       0.62      0.65      0.63       512
         phenotype       0.90      0.96      0.93      1493
           protein       0.52      0.49      0.50       247
          taxonomy       0.50      0.22      0.30        74
         treatment       0.77      0.77      0.77      1070

         micro avg       0.83      0.83      0.83      8842
         macro avg       0.68      0.60      0.61      8842
      weighted avg       0.82      0.83      0.82      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 11.99it/s]


Train Loss: 0.1114
                    precision    recall  f1-score   support

           anatomy       0.85      0.85      0.85      1319
       brain_atlas       0.25      0.02      0.03        63
         cognitive       0.55      0.62      0.58       648
           disease       0.98      1.00      0.99      1661
              gene       0.78      0.65      0.71       304
medical_procedures       0.86      0.90      0.88      1381
          metadata       0.37      0.10      0.16        70
          molecule       0.63      0.60      0.61       512
         phenotype       0.90      0.96      0.93      1493
           protein       0.58      0.38      0.46       247
          taxonomy       0.55      0.30      0.39        74
         treatment       0.78      0.78      0.78      1070

         micro avg       0.83      0.83      0.83      8842
         macro avg       0.67      0.60      0.61      8842
      weighted avg       0.82      0.83      0.82      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 11.99it/s]


Train Loss: 0.0869
                    precision    recall  f1-score   support

           anatomy       0.83      0.92      0.88      1319
       brain_atlas       0.12      0.11      0.12        63
         cognitive       0.59      0.52      0.55       648
           disease       0.98      1.00      0.99      1661
              gene       0.73      0.73      0.73       304
medical_procedures       0.87      0.88      0.87      1381
          metadata       0.21      0.19      0.20        70
          molecule       0.64      0.63      0.63       512
         phenotype       0.90      0.95      0.92      1493
           protein       0.53      0.51      0.52       247
          taxonomy       0.29      0.38      0.33        74
         treatment       0.77      0.80      0.78      1070

         micro avg       0.82      0.84      0.83      8842
         macro avg       0.62      0.63      0.63      8842
      weighted avg       0.81      0.84      0.83      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.01it/s]


Train Loss: 0.0664
                    precision    recall  f1-score   support

           anatomy       0.85      0.89      0.87      1319
       brain_atlas       0.21      0.11      0.15        63
         cognitive       0.58      0.54      0.56       648
           disease       0.98      1.00      0.99      1661
              gene       0.71      0.74      0.72       304
medical_procedures       0.87      0.89      0.88      1381
          metadata       0.46      0.19      0.27        70
          molecule       0.61      0.63      0.62       512
         phenotype       0.90      0.97      0.94      1493
           protein       0.50      0.59      0.54       247
          taxonomy       0.44      0.30      0.35        74
         treatment       0.77      0.82      0.79      1070

         micro avg       0.82      0.84      0.83      8842
         macro avg       0.66      0.64      0.64      8842
      weighted avg       0.82      0.84      0.83      8842
       samples avg 

In [44]:
torch.save(model.state_dict(), "specter_bce_finetuned_multilabel.pt")

In [36]:
import numpy as np
import torch

mlb = MultiLabelBinarizer()
binarized_labels = mlb.fit_transform(df['abstract_domains'])
label_counts = binarized_labels.sum(axis=0)  
num_samples = binarized_labels.shape[0]
# Formula: (N - p) / p => for BCEWithLogitsLoss
pos_weights = (num_samples - label_counts) / (label_counts + 1e-5)
pos_weights_tensor = torch.tensor(pos_weights, dtype=torch.float32).to(device)  # send to same device as model


Loss function -> BCEWithLogitsLoss and class-wise weightage

In [38]:
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_train, X_val, y_train, y_val = train_test_split(df['input_text'], Y, test_size=0.2, random_state=42)

train_ds = PaperDataset(X_train.tolist(), y_train, tokenizer)
val_ds = PaperDataset(X_val.tolist(), y_val, tokenizer)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=8)

model = SpecterClassifier(n_labels=Y.shape[1]).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weights_tensor)

for epoch in range(10):
    print(f"Epoch {epoch + 1}")
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    print(f"Train Loss: {train_loss:.4f}")
    evaluate(model, val_loader, device)


Epoch 1


100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.01it/s]


Train Loss: 0.7052
                    precision    recall  f1-score   support

           anatomy       0.88      0.53      0.66      1319
       brain_atlas       0.07      0.70      0.12        63
         cognitive       0.48      0.58      0.53       648
           disease       0.99      0.38      0.55      1661
              gene       0.45      0.88      0.60       304
medical_procedures       0.86      0.60      0.71      1381
          metadata       0.07      0.39      0.11        70
          molecule       0.45      0.79      0.57       512
         phenotype       0.90      0.78      0.84      1493
           protein       0.30      0.84      0.45       247
          taxonomy       0.14      0.49      0.22        74
         treatment       0.76      0.69      0.72      1070

         micro avg       0.61      0.61      0.61      8842
         macro avg       0.53      0.64      0.51      8842
      weighted avg       0.78      0.61      0.65      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.02it/s]


Train Loss: 0.6320
                    precision    recall  f1-score   support

           anatomy       0.87      0.66      0.75      1319
       brain_atlas       0.06      0.68      0.12        63
         cognitive       0.55      0.43      0.48       648
           disease       0.99      0.76      0.86      1661
              gene       0.51      0.88      0.64       304
medical_procedures       0.88      0.61      0.72      1381
          metadata       0.06      0.70      0.12        70
          molecule       0.52      0.75      0.61       512
         phenotype       0.91      0.69      0.78      1493
           protein       0.31      0.83      0.46       247
          taxonomy       0.09      0.51      0.15        74
         treatment       0.76      0.70      0.73      1070

         micro avg       0.62      0.68      0.65      8842
         macro avg       0.54      0.68      0.53      8842
      weighted avg       0.79      0.68      0.71      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 11.99it/s]


Train Loss: 0.5612
                    precision    recall  f1-score   support

           anatomy       0.88      0.63      0.74      1319
       brain_atlas       0.09      0.49      0.15        63
         cognitive       0.53      0.56      0.54       648
           disease       0.99      0.77      0.87      1661
              gene       0.62      0.75      0.68       304
medical_procedures       0.87      0.68      0.76      1381
          metadata       0.09      0.41      0.15        70
          molecule       0.51      0.70      0.59       512
         phenotype       0.91      0.55      0.69      1493
           protein       0.38      0.70      0.49       247
          taxonomy       0.26      0.32      0.29        74
         treatment       0.73      0.78      0.76      1070

         micro avg       0.71      0.67      0.69      8842
         macro avg       0.57      0.61      0.56      8842
      weighted avg       0.80      0.67      0.72      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.00it/s]


Train Loss: 0.4760
                    precision    recall  f1-score   support

           anatomy       0.87      0.69      0.77      1319
       brain_atlas       0.07      0.46      0.12        63
         cognitive       0.53      0.54      0.54       648
           disease       0.98      0.82      0.90      1661
              gene       0.67      0.76      0.71       304
medical_procedures       0.87      0.64      0.74      1381
          metadata       0.07      0.53      0.13        70
          molecule       0.54      0.72      0.62       512
         phenotype       0.92      0.62      0.74      1493
           protein       0.37      0.73      0.49       247
          taxonomy       0.16      0.55      0.25        74
         treatment       0.79      0.65      0.71      1070

         micro avg       0.69      0.68      0.69      8842
         macro avg       0.57      0.64      0.56      8842
      weighted avg       0.81      0.68      0.73      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.01it/s]


Train Loss: 0.3958
                    precision    recall  f1-score   support

           anatomy       0.86      0.71      0.77      1319
       brain_atlas       0.11      0.35      0.17        63
         cognitive       0.47      0.71      0.57       648
           disease       0.98      0.81      0.89      1661
              gene       0.60      0.81      0.69       304
medical_procedures       0.88      0.70      0.78      1381
          metadata       0.11      0.40      0.18        70
          molecule       0.58      0.69      0.63       512
         phenotype       0.92      0.69      0.79      1493
           protein       0.34      0.78      0.47       247
          taxonomy       0.17      0.57      0.26        74
         treatment       0.78      0.74      0.76      1070

         micro avg       0.72      0.72      0.72      8842
         macro avg       0.57      0.66      0.58      8842
      weighted avg       0.80      0.72      0.75      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.02it/s]


Train Loss: 0.3390
                    precision    recall  f1-score   support

           anatomy       0.88      0.65      0.75      1319
       brain_atlas       0.14      0.25      0.18        63
         cognitive       0.57      0.50      0.53       648
           disease       0.98      0.82      0.89      1661
              gene       0.62      0.78      0.69       304
medical_procedures       0.89      0.60      0.72      1381
          metadata       0.13      0.36      0.19        70
          molecule       0.56      0.71      0.63       512
         phenotype       0.93      0.59      0.72      1493
           protein       0.42      0.68      0.52       247
          taxonomy       0.15      0.43      0.22        74
         treatment       0.76      0.76      0.76      1070

         micro avg       0.75      0.67      0.71      8842
         macro avg       0.59      0.59      0.57      8842
      weighted avg       0.81      0.67      0.72      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.02it/s]


Train Loss: 0.3032
                    precision    recall  f1-score   support

           anatomy       0.89      0.62      0.73      1319
       brain_atlas       0.14      0.22      0.17        63
         cognitive       0.53      0.60      0.57       648
           disease       0.99      0.79      0.88      1661
              gene       0.64      0.77      0.70       304
medical_procedures       0.89      0.58      0.70      1381
          metadata       0.11      0.43      0.18        70
          molecule       0.64      0.58      0.61       512
         phenotype       0.93      0.56      0.70      1493
           protein       0.43      0.67      0.53       247
          taxonomy       0.16      0.50      0.24        74
         treatment       0.78      0.75      0.76      1070

         micro avg       0.75      0.65      0.70      8842
         macro avg       0.59      0.59      0.56      8842
      weighted avg       0.82      0.65      0.71      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.03it/s]


Train Loss: 0.2644
                    precision    recall  f1-score   support

           anatomy       0.90      0.57      0.70      1319
       brain_atlas       0.15      0.22      0.18        63
         cognitive       0.52      0.67      0.59       648
           disease       0.99      0.64      0.78      1661
              gene       0.62      0.79      0.69       304
medical_procedures       0.88      0.65      0.75      1381
          metadata       0.07      0.40      0.12        70
          molecule       0.58      0.66      0.62       512
         phenotype       0.91      0.71      0.80      1493
           protein       0.48      0.59      0.53       247
          taxonomy       0.28      0.35      0.31        74
         treatment       0.81      0.66      0.73      1070

         micro avg       0.74      0.65      0.69      8842
         macro avg       0.60      0.58      0.57      8842
      weighted avg       0.82      0.65      0.71      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.02it/s]


Train Loss: 0.2423
                    precision    recall  f1-score   support

           anatomy       0.89      0.65      0.75      1319
       brain_atlas       0.13      0.17      0.15        63
         cognitive       0.55      0.61      0.58       648
           disease       0.98      0.77      0.86      1661
              gene       0.58      0.80      0.67       304
medical_procedures       0.88      0.65      0.75      1381
          metadata       0.13      0.29      0.18        70
          molecule       0.62      0.65      0.63       512
         phenotype       0.92      0.62      0.74      1493
           protein       0.45      0.65      0.53       247
          taxonomy       0.21      0.38      0.27        74
         treatment       0.80      0.73      0.76      1070

         micro avg       0.77      0.67      0.72      8842
         macro avg       0.59      0.58      0.57      8842
      weighted avg       0.82      0.67      0.73      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:10<00:00, 12.02it/s]


Train Loss: 0.2083
                    precision    recall  f1-score   support

           anatomy       0.87      0.77      0.82      1319
       brain_atlas       0.09      0.27      0.13        63
         cognitive       0.54      0.59      0.57       648
           disease       0.99      0.80      0.88      1661
              gene       0.68      0.77      0.72       304
medical_procedures       0.88      0.78      0.83      1381
          metadata       0.15      0.24      0.18        70
          molecule       0.57      0.74      0.64       512
         phenotype       0.92      0.63      0.75      1493
           protein       0.47      0.61      0.53       247
          taxonomy       0.24      0.34      0.28        74
         treatment       0.82      0.68      0.74      1070

         micro avg       0.78      0.71      0.74      8842
         macro avg       0.60      0.60      0.59      8842
      weighted avg       0.82      0.71      0.76      8842
       samples avg 

Loss function -> Focal Loss

In [75]:
# Trying with FocalLoss
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_train, X_val, y_train, y_val = train_test_split(df['input_text'], Y, test_size=0.2, random_state=42)

train_ds = PaperDataset(X_train.tolist(), y_train, tokenizer)
val_ds = PaperDataset(X_val.tolist(), y_val, tokenizer)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=8)

model = SpecterClassifier(n_labels=Y.shape[1]).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = FocalLoss(gamma=2.0)

for epoch in range(10):
    print(f"Epoch {epoch + 1}")
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    print(f"Train Loss: {train_loss:.4f}")
    evaluate(model, val_loader, device)


Epoch 1


100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.96it/s]


Train Loss: 0.0978
                    precision    recall  f1-score   support

           anatomy       0.78      1.00      0.88      1319
       brain_atlas       0.00      0.00      0.00        63
         cognitive       0.71      0.24      0.36       648
           disease       0.98      1.00      0.99      1661
              gene       0.88      0.54      0.67       304
medical_procedures       0.81      0.99      0.90      1381
          metadata       0.00      0.00      0.00        70
          molecule       0.71      0.38      0.50       512
         phenotype       0.88      1.00      0.93      1493
           protein       0.63      0.24      0.35       247
          taxonomy       0.00      0.00      0.00        74
         treatment       0.73      0.91      0.81      1070

         micro avg       0.83      0.84      0.83      8842
         macro avg       0.59      0.52      0.53      8842
      weighted avg       0.80      0.84      0.80      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.93it/s]


Train Loss: 0.0888
                    precision    recall  f1-score   support

           anatomy       0.83      0.94      0.88      1319
       brain_atlas       0.00      0.00      0.00        63
         cognitive       0.67      0.31      0.43       648
           disease       0.98      1.00      0.99      1661
              gene       0.78      0.61      0.68       304
medical_procedures       0.82      0.97      0.89      1381
          metadata       0.00      0.00      0.00        70
          molecule       0.68      0.53      0.59       512
         phenotype       0.88      0.99      0.93      1493
           protein       0.60      0.29      0.39       247
          taxonomy       0.53      0.12      0.20        74
         treatment       0.82      0.66      0.73      1070

         micro avg       0.85      0.81      0.83      8842
         macro avg       0.63      0.53      0.56      8842
      weighted avg       0.82      0.81      0.80      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.95it/s]


Train Loss: 0.0803
                    precision    recall  f1-score   support

           anatomy       0.83      0.94      0.88      1319
       brain_atlas       0.00      0.00      0.00        63
         cognitive       0.59      0.48      0.53       648
           disease       0.98      1.00      0.99      1661
              gene       0.79      0.68      0.73       304
medical_procedures       0.84      0.93      0.88      1381
          metadata       0.00      0.00      0.00        70
          molecule       0.67      0.48      0.56       512
         phenotype       0.89      0.97      0.93      1493
           protein       0.61      0.28      0.38       247
          taxonomy       0.62      0.14      0.22        74
         treatment       0.77      0.86      0.81      1070

         micro avg       0.84      0.84      0.84      8842
         macro avg       0.63      0.56      0.58      8842
      weighted avg       0.81      0.84      0.82      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.94it/s]


Train Loss: 0.0698
                    precision    recall  f1-score   support

           anatomy       0.85      0.91      0.88      1319
       brain_atlas       0.12      0.02      0.03        63
         cognitive       0.56      0.62      0.59       648
           disease       0.98      1.00      0.99      1661
              gene       0.74      0.72      0.73       304
medical_procedures       0.85      0.93      0.89      1381
          metadata       0.00      0.00      0.00        70
          molecule       0.61      0.62      0.62       512
         phenotype       0.89      0.98      0.93      1493
           protein       0.59      0.44      0.51       247
          taxonomy       0.62      0.14      0.22        74
         treatment       0.76      0.88      0.81      1070

         micro avg       0.82      0.86      0.84      8842
         macro avg       0.63      0.60      0.60      8842
      weighted avg       0.81      0.86      0.83      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.90it/s]


Train Loss: 0.0576
                    precision    recall  f1-score   support

           anatomy       0.85      0.91      0.88      1319
       brain_atlas       0.14      0.05      0.07        63
         cognitive       0.58      0.55      0.56       648
           disease       0.98      1.00      0.99      1661
              gene       0.73      0.69      0.71       304
medical_procedures       0.85      0.90      0.88      1381
          metadata       0.50      0.01      0.03        70
          molecule       0.65      0.62      0.63       512
         phenotype       0.90      0.98      0.93      1493
           protein       0.53      0.51      0.52       247
          taxonomy       0.59      0.23      0.33        74
         treatment       0.80      0.76      0.78      1070

         micro avg       0.83      0.84      0.84      8842
         macro avg       0.67      0.60      0.61      8842
      weighted avg       0.82      0.84      0.83      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.94it/s]


Train Loss: 0.0456
                    precision    recall  f1-score   support

           anatomy       0.87      0.81      0.84      1319
       brain_atlas       0.14      0.05      0.07        63
         cognitive       0.53      0.62      0.57       648
           disease       0.98      1.00      0.99      1661
              gene       0.77      0.67      0.72       304
medical_procedures       0.86      0.89      0.87      1381
          metadata       0.75      0.04      0.08        70
          molecule       0.67      0.60      0.63       512
         phenotype       0.90      0.97      0.93      1493
           protein       0.58      0.55      0.56       247
          taxonomy       0.65      0.27      0.38        74
         treatment       0.75      0.86      0.80      1070

         micro avg       0.83      0.84      0.83      8842
         macro avg       0.70      0.61      0.62      8842
      weighted avg       0.82      0.84      0.83      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.97it/s]


Train Loss: 0.0346
                    precision    recall  f1-score   support

           anatomy       0.85      0.90      0.87      1319
       brain_atlas       0.18      0.05      0.07        63
         cognitive       0.56      0.64      0.59       648
           disease       0.98      1.00      0.99      1661
              gene       0.75      0.64      0.69       304
medical_procedures       0.86      0.91      0.88      1381
          metadata       0.27      0.09      0.13        70
          molecule       0.67      0.53      0.59       512
         phenotype       0.91      0.96      0.93      1493
           protein       0.57      0.55      0.56       247
          taxonomy       0.66      0.34      0.45        74
         treatment       0.73      0.90      0.81      1070

         micro avg       0.82      0.85      0.84      8842
         macro avg       0.66      0.62      0.63      8842
      weighted avg       0.82      0.85      0.83      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.92it/s]


Train Loss: 0.0265
                    precision    recall  f1-score   support

           anatomy       0.85      0.92      0.88      1319
       brain_atlas       0.17      0.08      0.11        63
         cognitive       0.58      0.61      0.60       648
           disease       0.98      1.00      0.99      1661
              gene       0.69      0.74      0.71       304
medical_procedures       0.87      0.88      0.87      1381
          metadata       0.26      0.11      0.16        70
          molecule       0.63      0.66      0.64       512
         phenotype       0.91      0.95      0.93      1493
           protein       0.61      0.51      0.55       247
          taxonomy       0.44      0.30      0.35        74
         treatment       0.78      0.80      0.79      1070

         micro avg       0.83      0.85      0.84      8842
         macro avg       0.65      0.63      0.63      8842
      weighted avg       0.82      0.85      0.83      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.95it/s]


Train Loss: 0.0197
                    precision    recall  f1-score   support

           anatomy       0.87      0.86      0.86      1319
       brain_atlas       0.21      0.10      0.13        63
         cognitive       0.56      0.61      0.58       648
           disease       0.98      1.00      0.99      1661
              gene       0.72      0.70      0.71       304
medical_procedures       0.86      0.92      0.89      1381
          metadata       0.37      0.16      0.22        70
          molecule       0.66      0.58      0.62       512
         phenotype       0.91      0.92      0.92      1493
           protein       0.58      0.51      0.54       247
          taxonomy       0.32      0.32      0.32        74
         treatment       0.77      0.81      0.79      1070

         micro avg       0.83      0.83      0.83      8842
         macro avg       0.65      0.62      0.63      8842
      weighted avg       0.82      0.83      0.83      8842
       samples avg 

100%|█████████████████████████████████████████████████████████████████████████| 851/851 [01:11<00:00, 11.94it/s]


Train Loss: 0.0156
                    precision    recall  f1-score   support

           anatomy       0.85      0.89      0.87      1319
       brain_atlas       0.21      0.06      0.10        63
         cognitive       0.56      0.58      0.57       648
           disease       0.98      1.00      0.99      1661
              gene       0.74      0.67      0.70       304
medical_procedures       0.86      0.91      0.88      1381
          metadata       0.20      0.16      0.17        70
          molecule       0.65      0.54      0.59       512
         phenotype       0.91      0.95      0.93      1493
           protein       0.61      0.51      0.56       247
          taxonomy       0.38      0.34      0.36        74
         treatment       0.78      0.77      0.78      1070

         micro avg       0.83      0.83      0.83      8842
         macro avg       0.64      0.62      0.62      8842
      weighted avg       0.82      0.83      0.83      8842
       samples avg 

In [41]:
torch.save(model.state_dict(), "specter_focal_finetuned_multilabel.pt")

# Trying using AutoModelForSequenceClassification

In [82]:
pip install 'accelerate>=0.26.0'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score

# Load your data
df = pd.read_csv("abstracts_titles_wdomains.csv")

# Combine title and title_domains as input text
df["input_text"] = df["title"] + " " + df["title_domains"].apply(lambda x: " ".join(eval(x)))

# Convert abstract_domains to binary multi-hot vectors
mlb = MultiLabelBinarizer()
df["abstract_domains"] = df["abstract_domains"].apply(eval)
Y = mlb.fit_transform(df["abstract_domains"])
labels = mlb.classes_

# Tokenizer and encoding
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")

class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Train-test split
X_train, X_val, Y_train, Y_val = train_test_split(df["input_text"], Y, test_size=0.2, random_state=42)

train_dataset = MultiLabelDataset(X_train.tolist(), Y_train, tokenizer)
val_dataset = MultiLabelDataset(X_val.tolist(), Y_val, tokenizer)

# Load the model with multi-label setting
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/specter",
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
)

# Training arguments
training_args = TrainingArguments(
    output_dir=None,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    logging_steps=200
)

# Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.sigmoid(torch.tensor(logits)).numpy() > 0.5
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="macro"),
        "recall": recall_score(labels, preds, average="macro")
    }

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate
predictions = trainer.predict(val_dataset)
y_pred = (torch.sigmoid(torch.tensor(predictions.predictions)) > 0.5).int().numpy()
y_true = Y_val

print(classification_report(y_true, y_pred, target_names=labels))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/specter and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,0.3856
400,0.376
600,0.3672
800,0.3572
1000,0.3481
1200,0.3343
1400,0.3289
1600,0.3388
1800,0.3175
2000,0.295


                    precision    recall  f1-score   support

           anatomy       0.84      0.86      0.85      1311
       brain_atlas       0.20      0.11      0.15        70
         cognitive       0.60      0.58      0.59       656
           disease       0.98      1.00      0.99      1668
              gene       0.75      0.75      0.75       343
medical_procedures       0.85      0.89      0.87      1370
          metadata       0.28      0.13      0.18        83
          molecule       0.59      0.56      0.58       500
         phenotype       0.90      0.96      0.93      1493
           protein       0.53      0.46      0.49       233
          taxonomy       0.36      0.32      0.34        82
         treatment       0.75      0.80      0.77      1032

         micro avg       0.82      0.83      0.82      8841
         macro avg       0.64      0.62      0.62      8841
      weighted avg       0.81      0.83      0.82      8841
       samples avg       0.83      0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score

# Load your data
df = pd.read_csv("abstracts_titles_wdomains.csv")

# Combine title and title_domains as input text
df["input_text"] = df["title"] + " " + df["title_domains"].apply(lambda x: " ".join(eval(x)))

# Convert abstract_domains to binary multi-hot vectors
mlb = MultiLabelBinarizer()
df["abstract_domains"] = df["abstract_domains"].apply(eval)
Y = mlb.fit_transform(df["abstract_domains"])
labels = mlb.classes_

# Tokenizer and encoding
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Train-test split
X_train, X_val, Y_train, Y_val = train_test_split(df["input_text"], Y, test_size=0.2, random_state=42)

train_dataset = MultiLabelDataset(X_train.tolist(), Y_train, tokenizer)
val_dataset = MultiLabelDataset(X_val.tolist(), Y_val, tokenizer)

# Load the model with multi-label setting
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
)

# Training arguments
training_args = TrainingArguments(
    output_dir=None,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    logging_steps=200
)

# Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.sigmoid(torch.tensor(logits)).numpy() > 0.5
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="macro"),
        "recall": recall_score(labels, preds, average="macro")
    }

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate
predictions = trainer.predict(val_dataset)
y_pred = (torch.sigmoid(torch.tensor(predictions.predictions)) > 0.5).int().numpy()
y_true = Y_val

print(classification_report(y_true, y_pred, target_names=labels))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,0.3899
400,0.3804
600,0.3684
800,0.3583
1000,0.348
1200,0.3396
1400,0.3335
1600,0.3419
1800,0.3235
2000,0.3034


                    precision    recall  f1-score   support

           anatomy       0.84      0.87      0.86      1311
       brain_atlas       0.19      0.10      0.13        70
         cognitive       0.57      0.57      0.57       656
           disease       0.98      1.00      0.99      1668
              gene       0.75      0.69      0.72       343
medical_procedures       0.85      0.88      0.86      1370
          metadata       0.21      0.11      0.14        83
          molecule       0.58      0.60      0.59       500
         phenotype       0.90      0.95      0.92      1493
           protein       0.50      0.46      0.48       233
          taxonomy       0.32      0.29      0.30        82
         treatment       0.75      0.79      0.77      1032

         micro avg       0.81      0.83      0.82      8841
         macro avg       0.62      0.61      0.61      8841
      weighted avg       0.80      0.83      0.81      8841
       samples avg       0.82      0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [5]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score

# Load your data
df = pd.read_csv("abstracts_titles_wdomains.csv")

# Combine title and title_domains as input text
df["input_text"] = df["title"] + " " + df["title_domains"].apply(lambda x: " ".join(eval(x)))

# Convert abstract_domains to binary multi-hot vectors
mlb = MultiLabelBinarizer()
df["abstract_domains"] = df["abstract_domains"].apply(eval)
Y = mlb.fit_transform(df["abstract_domains"])
labels = mlb.classes_

# Tokenizer and encoding
tokenizer = AutoTokenizer.from_pretrained("allenai/specter_plus_plus")

class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Train-test split
X_train, X_val, Y_train, Y_val = train_test_split(df["input_text"], Y, test_size=0.2, random_state=42)

train_dataset = MultiLabelDataset(X_train.tolist(), Y_train, tokenizer)
val_dataset = MultiLabelDataset(X_val.tolist(), Y_val, tokenizer)

# Load the model with multi-label setting
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/specter_plus_plus",
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
)

# Training arguments
training_args = TrainingArguments(
    output_dir=None,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    logging_steps=200
)

# Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.sigmoid(torch.tensor(logits)).numpy() > 0.5
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="macro"),
        "recall": recall_score(labels, preds, average="macro")
    }

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate
predictions = trainer.predict(val_dataset)
y_pred = (torch.sigmoid(torch.tensor(predictions.predictions)) > 0.5).int().numpy()
y_true = Y_val

print(classification_report(y_true, y_pred, target_names=labels))


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/specter_plus_plus and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,0.3912
400,0.3783
600,0.3676
800,0.3575
1000,0.3477
1200,0.3368
1400,0.331
1600,0.3373
1800,0.3196
2000,0.2985


                    precision    recall  f1-score   support

           anatomy       0.84      0.88      0.86      1311
       brain_atlas       0.19      0.09      0.12        70
         cognitive       0.56      0.53      0.54       656
           disease       0.97      1.00      0.99      1668
              gene       0.74      0.69      0.71       343
medical_procedures       0.85      0.90      0.87      1370
          metadata       0.30      0.07      0.12        83
          molecule       0.61      0.56      0.58       500
         phenotype       0.90      0.96      0.93      1493
           protein       0.51      0.43      0.47       233
          taxonomy       0.41      0.29      0.34        82
         treatment       0.74      0.79      0.77      1032

         micro avg       0.82      0.83      0.82      8841
         macro avg       0.64      0.60      0.61      8841
      weighted avg       0.80      0.83      0.81      8841
       samples avg       0.83      0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [6]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score

# Load your data
df = pd.read_csv("abstracts_titles_wdomains.csv")

# Combine title and title_domains as input text
df["input_text"] = df["title"] + " " + df["title_domains"].apply(lambda x: " ".join(eval(x)))

# Convert abstract_domains to binary multi-hot vectors
mlb = MultiLabelBinarizer()
df["abstract_domains"] = df["abstract_domains"].apply(eval)
Y = mlb.fit_transform(df["abstract_domains"])
labels = mlb.classes_

# Tokenizer and encoding
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Train-test split
X_train, X_val, Y_train, Y_val = train_test_split(df["input_text"], Y, test_size=0.2, random_state=42)

train_dataset = MultiLabelDataset(X_train.tolist(), Y_train, tokenizer)
val_dataset = MultiLabelDataset(X_val.tolist(), Y_val, tokenizer)

# Load the model with multi-label setting
model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
)

# Training arguments
training_args = TrainingArguments(
    output_dir=None,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    logging_steps=200
)

# Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.sigmoid(torch.tensor(logits)).numpy() > 0.5
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="macro"),
        "recall": recall_score(labels, preds, average="macro")
    }

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate
predictions = trainer.predict(val_dataset)
y_pred = (torch.sigmoid(torch.tensor(predictions.predictions)) > 0.5).int().numpy()
y_true = Y_val

print(classification_report(y_true, y_pred, target_names=labels))


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,0.3933
400,0.3762
600,0.3624
800,0.3539
1000,0.3405
1200,0.3314
1400,0.3229
1600,0.3315
1800,0.3106
2000,0.2859


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

                    precision    recall  f1-score   support

           anatomy       0.83      0.89      0.86      1311
       brain_atlas       0.32      0.09      0.13        70
         cognitive       0.58      0.57      0.58       656
           disease       0.97      1.00      0.99      1668
              gene       0.81      0.69      0.75       343
medical_procedures       0.85      0.92      0.88      1370
          metadata       0.50      0.08      0.14        83
          molecule       0.64      0.55      0.59       500
         phenotype       0.90      0.96      0.93      1493
           protein       0.56      0.46      0.51       233
          taxonomy       0.51      0.30      0.38        82
         treatment       0.75      0.81      0.78      1032

         micro avg       0.83      0.84      0.83      8841
         macro avg       0.68      0.61      0.63      8841
      weighted avg       0.81      0.84      0.82      8841
       samples avg       0.83      0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Ensemble model of RF and XGB using specter for embedding

In [9]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

df = pd.read_csv("abstracts_titles_wdomains.csv")
df = df.dropna(subset=["title", "title_domains", "abstract_domains"])

import ast
df["title_domains"] = df["title_domains"].apply(ast.literal_eval)
df["abstract_domains"] = df["abstract_domains"].apply(ast.literal_eval)

mlb_title = MultiLabelBinarizer()
mlb_abstract = MultiLabelBinarizer()
title_domains_bin = mlb_title.fit_transform(df["title_domains"])
abstract_domains_bin = mlb_abstract.fit_transform(df["abstract_domains"])
abstract_labels = mlb_abstract.classes_

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
specter = AutoModel.from_pretrained("allenai/specter").to(device)
specter.eval()

def get_embedding(text):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        outputs = specter(**inputs)
        return outputs.last_hidden_state[:, 0, :].cpu().numpy().squeeze()

print("Encoding titles...")
title_embeddings = np.vstack([get_embedding(t) for t in df["title"]])

X = np.hstack((title_embeddings, title_domains_bin))
Y = abstract_domains_bin

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("Training Random Forest...")
rf = MultiOutputClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
rf.fit(X_train, y_train)

print("Training XGBoost...")
xgb = MultiOutputClassifier(XGBClassifier(n_estimators=200, eval_metric='logloss', use_label_encoder=False, random_state=42))
xgb.fit(X_train, y_train)

rf_probs = np.vstack([est.predict_proba(X_test)[:, 1] for est in rf.estimators_]).T
xgb_probs = np.vstack([est.predict_proba(X_test)[:, 1] for est in xgb.estimators_]).T

ensemble_probs = (rf_probs + xgb_probs) / 2
ensemble_preds = (ensemble_probs >= 0.5).astype(int)

print("\n=== Classification Report ===")
print(classification_report(y_test, ensemble_preds, target_names=abstract_labels))


Encoding titles...
Training Random Forest...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



=== Classification Report ===
                    precision    recall  f1-score   support

           anatomy       0.83      0.95      0.88      1311
       brain_atlas       1.00      0.07      0.13        70
         cognitive       0.68      0.45      0.54       656
           disease       0.97      1.00      0.99      1668
              gene       0.88      0.60      0.72       343
medical_procedures       0.83      0.98      0.90      1370
          metadata       1.00      0.08      0.16        83
          molecule       0.70      0.46      0.55       500
         phenotype       0.89      1.00      0.94      1493
           protein       0.72      0.30      0.43       233
          taxonomy       0.88      0.18      0.30        82
         treatment       0.75      0.85      0.80      1032

         micro avg       0.85      0.84      0.84      8841
         macro avg       0.84      0.58      0.61      8841
      weighted avg       0.84      0.84      0.82      8841
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Ensemble of RF, XGB and fine-tuned specter

In [12]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv("abstracts_titles_wdomains.csv")
df["combined_input"] = df["title"] + " " + df["title_domains"].apply(lambda x: " ".join(eval(x)))

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["abstract_domains"].apply(eval))

X_train, X_test, y_train, y_test = train_test_split(df["combined_input"], Y, test_size=0.2, random_state=42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
specter_encoder = AutoModel.from_pretrained("allenai/specter").to(device)

def get_specter_embeddings(texts, batch_size=8):
    all_embeddings = []
    specter_encoder.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            encodings = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = specter_encoder(**encodings)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            all_embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(all_embeddings)

print("Embedding the data using specter")
X_train_embeds = get_specter_embeddings(X_train.tolist())
X_test_embeds = get_specter_embeddings(X_test.tolist())

print("Training the RF model")
rf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf.fit(X_train_embeds, y_train)
print("Training the XGBoost model")
xgb = MultiOutputClassifier(XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42))
xgb.fit(X_train_embeds, y_train)

rf_preds = np.stack([est.predict_proba(X_test_embeds)[:, 1] for est in rf.estimators_], axis=1)
xgb_preds = np.stack([est.predict_proba(X_test_embeds)[:, 1] for est in xgb.estimators_], axis=1)

print("Fine-tuned allenai/specter for multilabel classification")
specter_clf = AutoModelForSequenceClassification.from_pretrained(
    "allenai/specter",  # update path
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
).to(device)
specter_clf.eval()

class SpecterDataset(Dataset):
    def __init__(self, texts):
        self.encodings = tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors="pt")
    def __len__(self):
        return len(self.encodings["input_ids"])
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

def get_classifier_preds(model, texts):
    dataset = SpecterDataset(texts)
    loader = DataLoader(dataset, batch_size=8)
    preds = []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            output = model(**batch)
            logits = output.logits
            probs = torch.sigmoid(logits).cpu().numpy()
            preds.append(probs)
    return np.vstack(preds)

specter_preds = get_classifier_preds(specter_clf, X_test)

avg_preds = (specter_preds + rf_preds + xgb_preds) / 3
final_preds = (avg_preds >= 0.5).astype(int)

print(classification_report(y_test, final_preds, target_names=mlb.classes_))
print("Micro F1:", f1_score(y_test, final_preds, average='micro'))
print("Macro F1:", f1_score(y_test, final_preds, average='macro'))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Embedding the data using specter
Training the RF model
Training the XGBoost model


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Fine-tuned allenai/specter for multilabel classification


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/specter and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                    precision    recall  f1-score   support

           anatomy       0.82      0.96      0.88      1311
       brain_atlas       0.83      0.07      0.13        70
         cognitive       0.67      0.48      0.56       656
           disease       0.97      1.00      0.99      1668
              gene       0.85      0.57      0.68       343
medical_procedures       0.83      0.97      0.89      1370
          metadata       1.00      0.08      0.16        83
          molecule       0.71      0.40      0.51       500
         phenotype       0.89      1.00      0.94      1493
           protein       0.66      0.30      0.41       233
          taxonomy       0.93      0.17      0.29        82
         treatment       0.73      0.83      0.78      1032

         micro avg       0.84      0.84      0.84      8841
         macro avg       0.82      0.57      0.60      8841
      weighted avg       0.83      0.84      0.82      8841
       samples avg       0.84      0.8

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Ensemble model with Logistic Regression as meta learner

In [13]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression # Import Logistic Regression

df = pd.read_csv("abstracts_titles_wdomains.csv")
df["combined_input"] = df["title"] + " " + df["title_domains"].apply(lambda x: " ".join(eval(x)))

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["abstract_domains"].apply(eval))

X_train, X_test, y_train, y_test = train_test_split(df["combined_input"], Y, test_size=0.2, random_state=42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
specter_encoder = AutoModel.from_pretrained("allenai/specter").to(device)

def get_specter_embeddings(texts, batch_size=8):
    all_embeddings = []
    specter_encoder.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            encodings = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = specter_encoder(**encodings)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            all_embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(all_embeddings)

print("Embedding the data using specter")
X_train_embeds = get_specter_embeddings(X_train.tolist())
X_test_embeds = get_specter_embeddings(X_test.tolist())

print("Training the RF model")
rf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf.fit(X_train_embeds, y_train)
print("Training the XGBoost model")
xgb = MultiOutputClassifier(XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42))
xgb.fit(X_train_embeds, y_train)

rf_train_preds_proba = np.stack([est.predict_proba(X_train_embeds)[:, 1] for est in rf.estimators_], axis=1)
rf_test_preds_proba = np.stack([est.predict_proba(X_test_embeds)[:, 1] for est in rf.estimators_], axis=1)

xgb_train_preds_proba = np.stack([est.predict_proba(X_train_embeds)[:, 1] for est in xgb.estimators_], axis=1)
xgb_test_preds_proba = np.stack([est.predict_proba(X_test_embeds)[:, 1] for est in xgb.estimators_], axis=1)

print("Fine-tuned allenai/specter for multilabel classification")
specter_clf = AutoModelForSequenceClassification.from_pretrained(
    "allenai/specter",
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
).to(device)
specter_clf.eval()

class SpecterDataset(Dataset):
    def __init__(self, texts):
        self.encodings = tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors="pt")
    def __len__(self):
        return len(self.encodings["input_ids"])
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

def get_classifier_preds_proba(model, texts):
    dataset = SpecterDataset(texts)
    loader = DataLoader(dataset, batch_size=8)
    preds = []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            output = model(**batch)
            logits = output.logits
            probs = torch.sigmoid(logits).cpu().numpy()
            preds.append(probs)
    return np.vstack(preds)

specter_train_preds_proba = get_classifier_preds_proba(specter_clf, X_train)
specter_test_preds_proba = get_classifier_preds_proba(specter_clf, X_test)


print("Training Logistic Regression meta-learner")

X_meta_train = np.hstack([specter_train_preds_proba, rf_train_preds_proba, xgb_train_preds_proba])
X_meta_test = np.hstack([specter_test_preds_proba, rf_test_preds_proba, xgb_test_preds_proba])

meta_learner = MultiOutputClassifier(LogisticRegression(solver='liblinear', random_state=42))
meta_learner.fit(X_meta_train, y_train)

final_preds_proba = meta_learner.predict_proba(X_meta_test) 
final_preds = np.array([prob[:, 1] for prob in final_preds_proba]).T 
final_preds = (final_preds >= 0.5).astype(int)


print("\n--- Evaluation with Logistic Regression Meta-Learner ---")
print(classification_report(y_test, final_preds, target_names=mlb.classes_))
print("Micro F1:", f1_score(y_test, final_preds, average='micro'))
print("Macro F1:", f1_score(y_test, final_preds, average='macro'))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Embedding the data using specter
Training the RF model
Training the XGBoost model


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Fine-tuned allenai/specter for multilabel classification


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/specter and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Logistic Regression meta-learner

--- Evaluation with Logistic Regression Meta-Learner ---
                    precision    recall  f1-score   support

           anatomy       0.82      0.95      0.88      1311
       brain_atlas       1.00      0.07      0.13        70
         cognitive       0.66      0.48      0.56       656
           disease       0.97      1.00      0.99      1668
              gene       0.85      0.57      0.68       343
medical_procedures       0.83      0.96      0.89      1370
          metadata       1.00      0.08      0.16        83
          molecule       0.68      0.43      0.53       500
         phenotype       0.89      1.00      0.94      1493
           protein       0.69      0.28      0.40       233
          taxonomy       0.93      0.17      0.29        82
         treatment       0.73      0.84      0.78      1032

         micro avg       0.84      0.84      0.84      8841
         macro avg       0.84      0.57      0.60      884

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Using BiLSTM

In [68]:
!pip install scikit-multilearn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
[0m

In [74]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import copy
from sklearn.metrics import roc_auc_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class SpecterDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return torch.tensor(self.embeddings[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)

class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_labels):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, x):
        x = x.unsqueeze(1)  # (B, 1, input_dim)
        lstm_out, _ = self.lstm(x)
        out = self.dropout(lstm_out[:, -1, :])
        return self.fc(out)

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, inputs, targets):
        bce_loss = self.bce(inputs, targets)
        pt = torch.sigmoid(inputs)
        pt = pt * targets + (1 - pt) * (1 - targets)
        focal_term = (1 - pt) ** self.gamma
        loss = self.alpha * focal_term * bce_loss
        return loss.mean() if self.reduction == 'mean' else loss.sum()

X = np.hstack((title_embeddings, title_domain_features))
Y = abstract_domain_targets

from skmultilearn.model_selection import iterative_train_test_split

X_train, Y_train, X_val, Y_val = iterative_train_test_split(X, Y, test_size=0.2)

train_dataset = SpecterDataset(X_train, Y_train)
val_dataset = SpecterDataset(X_val, Y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

model = BiLSTMClassifier(input_dim=X.shape[1], hidden_dim=128, num_labels=Y.shape[1]).to(device)
criterion = FocalLoss(alpha=1.0, gamma=2.0)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=20)

best_f1 = 0
epochs_no_improve = 0
early_stop_patience = 10
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(100):
    model.train()
    total_loss = 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).int().cpu().numpy()
            all_preds.append(preds)
            all_labels.append(yb.cpu().numpy())

    y_true = np.vstack(all_labels)
    y_pred = np.vstack(all_preds)
    val_macro_f1 = f1_score(y_true, y_pred, average='macro')

    # scheduler.step(val_macro_f1)

    print(f"Epoch {epoch+1}, Train Loss: {total_loss / len(train_loader):.4f}, Val Macro F1: {val_macro_f1:.4f}")

    if val_macro_f1 > best_f1:
        best_f1 = val_macro_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    # if epochs_no_improve >= early_stop_patience:
    #     print(f"Early stopping at epoch {epoch+1}")
    #     break

model.load_state_dict(best_model_wts)

all_labels = []
y_probs = []

model.eval()
with torch.no_grad():
    for xb, yb in loader:
        xb = xb.to(device)
        logits = model(xb)
        probs = torch.sigmoid(logits)
        y_probs.append(probs.cpu().numpy())
        all_labels.append(yb.cpu().numpy())

y_true = np.vstack(all_labels)
y_probs = np.vstack(y_probs)
y_pred = (y_probs > 0.5).astype(int)  # Add this line

# Compute ROC AUC
from sklearn.metrics import roc_auc_score

roc_auc_micro = roc_auc_score(y_true, y_probs, average='micro')
roc_auc_macro = roc_auc_score(y_true, y_probs, average='macro')

print(f"\nROC AUC (Micro): {roc_auc_micro:.4f}")
print(f"ROC AUC (Macro): {roc_auc_macro:.4f}")

# Classification report
from sklearn.metrics import classification_report

print("\n=== Final Validation Report ===")
print(classification_report(y_true, y_pred, target_names=domain_classes, zero_division=0))



Using device: cuda
Epoch 1, Train Loss: 0.1091, Val Macro F1: 0.4528
Epoch 2, Train Loss: 0.0941, Val Macro F1: 0.4980
Epoch 3, Train Loss: 0.0919, Val Macro F1: 0.4983
Epoch 4, Train Loss: 0.0906, Val Macro F1: 0.5062
Epoch 5, Train Loss: 0.0897, Val Macro F1: 0.5236
Epoch 6, Train Loss: 0.0885, Val Macro F1: 0.5210
Epoch 7, Train Loss: 0.0879, Val Macro F1: 0.5324
Epoch 8, Train Loss: 0.0871, Val Macro F1: 0.5354
Epoch 9, Train Loss: 0.0862, Val Macro F1: 0.5396
Epoch 10, Train Loss: 0.0859, Val Macro F1: 0.5487
Epoch 11, Train Loss: 0.0852, Val Macro F1: 0.5447
Epoch 12, Train Loss: 0.0846, Val Macro F1: 0.5461
Epoch 13, Train Loss: 0.0842, Val Macro F1: 0.5523
Epoch 14, Train Loss: 0.0836, Val Macro F1: 0.5449
Epoch 15, Train Loss: 0.0832, Val Macro F1: 0.5444
Epoch 16, Train Loss: 0.0825, Val Macro F1: 0.5580
Epoch 17, Train Loss: 0.0819, Val Macro F1: 0.5552
Epoch 18, Train Loss: 0.0814, Val Macro F1: 0.5578
Epoch 19, Train Loss: 0.0812, Val Macro F1: 0.5588
Epoch 20, Train Loss:

In [75]:
torch.save(model.state_dict(), "bilstm_multilabel_model.pth")
print("Model saved to bilstm_multilabel_model.pth")

Model saved to bilstm_multilabel_model.pth


In [77]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import copy
from sklearn.metrics import roc_auc_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class SpecterDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return torch.tensor(self.embeddings[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)

class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_labels):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, x):
        x = x.unsqueeze(1)  # (B, 1, input_dim)
        lstm_out, _ = self.lstm(x)
        out = self.dropout(lstm_out[:, -1, :])
        return self.fc(out)

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, inputs, targets):
        bce_loss = self.bce(inputs, targets)
        pt = torch.sigmoid(inputs)
        pt = pt * targets + (1 - pt) * (1 - targets)
        focal_term = (1 - pt) ** self.gamma
        loss = self.alpha * focal_term * bce_loss
        return loss.mean() if self.reduction == 'mean' else loss.sum()

X = np.hstack((title_embeddings, title_domain_features))
Y = abstract_domain_targets

from skmultilearn.model_selection import iterative_train_test_split

X_train, Y_train, X_val, Y_val = iterative_train_test_split(X, Y, test_size=0.2)

train_dataset = SpecterDataset(X_train, Y_train)
val_dataset = SpecterDataset(X_val, Y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

model = BiLSTMClassifier(input_dim=X.shape[1], hidden_dim=128, num_labels=Y.shape[1]).to(device)
criterion = FocalLoss(alpha=1.0, gamma=2.0)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=20)

best_f1 = 0
epochs_no_improve = 0
early_stop_patience = 20
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(100):
    model.train()
    total_loss = 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).int().cpu().numpy()
            all_preds.append(preds)
            all_labels.append(yb.cpu().numpy())

    y_true = np.vstack(all_labels)
    y_pred = np.vstack(all_preds)
    val_macro_f1 = f1_score(y_true, y_pred, average='macro')

    # scheduler.step(val_macro_f1)

    print(f"Epoch {epoch+1}, Train Loss: {total_loss / len(train_loader):.4f}, Val Macro F1: {val_macro_f1:.4f}")

    if val_macro_f1 > best_f1:
        best_f1 = val_macro_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= early_stop_patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

model.load_state_dict(best_model_wts)

all_labels = []
y_probs = []

model.eval()
with torch.no_grad():
    for xb, yb in loader:
        xb = xb.to(device)
        logits = model(xb)
        probs = torch.sigmoid(logits)
        y_probs.append(probs.cpu().numpy())
        all_labels.append(yb.cpu().numpy())

y_true = np.vstack(all_labels)
y_probs = np.vstack(y_probs)
y_pred = (y_probs > 0.5).astype(int)  # Add this line

# Compute ROC AUC
from sklearn.metrics import roc_auc_score

roc_auc_micro = roc_auc_score(y_true, y_probs, average='micro')
roc_auc_macro = roc_auc_score(y_true, y_probs, average='macro')

print(f"\nROC AUC (Micro): {roc_auc_micro:.4f}")
print(f"ROC AUC (Macro): {roc_auc_macro:.4f}")

# Classification report
from sklearn.metrics import classification_report

print("\n=== Final Validation Report ===")
print(classification_report(y_true, y_pred, target_names=domain_classes, zero_division=0))



Using device: cuda
Epoch 1, Train Loss: 0.1071, Val Macro F1: 0.4585
Epoch 2, Train Loss: 0.0944, Val Macro F1: 0.4970
Epoch 3, Train Loss: 0.0919, Val Macro F1: 0.5183
Epoch 4, Train Loss: 0.0904, Val Macro F1: 0.5193
Epoch 5, Train Loss: 0.0893, Val Macro F1: 0.5227
Epoch 6, Train Loss: 0.0885, Val Macro F1: 0.5260
Epoch 7, Train Loss: 0.0878, Val Macro F1: 0.5416
Epoch 8, Train Loss: 0.0871, Val Macro F1: 0.5381
Epoch 9, Train Loss: 0.0865, Val Macro F1: 0.5383
Epoch 10, Train Loss: 0.0860, Val Macro F1: 0.5523
Epoch 11, Train Loss: 0.0850, Val Macro F1: 0.5530
Epoch 12, Train Loss: 0.0848, Val Macro F1: 0.5550
Epoch 13, Train Loss: 0.0841, Val Macro F1: 0.5463
Epoch 14, Train Loss: 0.0836, Val Macro F1: 0.5598
Epoch 15, Train Loss: 0.0833, Val Macro F1: 0.5553
Epoch 16, Train Loss: 0.0828, Val Macro F1: 0.5543
Epoch 17, Train Loss: 0.0823, Val Macro F1: 0.5596
Epoch 18, Train Loss: 0.0817, Val Macro F1: 0.5534
Epoch 19, Train Loss: 0.0813, Val Macro F1: 0.5498
Epoch 20, Train Loss:

In [92]:
torch.save(model.state_dict(), "bilstm_multilabel_wes_model.pth")
print("Model saved to bilstm_multilabel_wes_model.pth")

Model saved to bilstm_multilabel_wes_model.pth


In [97]:
# With the learning scheduler and early stopping of 15
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import copy
from sklearn.metrics import roc_auc_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class SpecterDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return torch.tensor(self.embeddings[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)

class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_labels):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, x):
        x = x.unsqueeze(1)  # (B, 1, input_dim)
        lstm_out, _ = self.lstm(x)
        out = self.dropout(lstm_out[:, -1, :])
        return self.fc(out)

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, inputs, targets):
        bce_loss = self.bce(inputs, targets)
        pt = torch.sigmoid(inputs)
        pt = pt * targets + (1 - pt) * (1 - targets)
        focal_term = (1 - pt) ** self.gamma
        loss = self.alpha * focal_term * bce_loss
        return loss.mean() if self.reduction == 'mean' else loss.sum()

X = np.hstack((title_embeddings, title_domain_features))
Y = abstract_domain_targets

from skmultilearn.model_selection import iterative_train_test_split

X_train, Y_train, X_val, Y_val = iterative_train_test_split(X, Y, test_size=0.2)

train_dataset = SpecterDataset(X_train, Y_train)
val_dataset = SpecterDataset(X_val, Y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

model = BiLSTMClassifier(input_dim=X.shape[1], hidden_dim=128, num_labels=Y.shape[1]).to(device)
criterion = FocalLoss(alpha=1.0, gamma=2.0)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=20)

best_f1 = 0
epochs_no_improve = 0
early_stop_patience = 15
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(100):
    model.train()
    total_loss = 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).int().cpu().numpy()
            all_preds.append(preds)
            all_labels.append(yb.cpu().numpy())

    y_true = np.vstack(all_labels)
    y_pred = np.vstack(all_preds)
    val_macro_f1 = f1_score(y_true, y_pred, average='macro')

    scheduler.step(val_macro_f1)

    print(f"Epoch {epoch+1}, Train Loss: {total_loss / len(train_loader):.4f}, Val Macro F1: {val_macro_f1:.4f}")

    if val_macro_f1 > best_f1:
        best_f1 = val_macro_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= early_stop_patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

model.load_state_dict(best_model_wts)

all_labels = []
y_probs = []

model.eval()
with torch.no_grad():
    for xb, yb in loader:
        xb = xb.to(device)
        logits = model(xb)
        probs = torch.sigmoid(logits)
        y_probs.append(probs.cpu().numpy())
        all_labels.append(yb.cpu().numpy())

y_true = np.vstack(all_labels)
y_probs = np.vstack(y_probs)
y_pred = (y_probs > 0.5).astype(int)  

# Compute ROC AUC
from sklearn.metrics import roc_auc_score

roc_auc_micro = roc_auc_score(y_true, y_probs, average='micro')
roc_auc_macro = roc_auc_score(y_true, y_probs, average='macro')

print(f"\nROC AUC (Micro): {roc_auc_micro:.4f}")
print(f"ROC AUC (Macro): {roc_auc_macro:.4f}")

# Classification report
from sklearn.metrics import classification_report

print("\n=== Final Validation Report ===")
print(classification_report(y_true, y_pred, target_names=domain_classes, zero_division=0))



Using device: cuda
Epoch 1, Train Loss: 0.1087, Val Macro F1: 0.4785
Epoch 2, Train Loss: 0.0948, Val Macro F1: 0.4925
Epoch 3, Train Loss: 0.0923, Val Macro F1: 0.5148
Epoch 4, Train Loss: 0.0906, Val Macro F1: 0.5157
Epoch 5, Train Loss: 0.0900, Val Macro F1: 0.5375
Epoch 6, Train Loss: 0.0889, Val Macro F1: 0.5239
Epoch 7, Train Loss: 0.0879, Val Macro F1: 0.5199
Epoch 8, Train Loss: 0.0875, Val Macro F1: 0.5273
Epoch 9, Train Loss: 0.0867, Val Macro F1: 0.5412
Epoch 10, Train Loss: 0.0860, Val Macro F1: 0.5296
Epoch 11, Train Loss: 0.0857, Val Macro F1: 0.5390
Epoch 12, Train Loss: 0.0849, Val Macro F1: 0.5479
Epoch 13, Train Loss: 0.0843, Val Macro F1: 0.5371
Epoch 14, Train Loss: 0.0841, Val Macro F1: 0.5492
Epoch 15, Train Loss: 0.0834, Val Macro F1: 0.5587
Epoch 16, Train Loss: 0.0829, Val Macro F1: 0.5596
Epoch 17, Train Loss: 0.0825, Val Macro F1: 0.5553
Epoch 18, Train Loss: 0.0817, Val Macro F1: 0.5571
Epoch 19, Train Loss: 0.0815, Val Macro F1: 0.5545
Epoch 20, Train Loss:

In [98]:
torch.save(model.state_dict(), "bilstm_multilabel_wes15_sch_model.pth")
print("Model saved to bilstm_multilabel_wes15_sch_model.pth")

Model saved to bilstm_multilabel_wes15_sch_model.pth


In [103]:
# With the learning scheduler and early stopping of 15
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import copy
from sklearn.metrics import roc_auc_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class SpecterDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return torch.tensor(self.embeddings[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)

class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_labels):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, x):
        x = x.unsqueeze(1)  # (B, 1, input_dim)
        lstm_out, _ = self.lstm(x)
        out = self.dropout(lstm_out[:, -1, :])
        return self.fc(out)

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, inputs, targets):
        bce_loss = self.bce(inputs, targets)
        pt = torch.sigmoid(inputs)
        pt = pt * targets + (1 - pt) * (1 - targets)
        focal_term = (1 - pt) ** self.gamma
        loss = self.alpha * focal_term * bce_loss
        return loss.mean() if self.reduction == 'mean' else loss.sum()

X = np.hstack((title_embeddings, title_domain_features))
Y = abstract_domain_targets

from skmultilearn.model_selection import iterative_train_test_split

X_train, Y_train, X_val, Y_val = iterative_train_test_split(X, Y, test_size=0.2)

train_dataset = SpecterDataset(X_train, Y_train)
val_dataset = SpecterDataset(X_val, Y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

model = BiLSTMClassifier(input_dim=X.shape[1], hidden_dim=128, num_labels=Y.shape[1]).to(device)
criterion = FocalLoss(alpha=1.0, gamma=2.0)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=15)

best_f1 = 0
epochs_no_improve = 0
early_stop_patience = 15
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(100):
    model.train()
    total_loss = 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).int().cpu().numpy()
            all_preds.append(preds)
            all_labels.append(yb.cpu().numpy())

    y_true = np.vstack(all_labels)
    y_pred = np.vstack(all_preds)
    val_macro_f1 = f1_score(y_true, y_pred, average='macro')

    scheduler.step(val_macro_f1)

    print(f"Epoch {epoch+1}, Train Loss: {total_loss / len(train_loader):.4f}, Val Macro F1: {val_macro_f1:.4f}")

    if val_macro_f1 > best_f1:
        best_f1 = val_macro_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= early_stop_patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

model.load_state_dict(best_model_wts)

all_labels = []
y_probs = []

model.eval()
with torch.no_grad():
    for xb, yb in loader:
        xb = xb.to(device)
        logits = model(xb)
        probs = torch.sigmoid(logits)
        y_probs.append(probs.cpu().numpy())
        all_labels.append(yb.cpu().numpy())

y_true = np.vstack(all_labels)
y_probs = np.vstack(y_probs)
y_pred = (y_probs > 0.5).astype(int)  

# Compute ROC AUC
from sklearn.metrics import roc_auc_score

roc_auc_micro = roc_auc_score(y_true, y_probs, average='micro')
roc_auc_macro = roc_auc_score(y_true, y_probs, average='macro')

print(f"\nROC AUC (Micro): {roc_auc_micro:.4f}")
print(f"ROC AUC (Macro): {roc_auc_macro:.4f}")

# Classification report
from sklearn.metrics import classification_report

print("\n=== Final Validation Report ===")
print(classification_report(y_true, y_pred, target_names=domain_classes, zero_division=0))



Using device: cuda
Epoch 1, Train Loss: 0.1076, Val Macro F1: 0.4499
Epoch 2, Train Loss: 0.0944, Val Macro F1: 0.4941
Epoch 3, Train Loss: 0.0921, Val Macro F1: 0.5122
Epoch 4, Train Loss: 0.0905, Val Macro F1: 0.5191
Epoch 5, Train Loss: 0.0896, Val Macro F1: 0.5169
Epoch 6, Train Loss: 0.0888, Val Macro F1: 0.5364
Epoch 7, Train Loss: 0.0877, Val Macro F1: 0.5328
Epoch 8, Train Loss: 0.0871, Val Macro F1: 0.5447
Epoch 9, Train Loss: 0.0864, Val Macro F1: 0.5370
Epoch 10, Train Loss: 0.0855, Val Macro F1: 0.5423
Epoch 11, Train Loss: 0.0851, Val Macro F1: 0.5385
Epoch 12, Train Loss: 0.0847, Val Macro F1: 0.5503
Epoch 13, Train Loss: 0.0842, Val Macro F1: 0.5536
Epoch 14, Train Loss: 0.0836, Val Macro F1: 0.5486
Epoch 15, Train Loss: 0.0832, Val Macro F1: 0.5478
Epoch 16, Train Loss: 0.0825, Val Macro F1: 0.5559
Epoch 17, Train Loss: 0.0822, Val Macro F1: 0.5497
Epoch 18, Train Loss: 0.0813, Val Macro F1: 0.5497
Epoch 19, Train Loss: 0.0810, Val Macro F1: 0.5565
Epoch 20, Train Loss: