In [None]:
import os
import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import numpy as np
from transformers import pipeline
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer,AutoModelForSequenceClassification,AutoTokenizer
from tqdm import tqdm

pd.set_option("max_columns", 300)
pd.set_option('display.max_colwidth', None)
os.chdir(os.getcwd().replace('notebooks','').replace('medi_crawler',''))

In [None]:
import config as CONFIG


In [93]:
import torch
import torch.nn.functional as F

MODEL_NAME = "pritamdeka/PubMedBert-PubMed200kRCT"


BATCH_SIZE = 8
N_EPOCHS = 50
MAX_LEN = 256

from sklearn.model_selection import train_test_split

# Assuming you have a DataFrame called 'df' with features and labels
print("Load Data")
X = pd.DataFrame(pd.read_csv("data/medi_crawler/processed/labeled_data.csv")['Title'])
y = pd.DataFrame(pd.read_csv("data/medi_crawler/processed/labeled_data.csv")['label'])

bool_not_nan = ~X['Title'].isna()
X = X[bool_not_nan]
y = y[bool_not_nan]
y = y.astype(int)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert X_train and X_test to list of strings
X_train = X_train.squeeze().tolist()
X_test = X_test.squeeze().tolist()


import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

# Step 1: Data Preparation
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = int(self.labels[idx])  # Convert label to int
        
        # Tokenize the text
        tokens = tokenizer.tokenize(text)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)
        
        return input_ids, attention_mask, label

# Step 2: Dataset Creation
dataset = CustomDataset(X_train,  y_train['label'].values)



# Step 3: DataLoader Setup

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_ids_batch, attention_mask_batch, labels_batch = zip(*batch)
    
    # Convert lists to tensors
    input_ids_batch = [torch.tensor(ids) for ids in input_ids_batch]
    attention_mask_batch = [torch.tensor(mask) for mask in attention_mask_batch]
    
    # Pad or truncate input sequences to a fixed length
    input_ids_batch = pad_sequence(input_ids_batch, batch_first=True, padding_value=0)
    attention_mask_batch = pad_sequence(attention_mask_batch, batch_first=True, padding_value=0)
    
    return input_ids_batch, attention_mask_batch, torch.tensor(labels_batch)



batch_size = BATCH_SIZE

# Create the data loader with the custom collate function
print("Create the data loader with the custom collate function")
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)


# Step 4: Model Training Loop
print("Model Training Loop")
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = torch.optim.Adam(model.parameters())
# criterion = torch.nn.CrossEntropyLoss()
criterion = torch.nn.BCEWithLogitsLoss()

num_epochs = N_EPOCHS
from sklearn.metrics import recall_score

# ...

model.train()
for epoch in tqdm(range(num_epochs)):
    for batch_idx, batch in enumerate(dataloader):
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Convert logits to probabilities using softmax
        probabilities = F.softmax(logits, dim=1)

        # Calculate recall as a custom loss
        recall = recall_score(labels.cpu(), torch.argmax(probabilities, dim=1).cpu(), average='macro', zero_division=0)
        recall_loss = 1 - recall  # Use 1 - recall as the loss

        # Backpropagation and optimization
        recall_loss_tensor = torch.tensor(recall_loss, requires_grad=True)  # Convert to a PyTorch tensor
        recall_loss_tensor.backward()
        optimizer.step()

        # Print or store the recall value
        # print(f"Epoch {epoch+1}, Batch {batch_idx+1}, Recall: {recall}")


Load Data
Create the data loader with the custom collate function
Model Training Loop


100%|██████████| 50/50 [12:19<00:00, 14.78s/it]


In [103]:
import torch.nn.functional as F

# Create a CustomDataset for the test dataset
test_dataset = CustomDataset(X_test, y_test['label'].values)

# Create a DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Evaluate the model on the test dataset
model.eval()
predicted_scores = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, _ = batch  # Discard labels since they are None

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        scores = F.softmax(logits, dim=1)
        
        # Extract the probability of the positive class (index 1)
        positive_scores = scores[:, 1]

        predicted_scores.extend(positive_scores.tolist())

# Convert the predicted scores to a NumPy array for further analysis
predicted_scores = np.round(np.array(predicted_scores),2)

from sklearn.metrics import classification_report

# Assuming you have the predicted labels 'y_pred' and the true labels 'y_true'
report = classification_report(y_test, predicted_scores>0)

print(report)

              precision    recall  f1-score   support

           0       0.78      0.40      0.53        35
           1       0.30      0.69      0.42        13

    accuracy                           0.48        48
   macro avg       0.54      0.55      0.47        48
weighted avg       0.65      0.48      0.50        48



### predict new unlabeled data

In [99]:
np.round(predicted_scores,3)

array([0.   , 0.25 , 0.001, 0.257, 0.008, 0.08 , 0.016, 0.555, 0.   ,
       0.004, 0.001, 0.061, 0.002, 0.019, 0.002, 0.911, 0.001, 0.011,
       0.328, 0.007, 0.011, 0.938, 0.02 , 0.   , 0.994, 0.018, 0.   ,
       0.014, 0.027, 0.197, 0.   , 0.   , 0.   , 0.005, 0.022, 0.002,
       0.687, 0.005, 0.986, 0.004, 0.01 , 0.007, 0.105, 0.003, 0.116,
       0.013, 0.481, 0.95 ])

In [None]:
X_new = X_new_df.squeeze().values

encodings = tokenizer(X_new.squeeze().tolist(), 
                      max_length=MAX_LEN, 
                      truncation=True, 
                      padding=True)
# Transform to tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))

# Predict
preds = model.predict(dataset).logits

preds = tf.nn.softmax(preds, axis=1).numpy()  

In [None]:
output = pd.concat([X_new_df,pd.DataFrame(preds[:,1],index=X_new_df.index,columns=['pred'])],axis=1).sort_values('pred',ascending=False)
output.head()

In [None]:
output.to_csv('data/medi_crawler/final/predict_label_title_recall.csv')