In [1]:
import os
import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import numpy as np
from transformers import pipeline

# Use fully qualified option names: the short pattern "max_columns" also matches
# "styler.render.max_columns" on recent pandas versions and raises OptionError.
pd.set_option("display.max_columns", 300)
pd.set_option('display.max_colwidth', None)
# Switch the working directory so the relative "data/..." paths used below resolve.
# NOTE(review): str.replace removes 'notebooks' / 'medi_crawler' wherever they occur
# in the path, not only as trailing directories -- confirm this matches the repo layout.
os.chdir(os.getcwd().replace('notebooks','').replace('medi_crawler',''))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import config as CONFIG


# Dataset

In [3]:
from sklearn.model_selection import train_test_split

# Read the labeled CSV once; features and labels are one-column frames taken from
# the same read, so they share the same row index.
labeled_df = pd.read_csv("data/medi_crawler/processed/labeled_data.csv")
X = labeled_df[['Title']]
y = labeled_df[['label']]

# Drop rows without a title; the same boolean mask keeps X and y aligned.
bool_not_nan = ~X['Title'].isna()
X = X[bool_not_nan]
y = y[bool_not_nan]


# Model

In [13]:
# Checkpoint used for tokenizer and model.
# Alternative that was previously assigned and immediately overwritten:
#   'distilbert-base-uncased-finetuned-sst-2-english'
MODEL_NAME = 'distilbert-base-uncased'

BATCH_SIZE = 8    # examples per training batch
N_EPOCHS = 30     # passes over the training set
MAX_LEN = 256     # max_length handed to the tokenizer (longer inputs are truncated)

In [14]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer

# Hold out 20% of the labeled rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The tokenizer is given plain lists of strings. Select the 'Title' column
# explicitly: DataFrame.squeeze() turns a one-row frame into a scalar, on which
# .tolist() fails.
X_train = X_train['Title'].tolist()
X_test = X_test['Title'].tolist()


In [15]:
# Tokenizer that belongs to the chosen checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Both splits are encoded with the same settings: truncate to MAX_LEN, pad,
# and return TensorFlow tensors.
_TOKENIZE_KWARGS = dict(truncation=True,
                        padding=True,
                        max_length=MAX_LEN,
                        return_tensors='tf')

train_encodings = tokenizer(X_train, **_TOKENIZE_KWARGS)
test_encodings = tokenizer(X_test, **_TOKENIZE_KWARGS)


In [16]:

# Create TensorFlow datasets of (encoded inputs, label) pairs.
# y_train / y_test are one-column DataFrames, so `.values` is 2-D; ravel() gives
# one scalar label per example instead of a length-1 vector.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train.values.ravel()))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test.values.ravel()))


In [17]:
# Rebuilds the (encoded inputs, label) datasets; this assignment is the one that
# takes effect for the cells below.
# y_train / y_test are one-column DataFrames, so `.values` is 2-D; ravel() gives
# one scalar label per example instead of a length-1 vector.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                    y_train.values.ravel()))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                    y_test.values.ravel()))

In [18]:
import tensorflow as tf

class Recall(tf.keras.metrics.Metric):
    """Streaming recall for a binary classifier that outputs one score per class.

    The predicted class is the argmax over the last axis of ``y_pred``; labels
    are expected to be 0/1 (or booleans). True positives and false negatives are
    accumulated across batches and ``result()`` returns TP / (TP + FN).
    """

    def __init__(self, name='recall', **kwargs):
        super().__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate TP / FN counts for one batch.

        ``sample_weight`` is accepted for API compatibility and is ignored.
        """
        # Flatten both tensors to 1-D before multiplying. Labels can arrive with
        # shape (batch, 1) while the argmax has shape (batch,); their product
        # would broadcast to (batch, batch) and inflate both counters.
        y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])
        y_pred = tf.reshape(tf.cast(tf.argmax(y_pred, axis=-1), tf.float32), [-1])
        true_positives = tf.cast(tf.math.count_nonzero(y_true * y_pred), tf.float32)
        false_negatives = tf.cast(tf.math.count_nonzero(y_true * (1 - y_pred)), tf.float32)
        self.true_positives.assign_add(true_positives)
        self.false_negatives.assign_add(false_negatives)

    def result(self):
        """Return the recall accumulated so far (epsilon guards against 0/0)."""
        recall = self.true_positives / (self.true_positives + self.false_negatives + tf.keras.backend.epsilon())
        return recall

    def reset_state(self):
        """Zero both counters."""
        self.true_positives.assign(0.0)
        self.false_negatives.assign(0.0)

    def reset_states(self):
        """Older spelling of ``reset_state``, kept so existing callers keep working."""
        self.reset_state()


In [19]:
# Load the pretrained checkpoint with a sequence-classification head
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
# Choose the optimizer
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Define the loss function; from_logits=True because the loss is fed the model's
# logits (the prediction cells apply softmax to `.logits` themselves)
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Compile the model with the custom recall metric
model.compile(optimizer=optimizerr,
              loss=losss,
              metrics=[Recall()])
# Train the model. The dataset is shuffled and batched here, so `batch_size` is
# not passed to fit(): Keras expects dataset inputs to already yield batches.
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE),
          epochs=N_EPOCHS)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Epoch 1/30
Epoch 2/30


  m.reset_state()


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x203dda97a20>

In [11]:
import pickle

# Serialize the trained model object to disk with pickle.
# NOTE(review): a pickle is tied to the library versions that wrote it and must
# only be loaded from trusted sources; consider the model's own save API as well.
with open('recall_trained_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)




Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...classifier
......vars
.........0
.........1
...distilbert
......vars
...distilbert\embeddings
......vars
.........0
.........1
...distilbert\embeddings\LayerNorm
......vars
.........0
.........1
...distilbert\embeddings\dropout
......vars
...distilbert\transformer
......vars
...distilbert\transformer\layer\tf_transformer_block
......vars
...distilbert\transformer\layer\tf_transformer_block\attention
......vars
...distilbert\transformer\layer\tf_transformer_block\attention\dropout
......vars
...distilbert\transformer\layer\tf_transformer_block\attention\k_lin
......vars
.........0
.........1
...distilbert\transformer\layer\tf_transformer_block\attention\out_lin
......vars
.........0
.........1
...distilbert\transformer\layer\tf_transformer_block\attention\q_lin
......vars
.........0
.........1
...distilbert\transformer\layer\tf_transformer_block\attention\v_lin
......vars
.........0
.........1
...distilbert\transformer

In [12]:

from sklearn.metrics import classification_report

# Predict test data. Keras `predict` expects a dataset that already yields
# batches, so batch the per-example test dataset first.
test_logits = model.predict(test_dataset.batch(BATCH_SIZE)).logits

# Convert logits to class probabilities (one row per example)
preds = tf.nn.softmax(test_logits, axis=1).numpy()

# Hard 0/1 prediction from the probability of class 1
y_preds = np.round(preds[:,1],0)

# Compare the predicted labels with the held-out true labels
report = classification_report(y_test, y_preds)

print(report)


              precision    recall  f1-score   support

       False       0.86      0.71      0.78        35
        True       0.47      0.69      0.56        13

    accuracy                           0.71        48
   macro avg       0.67      0.70      0.67        48
weighted avg       0.76      0.71      0.72        48



### Predict new unlabeled data

In [None]:
# Read the unlabeled CSV once; titles and pmids are one-column frames taken from
# the same read, so they share the same row index.
unlabeled_df = pd.read_csv("data/medi_crawler/processed/unlabeled_data.csv")
X_new_df = unlabeled_df[['Title']]
pmid_unlabeld = unlabeled_df[['pmid']]

# Keep only rows that have a title; the same mask keeps both frames aligned.
unlabeled_not_nan = ~X_new_df['Title'].isna()

X_new_df = X_new_df[unlabeled_not_nan]
pmid_unlabeld = pmid_unlabeld[unlabeled_not_nan]

In [None]:
# Titles as a 1-D array. Select the column explicitly: squeeze() would turn a
# one-row frame into a scalar, which has no `.values`.
X_new = X_new_df['Title'].values

encodings = tokenizer(X_new.tolist(),
                      max_length=MAX_LEN,
                      truncation=True,
                      padding=True)
# Transform to tf.data.Dataset (one element per title)
dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))

# Predict. Keras `predict` expects a dataset that already yields batches.
new_logits = model.predict(dataset.batch(BATCH_SIZE)).logits

# Convert logits to class probabilities (one row per title)
preds = tf.nn.softmax(new_logits, axis=1).numpy()

In [None]:
# Probability of class 1 for every title, indexed like X_new_df so the
# column-wise concat lines the rows up.
pred_column = pd.DataFrame(preds[:,1], index=X_new_df.index, columns=['pred'])
scored_titles = pd.concat([X_new_df, pred_column], axis=1)

# Highest-scoring titles first
output = scored_titles.sort_values('pred', ascending=False)
output.head()

In [None]:
# to_csv does not create missing directories, so make sure the target folder
# exists before writing the ranked predictions.
os.makedirs('data/medi_crawler/final', exist_ok=True)
output.to_csv('data/medi_crawler/final/predict_label_title_recall.csv')

In [None]:
def predict_proba(text_list, model, tokenizer, batch_size=1):
    """Return class probabilities for one or more texts.

    Args:
        text_list: a list of strings, or a single string (treated as a
            one-element list).
        model: sequence-classification model whose ``predict`` output exposes
            ``.logits``.
        tokenizer: tokenizer matching ``model``.
        batch_size: number of texts per prediction batch (default 1).

    Returns:
        NumPy array with one row per text, holding the softmax of the logits
        over axis 1.
    """
    # A bare string would be tokenized as one flat sequence, and
    # from_tensor_slices would then slice it token by token. Wrap it so it is
    # scored as a single text.
    if isinstance(text_list, str):
        text_list = [text_list]

    # Tokenize the text (truncated / padded using the notebook-level MAX_LEN)
    encodings = tokenizer(text_list,
                          max_length=MAX_LEN,
                          truncation=True,
                          padding=True)
    # Transform to tf.data.Dataset, one element per text
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))
    # Predict on batches
    logits = model.predict(dataset.batch(batch_size)).logits

    # Transform logits to an array of probabilities
    return tf.nn.softmax(logits, axis=1).numpy()

# Mean probability of class 1 over the rows returned for strings_list[3].
# NOTE(review): `strings_list` is not defined in any cell visible here -- confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
predict_proba(strings_list[3], model, tokenizer)[:,1].mean()