In [None]:
from transformers import DistilBertTokenizerFast, DistilBertModel
# Load the pretrained DistilBERT tokenizer. The truncation/padding/max_length
# options are also passed explicitly wherever the tokenizer is called on the
# texts, so tokenization does not rely on the values given here.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased',
                                                    truncation=True,padding='max_length',max_length=512)
# NOTE: the PyTorch `DistilBertModel` checkpoint is intentionally not loaded
# here. It was bound to `model`, never read, and `model` is rebound to the
# Keras classifier returned by `build_model` before its first use, so loading
# it only cost download time and memory (and required PyTorch to be installed).

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Hold out 30% of the rows for evaluation. The split is stratified on the
# label column so both partitions keep the same class proportions, and the
# seed is fixed so the partition is reproducible across runs.
# `df` is expected to exist already (it is not created in this cell).
X_train, X_test, y_train, y_test = train_test_split(
    df['Content'].to_list(),
    df['label'].to_list(),
    test_size=0.3,
    stratify=df['label'].to_list(),
    random_state=2020,
)

In [None]:
# Both splits are tokenized with identical settings: every text is truncated
# or padded to exactly 512 tokens and the result is returned as TF tensors.
_tokenize_opts = dict(truncation=True, padding='max_length', max_length=512, return_tensors='tf')
train_encodings = tokenizer(X_train, **_tokenize_opts)
test_encodings = tokenizer(X_test, **_tokenize_opts)

# One-hot encode the labels into two float32 columns, matching the
# two-unit softmax head and the categorical cross-entropy loss.
ytrain_encoded, ytest_encoded = (
    tf.keras.utils.to_categorical(split_labels, num_classes=2, dtype='float32')
    for split_labels in (y_train, y_test)
)

In [None]:
# Keep only the token-id tensors; the model takes a single `input_word_ids`
# input. The names are rebound in place because later cells read
# `train_encodings` / `test_encodings`, so each rebinding is guarded: once a
# name already holds a tensor, re-running this cell is a no-op instead of
# failing on a string index into a tensor.
if not tf.is_tensor(train_encodings):
    train_encodings = train_encodings['input_ids']
if not tf.is_tensor(test_encodings):
    test_encodings = test_encodings['input_ids']

In [None]:
def build_model(transformer, loss='categorical_crossentropy', max_len=512,
                num_classes=2, dropout_rate=0.3, learning_rate=3e-5):
    """Wrap a pretrained transformer encoder in a compiled Keras classifier.

    The network takes one int32 tensor of token ids, runs it through
    ``transformer``, takes the vector at sequence position 0 of the first
    output, applies dropout and projects it to ``num_classes`` softmax
    probabilities.

    Args:
        transformer: Callable Keras-compatible encoder (here a Hugging Face
            TF model). Its first output is indexed as
            ``[batch, position, hidden]``.
        loss: Loss passed to ``model.compile``. The default expects one-hot
            encoded targets.
        max_len: Fixed length of the token-id input.
        num_classes: Number of units in the softmax output layer.
        dropout_rate: Dropout applied to the pooled vector before the head.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` mapping token ids of shape
        ``(batch, max_len)`` to probabilities of shape ``(batch, num_classes)``.
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # Inputs are padded to a fixed length, so without an attention mask the
    # encoder would attend to padding tokens. The mask is derived from the ids
    # themselves, which keeps the model's single-input interface unchanged.
    # If the encoder exposes no pad id, fall back to the unmasked call.
    pad_token_id = getattr(getattr(transformer, "config", None), "pad_token_id", None)
    if pad_token_id is None:
        encoder_outputs = transformer(input_word_ids)
    else:
        attention_mask = tf.keras.layers.Lambda(
            lambda ids: tf.cast(tf.not_equal(ids, pad_token_id), tf.int32),
            name="attention_mask",
        )(input_word_ids)
        encoder_outputs = transformer(input_word_ids, attention_mask=attention_mask)

    sequence_output = encoder_outputs[0]
    # Vector at the first sequence position, used as the pooled representation
    # (presumably the [CLS] token for BERT-style tokenizers).
    cls_token = sequence_output[:, 0, :]
    # Dropout regularizes the classification head.
    x = tf.keras.layers.Dropout(dropout_rate)(cls_token)
    # One softmax unit per class.
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    model = tf.keras.Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, and metrics are
    # passed as a list. `F1_Score` is defined outside this function.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=[F1_Score()],
    )
    return model

In [None]:
# Imported in this cell so it does not depend on a bare `import transformers`
# having been executed somewhere else in the notebook.
from transformers import TFAutoModel

# TensorFlow DistilBERT encoder that becomes the backbone of the classifier.
transformer_layer = TFAutoModel.from_pretrained('distilbert-base-uncased')
# max_len matches the 512-token padding used when the texts were tokenized.
model = build_model(transformer_layer, max_len=512)
model.summary()

In [None]:
BATCH_SIZE = 16
AUTO = tf.data.experimental.AUTOTUNE

# Training pipeline: shuffle with a 64-element buffer, repeat indefinitely
# (so the number of steps per epoch must be given to `fit`), batch, prefetch.
Xtrain_dataset = (
    tf.data.Dataset.from_tensor_slices((train_encodings, ytrain_encoded))
    .shuffle(64)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Evaluation pipeline: no shuffling or repetition, so outputs stay in the
# same order as the test inputs. Uses a fixed batch size of 4.
Xtest_dataset = (
    tf.data.Dataset.from_tensor_slices((test_encodings, ytest_encoded))
    .batch(4)
    .prefetch(AUTO)
)

In [None]:
# Number of full batches in one pass over the training texts; passed as
# `steps_per_epoch` to define where each epoch ends.
n_steps = len(X_train) // BATCH_SIZE

# Train for 10 epochs, evaluating on the held-out set after each one.
train_history = model.fit(
    Xtrain_dataset,
    epochs=10,
    steps_per_epoch=n_steps,
    validation_data=Xtest_dataset,
)

In [None]:
# Run the trained model over the held-out set; `preds` holds one row of
# per-class scores for each test example.
preds = model.predict(Xtest_dataset, verbose=1)
# Collapse each row to the index of its highest-scoring class, giving a
# 1-D array of predicted labels.
pred_classes = preds.argmax(axis=1)