In [1]:
import pandas as pd
from google.colab import drive
import numpy as np
# drive.mount('/content/drive')

In [13]:
df = pd.read_parquet('/content/drive/MyDrive/train-00000-of-00001.parquet')
df_test = pd.read_parquet('/content/drive/MyDrive/test-00000-of-00001.parquet')
df_validate = pd.read_parquet('/content/drive/MyDrive/validation-00000-of-00001.parquet')

In [2]:
import tensorflow as tf
import sklearn.preprocessing as sk_pre
import re

## Date Pre-Processing

In [82]:
# Pre-preprocessing the text, removing the single letter words amd special characters from the text
def preprocess_text(sen):
    sentence = re.sub('[^a-zA-Z]', ' ', sen)
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [None]:
X, y = df.text.values, df.labels.values
X = [preprocess_text(''.join(x)) for x in X]
X_test, y_test = df_test.text.values, df_test.labels.values
X_test = [preprocess_text(''.join(x)) for x in X_test]
X_validate, y_validate = df_validate.text.values, df_validate.labels.values
X_validate = [preprocess_text(''.join(x)) for x in X_validate]

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X_test = tokenizer.texts_to_sequences(X_test)
X_validate = tokenizer.texts_to_sequences(X_validate)

In [None]:
multi_label = sk_pre.MultiLabelBinarizer()
y = multi_label.fit_transform(y)
y_test = multi_label.fit_transform(y_test)
y_validate = multi_label.fit_transform(y_validate)

In [None]:
max_length = 200
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=max_length, padding='post')
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_length, padding='post')
X_validate = tf.keras.preprocessing.sequence.pad_sequences(X_validate, maxlen=max_length, padding = 'post')

## Traing/Fine-Tune on given domain-specific dataset

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_length),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(10, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=10, validation_data=(X_validate, y_validate))

# Evaluate the model
evaluation = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {evaluation[1]}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.46700000762939453


Above we give an example of a LSTM based end to end simple model, which does not include any pre-trained model. This performance can be improved significantly if we use BERT based model since we have large and complex texts that are required to be labelled and BERT excels and capturing contextual information.

## Incorporating BERT language Model Internally in the system Archictecture

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report

In [70]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [71]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=10)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
X = df.text.values
X = [preprocess_text(''.join(x)) for x in X]
X_test= df_test.text.values
X_test = [preprocess_text(''.join(x)) for x in X_test]
X_validate = df_validate.text.values
X_validate = [preprocess_text(''.join(x)) for x in X_validate]
y = df.labels.values
y_test = df_test.labels.values
y_validate = df_validate.labels.values
multi_label = sk_pre.MultiLabelBinarizer()
y = multi_label.fit_transform(y)
y_test = multi_label.fit_transform(y_test)
y_validate = multi_label.fit_transform(y_validate)

In [73]:
train_encodings = tokenizer(X, truncation=True, padding=True, max_length = 128, return_tensors='pt')

In [74]:
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length = 128, return_tensors='pt')

In [75]:
y_tensor, y_test_tensor = torch.tensor(y, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)

In [76]:
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], y_tensor)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size = 2, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 2, shuffle = True)

In [77]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [78]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [80]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.sigmoid(logits).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Convert probabilities to binary predictions
threshold = 0.5
binary_preds = (np.array(all_preds) > threshold).astype(int)

# Print classification report
print(classification_report(all_labels, binary_preds, target_names=["1","2","3","4","5","6","7","8","9","10"]))

              precision    recall  f1-score   support

           1       0.86      0.57      0.68        76
           2       0.64      0.70      0.67       234
           3       0.74      0.44      0.55       196
           4       0.65      0.63      0.64       394
           5       0.78      0.34      0.47       188
           6       1.00      0.18      0.31        11
           7       0.74      0.76      0.75       106
           8       0.57      0.53      0.55        43
           9       0.57      0.12      0.21        32
          10       0.75      0.75      0.75       155

   micro avg       0.69      0.58      0.63      1435
   macro avg       0.73      0.50      0.56      1435
weighted avg       0.70      0.58      0.62      1435
 samples avg       0.68      0.62      0.62      1435



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Evaluating the effectiveness of fine tuning handling domains specific NLP Tasks


1. Here we are encoroporating a model internally, the model (BertForSequenceClassification) is pre-trained on a generic language modeling objective, and then it's fine-tuned for a multi-label text classification task using a dataset specific to our domain.
2. As we can see, using a BERT model have the micro avg f1-score of 0.63. and macro average score of 0.56, which is a better performance then end-to-end classification model using LSTM.