In [None]:
import pandas as pd
import numpy as np
from gensim import models
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
!pip install transformers
import transformers
from transformers import BertTokenizer, TrainingArguments, Trainer, AutoTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
import torch
from torch.utils.data import random_split, TensorDataset, RandomSampler, SequentialSampler, DataLoader
from tqdm import tqdm

!pip install mlflow
import mlflow
import mlflow.sklearn
import mlflow.transformers

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


For reproducibility, let's set a single random seed.

In [None]:
import random
import os

RANDOM_SEED = 42
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and switches cuDNN to deterministic, non-autotuned kernels.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Hash randomization of the running interpreter is fixed at start-up, so
    # this only influences Python subprocesses launched afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different convolution algorithms between runs.
    torch.backends.cudnn.benchmark = False
seed_everything(RANDOM_SEED)

Now, let's load the train set and explore it.

In [None]:
# face_masks_train = pd.read_csv('https://raw.githubusercontent.com/kglandt/stance-detection-in-covid-19-tweets/main/dataset/face_masks_train.csv', index_col = 0)
# face_masks_train.head(10)
# ! kaggle datasets download -d thedevastator/tweeteval-a-multi-task-classification-benchmark
# ! pip install kaggle
# ! python3 kaggle datasets download -d sadoukhamzatarik/stance-detection-dataset
df_train = pd.read_csv('train_stances.csv')
df_train.head(10)

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree
5,'Nasa Confirms Earth Will Experience 6 Days of...,154,agree
6,Accused Boston Marathon Bomber Severely Injure...,962,unrelated
7,Identity of ISIS terrorist known as 'Jihadi Jo...,2033,unrelated
8,Banksy 'Arrested & Real Identity Revealed' Is ...,1739,agree
9,British Aid Worker Confirmed Murdered By ISIS,882,unrelated


In [None]:
df_train.shape

(49972, 3)

In [None]:
len(df_train['Body ID'].unique())

1683

In [None]:
df_bodies_train = pd.read_csv('train_bodies.csv')
df_bodies_train.head(10)

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...
5,8,There is so much fake stuff on the Internet in...
6,9,"(CNN) -- A meteorite crashed down in Managua, ..."
7,10,"Move over, Netflix and Hulu.\nWord has it that..."
8,11,We’ve all seen the traditional depictions of G...
9,13,A SOLDIER has been shot at Canada’s National W...


In [None]:
df_bodies_train.shape

(1683, 2)

In [None]:
# Show the article with Body ID 6. Select it positionally so the cell does not
# depend on that article happening to carry index label 3.
df_bodies_train.loc[df_bodies_train['Body ID'] == 6, 'articleBody'].iloc[0]

'Posting photos of a gun-toting child online, ISIS supporters announced that the group’s youngest soldier has died in combat.\n\nTwitter accounts linked to the Islamic State of Iraq and Al-Sham claimed that the child soldier “got martyred” with his father while fighting for the terrorist group in Syria.\n\nPhotos posted on Twitter showed the smiling boy in military fatigues holding weapons that, at times, are almost as large as his body. British media reported that the child was roughly 10 years old.\n\nThe photos of the boy first emerged in June, said Charlie Cooper, a researcher who monitors ISIS social media for the London-based Quilliam counter-extremism think tank.\nIn the past week, Mr. Cooper has noticed the hashtag “shibal_alBaghdadi” — which translates as “the cub of Baghdadi” — on Twitter accounts linked to ISIS.\n\nWhile ISIS fighters commonly refer to themselves as lions of the Islamic State, Mr. Cooper said, they refer to child soldiers as cubs of Abu Bakr Al-Baghdadi, ISI

And now let's load the test set.

In [None]:
df_test = pd.read_csv('competition_test_stances.csv')
df_bodies_test = pd.read_csv('competition_test_bodies.csv')

In [None]:
df_train['Stance'].unique()

array(['unrelated', 'agree', 'disagree', 'discuss'], dtype=object)

We will use label encoder to encode stance text labels to integers. We also save the classes numpy array to use it in the service.

In [None]:
# Fit the encoder on the training stances only and reuse the same mapping for
# the test stances. y_train / y_test follow the row order of df_train / df_test.
le = LabelEncoder()
y_train = le.fit_transform(df_train['Stance'])
print(le.classes_)
y_test = le.transform(df_test['Stance'])

['agree' 'disagree' 'discuss' 'unrelated']


In [None]:
np.save('classes.npy', le.classes_)

For our baseline model we will use a Random Forest classifier on top of mean word2vec embeddings taken from the pretrained Google News vectors.

In [None]:
!wget -O GoogleNews-vectors-negative300.bin.gz "drive.google.com/u/3/uc?id=1pPPHMNjJAb82-xYFFJLz_kqcnNhmwAhW&export=download&confirm=yes"
!gzip -d GoogleNews-vectors-negative300.bin.gz

--2023-11-11 16:37:19--  http://drive.google.com/u/3/uc?id=1pPPHMNjJAb82-xYFFJLz_kqcnNhmwAhW&export=download&confirm=yes
Resolving drive.google.com (drive.google.com)... 108.177.127.100, 108.177.127.102, 108.177.127.139, ...
Connecting to drive.google.com (drive.google.com)|108.177.127.100|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://drive.google.com/u/3/uc?id=1pPPHMNjJAb82-xYFFJLz_kqcnNhmwAhW&export=download&confirm=yes [following]
--2023-11-11 16:37:19--  https://drive.google.com/u/3/uc?id=1pPPHMNjJAb82-xYFFJLz_kqcnNhmwAhW&export=download&confirm=yes
Connecting to drive.google.com (drive.google.com)|108.177.127.100|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://drive.google.com/uc?id=1pPPHMNjJAb82-xYFFJLz_kqcnNhmwAhW&export=download&confirm=yes [following]
--2023-11-11 16:37:19--  https://drive.google.com/uc?id=1pPPHMNjJAb82-xYFFJLz_kqcnNhmwAhW&export=download&confirm=yes
Reusing existing con

In [None]:
wv_model = models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

We will use vectorizer class to combine headlines and article bodies and also to calculate mean vectors across sentences.

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent (headline, article body) pairs as mean word-embedding vectors.

    Article bodies are tokenised once at construction time and cached by their
    'Body ID'. ``transform`` then concatenates the headline tokens of each row
    with the cached body tokens and averages the embeddings of all tokens that
    are present in the embedding model.

    Args:
        word2vec: Embedding lookup supporting ``vector_size``, ``in`` and
            ``[]`` (the notebook passes gensim KeyedVectors).
        bodies: DataFrame with 'Body ID' and 'articleBody' columns.
        stopwords: Optional collection of lower-case words to drop.
    """

    def __init__(self, word2vec, bodies, stopwords=None):
        self.word2vec = word2vec
        self.dim = word2vec.vector_size
        self.stopwords = stopwords
        self.regex = re.compile("[A-Za-z']+")

        # Body ID -> token list; filled once so rows only need a dict lookup.
        self.body_vectors = dict()
        for _, row in bodies.iterrows():
            self.save_body_vector(row)

    def fit(self, X, y=None):
        """No-op; ``y`` is optional so ``fit(X)`` also works."""
        return self

    def transform(self, X):
        """Return one mean embedding per row of ``X``.

        Args:
            X: DataFrame with 'Headline' and 'Body ID' columns.

        Returns:
            Array with one row per input row and ``self.dim`` columns. Rows
            without any known token are all zeros.

        Raises:
            KeyError: If a row references a 'Body ID' that was not in ``bodies``.
        """
        sentences = [self.combine_text_features(row) for _, row in X.iterrows()]
        if not sentences:
            # Keep a 2-D result even when there is nothing to embed.
            return np.empty((0, self.dim))
        return np.array([self._mean_vector(words) for words in sentences])

    def _mean_vector(self, words):
        """Average the embeddings of the in-vocabulary tokens of ``words``."""
        known = [self.word2vec[w] for w in words if w in self.word2vec]
        if not known:
            return np.zeros(self.dim)
        return np.mean(known, axis=0)

    def vectorize(self, text):
        """Lower-case ``text`` and split it into alphabetic/apostrophe tokens."""
        words = self.regex.findall(text.lower().strip())
        if self.stopwords:
            return [word for word in words if word not in self.stopwords]
        else:
            return words

    def save_body_vector(self, row):
        """Cache the token list of one article body under its 'Body ID'."""
        self.body_vectors[row['Body ID']] = self.vectorize(row['articleBody'])

    def combine_text_features(self, row):
        """Return the headline tokens followed by the cached body tokens."""
        result = self.vectorize(row['Headline'])
        result.extend(self.body_vectors[row['Body ID']])
        return result

In [None]:
# One vectorizer per split, because each caches the article bodies of its own split.
# NOTE(review): the STOPWORDS set built in the imports cell is not passed here,
# so no stop-word filtering is applied — confirm this is intended.
train_vectorizer = MeanEmbeddingVectorizer(wv_model, df_bodies_train)
test_vectorizer = MeanEmbeddingVectorizer(wv_model, df_bodies_test)

To track experiments we will use MLFlow on Databricks community server.

In [None]:
!databricks configure --host https://community.cloud.databricks.com/
MLFLOW_SERVER_URL = 'databricks'
mlflow.set_tracking_uri(MLFLOW_SERVER_URL)

Username: antonegorov71@gmail.com
Password: 
Repeat for confirmation: 


In [None]:
experiment_name = '/Users/antonegorov71@gmail.com/baseline_experiment'
mlflow.set_experiment(experiment_name)

In [None]:
# Mean-embedding features for both splits (rows follow df_train / df_test order,
# matching y_train / y_test).
X_train_w2v = train_vectorizer.transform(df_train)
X_test_w2v  = test_vectorizer.transform(df_test)

# Train the baseline inside a single MLflow run and log its hyperparameter,
# its accuracy on the test split and the fitted model.
with mlflow.start_run():
  n_estimators = 100
  clf_w2v = RandomForestClassifier(n_estimators = n_estimators, random_state=RANDOM_SEED)
  clf_w2v = clf_w2v.fit(X_train_w2v, y_train)

  pred = clf_w2v.predict(X_test_w2v)
  accuracy = accuracy_score(y_test, pred)
  print('Accuracy:', accuracy)
  mlflow.log_param("n_estimators", n_estimators)
  mlflow.log_metric("accuracy", accuracy)
  mlflow.sklearn.log_model(clf_w2v, "model")

['agree' 'disagree' 'discuss' 'unrelated']
Accuracy: 0.7053870066501396




Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Now we will try BERT tokenizer and model. First, we will join dataframes with stance labels and article bodies.

In [None]:
# Attach the article body to every stance row. A left merge on the 'Body ID'
# column keeps the rows in df_train order, so they stay aligned with y_train;
# an index join on the non-unique 'Body ID' index does not guarantee that
# order on every pandas version.
df_with_bodies_train = (
    df_train.merge(df_bodies_train, on='Body ID', how='left')
            .set_index('Body ID')
)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Now we can tokenize the train examples and wrap them into data loaders for training.

In [None]:
max_length = 150
transformers.logging.set_verbosity_error()

# Encode (headline, body) pairs; only the body is truncated so the headline is
# always kept in full.
X_train_bert = tokenizer(df_with_bodies_train['Headline'].tolist(),
                         text_pair=df_with_bodies_train['articleBody'].tolist(), padding='max_length',
                         max_length=max_length, truncation='only_second', return_tensors = 'pt')

input_ids = X_train_bert['input_ids']
attention_masks = X_train_bert['attention_mask']
# Encode the labels from the same frame that produced the token ids. The joined
# frame is not guaranteed to keep df_train's row order, so reusing y_train here
# could pair inputs with the wrong stance.
labels = torch.tensor(le.transform(df_with_bodies_train['Stance']), dtype=torch.long)

In [None]:
# Hold out 10% of the encoded training pairs as a development set.
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
dev_size = len(dataset) - train_size

# A dedicated, seeded generator keeps the split identical across runs.
generator_bert = torch.Generator().manual_seed(RANDOM_SEED)
train_dataset, dev_dataset = random_split(dataset, [train_size, dev_size], generator=generator_bert)
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(dev_size))

44,974 training samples
4,998 validation samples


In [None]:
batch_size = 32

# Training batches are drawn in random order.
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )
# Validation batches are read sequentially; order does not matter for evaluation.
validation_dataloader = DataLoader(
            dev_dataset,
            sampler = SequentialSampler(dev_dataset),
            batch_size = batch_size
        )

We will do training on the available device.

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
epochs = 2

total_steps = len(train_dataloader) * epochs

Now, let's define two utility functions: one for formatting elapsed time and one for calculating accuracy.

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The duration is rounded to the nearest whole second first.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of integer class ids with one entry per example.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    expected_classes = labels.ravel()
    hits = (predicted_classes == expected_classes).sum()
    return hits / expected_classes.size

Function train() will be used in our experiments to train the given model for one epoch.

In [None]:
def train(model, optimizer, lr_scheduler=None):
  """Train ``model`` for one epoch and evaluate it on the validation set.

  Batches are read from the notebook-level ``train_dataloader`` and
  ``validation_dataloader`` and moved to the notebook-level ``device``.

  Args:
    model: Sequence-classification model whose output exposes 'loss' and
      'logits' when called with ``labels``.
    optimizer: Optimizer over ``model.parameters()``.
    lr_scheduler: Learning-rate scheduler stepped after every batch. When
      omitted, the notebook-level ``scheduler`` is used, so existing
      ``train(model, optimizer)`` calls behave as before.

  Returns:
    Tuple ``(avg_train_loss, avg_val_loss, avg_val_accuracy)``. The losses are
    averaged over batches; the accuracy is the fraction of correctly
    classified validation examples.
  """
  active_scheduler = scheduler if lr_scheduler is None else lr_scheduler

  print('Training...')
  t0 = time.time()
  total_train_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):
    # Progress report every 40 batches.
    if step % 40 == 0 and not step == 0:
      elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    model.zero_grad()

    # NOTE(review): the batches carry no segment ids, so token_type_ids stays
    # None even though the inputs are text pairs — confirm this is intended.
    res = model(b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels)
    loss = res['loss']
    total_train_loss += loss.item()

    loss.backward()

    # Clip the gradient norm to 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    active_scheduler.step()

  avg_train_loss = total_train_loss / len(train_dataloader)
  training_time = format_time(time.time() - t0)

  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))

  print("")
  print("Running Validation...")
  t0 = time.time()

  model.eval()

  total_eval_loss = 0
  correct_predictions = 0
  seen_examples = 0

  for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
      res = model(b_input_ids,
                  token_type_ids=None,
                  attention_mask=b_input_mask,
                  labels=b_labels)
    total_eval_loss += res['loss'].item()

    # Count hits per example so a shorter final batch is not over-weighted,
    # as it would be when averaging per-batch accuracies.
    predicted = res['logits'].argmax(dim=1)
    correct_predictions += (predicted == b_labels).sum().item()
    seen_examples += b_labels.size(0)

  avg_val_accuracy = correct_predictions / seen_examples
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  validation_time = format_time(time.time() - t0)

  print("  Validation Loss: {0:.2f}".format(avg_val_loss))
  print("  Validation took: {:}".format(validation_time))

  return avg_train_loss, avg_val_loss, avg_val_accuracy



Now we will train the model, using three different learning rates.

In [None]:
experiment_name = '/Users/antonegorov71@gmail.com/experiment-with-bert'

mlflow.set_experiment(experiment_name)
# Fine-tune a fresh pretrained model for each candidate learning rate.
for lr in (5e-5, 3e-5, 2e-5):
  model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels = 4,
        output_attentions = False,
        output_hidden_states = False
  )
  model = model.to(device)
  optimizer = AdamW(model.parameters(),
                    lr = lr,
                    eps = 1e-8
              )
  # train() reads this notebook-level `scheduler`; it is not passed as an argument.
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = 0,
                                              num_training_steps = total_steps)
  total_t0 = time.time()
  print(f'Started BERT training for lr = {lr}')
  for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    # Every epoch is logged as its own MLflow run (lr + epoch identify it), and
    # the model state after that epoch is stored with the run.
    with mlflow.start_run():
      avg_train_loss, avg_val_loss, avg_val_accuracy = train(model, optimizer)
      mlflow.log_param("lr", lr)
      mlflow.log_param('epoch', epoch_i + 1)
      mlflow.log_metric('train.loss', avg_train_loss)
      mlflow.log_metric('valid.loss', avg_val_loss)
      mlflow.log_metric('valid.accuracy', avg_val_accuracy)
      components = {
        "model": model,
        "tokenizer": tokenizer,
      }
      mlflow.transformers.log_model(components, "model")
  print("")
  print("Training complete!")
  print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

Started BERT training for lr = 5e-05

Training...
  Batch    40  of  1,406.    Elapsed: 0:00:33.
  Batch    80  of  1,406.    Elapsed: 0:01:07.
  Batch   120  of  1,406.    Elapsed: 0:01:39.
  Batch   160  of  1,406.    Elapsed: 0:02:13.
  Batch   200  of  1,406.    Elapsed: 0:02:46.
  Batch   240  of  1,406.    Elapsed: 0:03:19.
  Batch   280  of  1,406.    Elapsed: 0:03:52.
  Batch   320  of  1,406.    Elapsed: 0:04:25.
  Batch   360  of  1,406.    Elapsed: 0:04:58.
  Batch   400  of  1,406.    Elapsed: 0:05:31.
  Batch   440  of  1,406.    Elapsed: 0:06:04.
  Batch   480  of  1,406.    Elapsed: 0:06:37.
  Batch   520  of  1,406.    Elapsed: 0:07:10.
  Batch   560  of  1,406.    Elapsed: 0:07:43.
  Batch   600  of  1,406.    Elapsed: 0:08:16.
  Batch   640  of  1,406.    Elapsed: 0:08:49.
  Batch   680  of  1,406.    Elapsed: 0:09:23.
  Batch   720  of  1,406.    Elapsed: 0:09:56.
  Batch   760  of  1,406.    Elapsed: 0:10:29.
  Batch   800  of  1,406.    Elapsed: 0:11:02.
  Batch   

  mlflow.transformers.log_model(components, "model")


Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/11/11 18:23:53 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false



Training...
  Batch    40  of  1,406.    Elapsed: 0:00:33.
  Batch    80  of  1,406.    Elapsed: 0:01:07.
  Batch   120  of  1,406.    Elapsed: 0:01:39.
  Batch   160  of  1,406.    Elapsed: 0:02:13.
  Batch   200  of  1,406.    Elapsed: 0:02:46.
  Batch   240  of  1,406.    Elapsed: 0:03:19.
  Batch   280  of  1,406.    Elapsed: 0:03:52.
  Batch   320  of  1,406.    Elapsed: 0:04:25.
  Batch   360  of  1,406.    Elapsed: 0:04:58.
  Batch   400  of  1,406.    Elapsed: 0:05:31.
  Batch   440  of  1,406.    Elapsed: 0:06:04.
  Batch   480  of  1,406.    Elapsed: 0:06:37.
  Batch   520  of  1,406.    Elapsed: 0:07:10.
  Batch   560  of  1,406.    Elapsed: 0:07:43.
  Batch   600  of  1,406.    Elapsed: 0:08:16.
  Batch   640  of  1,406.    Elapsed: 0:08:49.
  Batch   680  of  1,406.    Elapsed: 0:09:22.
  Batch   720  of  1,406.    Elapsed: 0:09:55.
  Batch   760  of  1,406.    Elapsed: 0:10:29.
  Batch   800  of  1,406.    Elapsed: 0:11:02.
  Batch   840  of  1,406.    Elapsed: 0:11:35.




Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/11/11 18:44:31 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false



Training complete!
Total training took 0:41:17 (h:mm:ss)
Started BERT training for lr = 3e-05

Training...
  Batch    40  of  1,406.    Elapsed: 0:00:33.
  Batch    80  of  1,406.    Elapsed: 0:01:07.
  Batch   120  of  1,406.    Elapsed: 0:01:39.
  Batch   160  of  1,406.    Elapsed: 0:02:13.
  Batch   200  of  1,406.    Elapsed: 0:02:46.
  Batch   240  of  1,406.    Elapsed: 0:03:19.
  Batch   280  of  1,406.    Elapsed: 0:03:52.
  Batch   320  of  1,406.    Elapsed: 0:04:25.
  Batch   360  of  1,406.    Elapsed: 0:04:58.
  Batch   400  of  1,406.    Elapsed: 0:05:31.
  Batch   440  of  1,406.    Elapsed: 0:06:04.
  Batch   480  of  1,406.    Elapsed: 0:06:37.
  Batch   520  of  1,406.    Elapsed: 0:07:10.
  Batch   560  of  1,406.    Elapsed: 0:07:44.
  Batch   600  of  1,406.    Elapsed: 0:08:17.
  Batch   640  of  1,406.    Elapsed: 0:08:50.
  Batch   680  of  1,406.    Elapsed: 0:09:23.
  Batch   720  of  1,406.    Elapsed: 0:09:56.
  Batch   760  of  1,406.    Elapsed: 0:10:29.



Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/11/11 19:05:17 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false



Training...
  Batch    40  of  1,406.    Elapsed: 0:00:33.
  Batch    80  of  1,406.    Elapsed: 0:01:07.
  Batch   120  of  1,406.    Elapsed: 0:01:39.
  Batch   160  of  1,406.    Elapsed: 0:02:13.
  Batch   200  of  1,406.    Elapsed: 0:02:46.
  Batch   240  of  1,406.    Elapsed: 0:03:19.
  Batch   280  of  1,406.    Elapsed: 0:03:52.
  Batch   320  of  1,406.    Elapsed: 0:04:25.
  Batch   360  of  1,406.    Elapsed: 0:04:58.
  Batch   400  of  1,406.    Elapsed: 0:05:31.
  Batch   440  of  1,406.    Elapsed: 0:06:04.
  Batch   480  of  1,406.    Elapsed: 0:06:37.
  Batch   520  of  1,406.    Elapsed: 0:07:10.
  Batch   560  of  1,406.    Elapsed: 0:07:44.
  Batch   600  of  1,406.    Elapsed: 0:08:17.
  Batch   640  of  1,406.    Elapsed: 0:08:50.
  Batch   680  of  1,406.    Elapsed: 0:09:23.
  Batch   720  of  1,406.    Elapsed: 0:09:56.
  Batch   760  of  1,406.    Elapsed: 0:10:29.
  Batch   800  of  1,406.    Elapsed: 0:11:02.
  Batch   840  of  1,406.    Elapsed: 0:11:35.




Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/11/11 19:25:57 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false



Training complete!
Total training took 0:41:21 (h:mm:ss)
Started BERT training for lr = 2e-05

Training...
  Batch    40  of  1,406.    Elapsed: 0:00:33.
  Batch    80  of  1,406.    Elapsed: 0:01:07.
  Batch   120  of  1,406.    Elapsed: 0:01:39.
  Batch   160  of  1,406.    Elapsed: 0:02:13.
  Batch   200  of  1,406.    Elapsed: 0:02:46.
  Batch   240  of  1,406.    Elapsed: 0:03:19.
  Batch   280  of  1,406.    Elapsed: 0:03:52.
  Batch   320  of  1,406.    Elapsed: 0:04:25.
  Batch   360  of  1,406.    Elapsed: 0:04:58.
  Batch   400  of  1,406.    Elapsed: 0:05:32.
  Batch   440  of  1,406.    Elapsed: 0:06:05.
  Batch   480  of  1,406.    Elapsed: 0:06:38.
  Batch   520  of  1,406.    Elapsed: 0:07:11.
  Batch   560  of  1,406.    Elapsed: 0:07:44.
  Batch   600  of  1,406.    Elapsed: 0:08:17.
  Batch   640  of  1,406.    Elapsed: 0:08:50.
  Batch   680  of  1,406.    Elapsed: 0:09:23.
  Batch   720  of  1,406.    Elapsed: 0:09:56.
  Batch   760  of  1,406.    Elapsed: 0:10:29.



Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/11/11 19:46:35 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false



Training...
  Batch    40  of  1,406.    Elapsed: 0:00:33.
  Batch    80  of  1,406.    Elapsed: 0:01:07.
  Batch   120  of  1,406.    Elapsed: 0:01:39.
  Batch   160  of  1,406.    Elapsed: 0:02:13.
  Batch   200  of  1,406.    Elapsed: 0:02:46.
  Batch   240  of  1,406.    Elapsed: 0:03:19.
  Batch   280  of  1,406.    Elapsed: 0:03:52.
  Batch   320  of  1,406.    Elapsed: 0:04:25.
  Batch   360  of  1,406.    Elapsed: 0:04:58.
  Batch   400  of  1,406.    Elapsed: 0:05:31.
  Batch   440  of  1,406.    Elapsed: 0:06:04.
  Batch   480  of  1,406.    Elapsed: 0:06:37.
  Batch   520  of  1,406.    Elapsed: 0:07:10.
  Batch   560  of  1,406.    Elapsed: 0:07:43.
  Batch   600  of  1,406.    Elapsed: 0:08:16.
  Batch   640  of  1,406.    Elapsed: 0:08:50.
  Batch   680  of  1,406.    Elapsed: 0:09:23.
  Batch   720  of  1,406.    Elapsed: 0:09:56.
  Batch   760  of  1,406.    Elapsed: 0:10:29.
  Batch   800  of  1,406.    Elapsed: 0:11:02.
  Batch   840  of  1,406.    Elapsed: 0:11:35.




Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/11/11 20:07:13 INFO mlflow.store.artifact.cloud_artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false



Training complete!
Total training took 0:41:15 (h:mm:ss)


Now let's compare the results and choose the model with the best metrics.

In [None]:
# Read the baseline accuracy back from the tracking server.
client = mlflow.tracking.MlflowClient(MLFLOW_SERVER_URL)
baseline_experiment = client.get_experiment_by_name('/Users/antonegorov71@gmail.com/baseline_experiment')
# Takes the first run returned for the experiment — presumably the most recent
# one; confirm the ordering if the experiment holds several runs.
baseline_run = client.search_runs(baseline_experiment.experiment_id)[0]
baseline_accuracy = baseline_run.data.metrics['accuracy']
print(f'Baseline accuracy: {baseline_accuracy}')

Baseline accuracy: 0.7053870066501396


In [None]:
bert_experiment = client.get_experiment_by_name('/Users/antonegorov71@gmail.com/experiment-with-bert')

# For every learning rate keep the (run_id, valid.loss) of the epoch with the
# lowest validation loss.
epoch_with_min_loss_for_lr = dict()
for run in client.search_runs(bert_experiment.experiment_id):
    run_lr = run.data.params['lr']
    run_loss = run.data.metrics['valid.loss']
    incumbent = epoch_with_min_loss_for_lr.get(run_lr)
    if incumbent is None or run_loss < incumbent[1]:
        epoch_with_min_loss_for_lr[run_lr] = (run.info.run_id, run_loss)

# Among those per-lr winners, pick the one whose validation accuracy exceeds
# the best accuracy seen so far, starting from the baseline accuracy.
best_run_id, best_accuracy = baseline_run.info.run_id, baseline_accuracy
for run_id, loss in epoch_with_min_loss_for_lr.values():
    run = client.get_run(run_id)
    candidate_accuracy = run.data.metrics['valid.accuracy']
    if candidate_accuracy > best_accuracy:
        best_run_id, best_accuracy = run_id, candidate_accuracy

if best_run_id == baseline_run.info.run_id:
    print('Baseline model was not beaten')
else:
    run = client.get_run(best_run_id)
    lr = run.data.params['lr']
    print(f'Baseline model was beaten by BERT model, fine-tuned with lr={lr} and resulted in validation accuracy {best_accuracy}')

Baseline model was beaten by BERT model, fine-tuned with lr=2e-05 and resulted in validation accuracy 0.7236597664543524


For the final evaluation, we will load the best model from MLflow and calculate its accuracy on the test set. We will also save this model for use in our web service.

In [None]:
model_uri = f"runs:/{best_run_id}/model"
components = mlflow.transformers.load_model(model_uri, return_type='components')
tokenizer = components['tokenizer']
model = components['model']

  components = mlflow.transformers.load_model(model_uri, return_type='components')


Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/11/12 05:43:58 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false
2023/11/12 05:44:12 INFO mlflow.transformers: 'runs:/1a473d61cbcd4cf9aaf24856dfd8eff3/model' resolved as 'dbfs:/databricks/mlflow-tracking/2684602522190470/1a473d61cbcd4cf9aaf24856dfd8eff3/artifacts/model'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
output_dir = "./bert_saved_model/"
# exist_ok makes the cell safely re-runnable without a separate existence check.
os.makedirs(output_dir, exist_ok=True)
print("Saving model to %s" % output_dir)

# Save the wrapped model when `model` exposes one through `.module`.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Model is saved")

Saving model to ./bert_saved_model/
Model is saved


In [None]:
max_length = 150
# Attach bodies with a left merge on the 'Body ID' column so the rows keep
# df_test's order, then evaluate on the first 2500 pairs. An index join on the
# non-unique 'Body ID' index does not guarantee that order on every pandas
# version.
df_with_bodies_test = (
    df_test.merge(df_bodies_test, on='Body ID', how='left')
           .set_index('Body ID')
           .head(2500)
)
X_val_bert = tokenizer(df_with_bodies_test['Headline'].tolist(),
                         text_pair=df_with_bodies_test['articleBody'].tolist(), padding='max_length',
                         max_length=max_length, truncation='only_second', return_tensors = 'pt')

test_input_ids = X_val_bert['input_ids']
test_attention_masks = X_val_bert['attention_mask']
# Take the labels from the very rows that were tokenized, so inputs and stances
# cannot drift apart.
test_labels = torch.tensor(le.transform(df_with_bodies_test['Stance']), dtype=torch.long)

In [None]:
batch_size = 32

prediction_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [None]:
print('Predicting labels for {:,} test sentences...'.format(len(test_input_ids)))
model.to(device)
model.eval()

# Per-batch logits and gold labels, collected as NumPy arrays on the CPU.
predictions, true_labels = [], []

for batch in prediction_dataloader:
  b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

  # Inference only: no gradients are needed.
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  # Without labels, the first output element holds the logits.
  predictions.append(outputs[0].detach().cpu().numpy())
  true_labels.append(b_labels.to('cpu').numpy())

print('    DONE.')

Predicting labels for 2,500 test sentences...
    DONE.


In [None]:
print('Calculating Accuracy for each batch...')

# Count hits batch by batch but divide by the number of examples, so a shorter
# final batch does not get the same weight as a full one.
correct_predictions = 0
total_examples = 0
for batch_logits, batch_labels in zip(predictions, true_labels):
    correct_predictions += int(np.sum(np.argmax(batch_logits, axis=1) == batch_labels))
    total_examples += len(batch_labels)
avg_val_accuracy = correct_predictions / total_examples
print("  Accuracy:", avg_val_accuracy)

Calculating Accuracy for each batch...
  Accuracy: 0.7325949367088608


Finally, let's try to optimize this model by applying quantization. First, let's calculate the size of the model before and after the quantization.

In [None]:
def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is written to a throw-away temporary directory, so an
    existing ``temp.p`` in the working directory is never touched and nothing
    is left behind if saving fails.

    Args:
        model: Module whose ``state_dict()`` is measured.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "temp.p")
        torch.save(model.state_dict(), tmp_path)
        print('Size (MB):', os.path.getsize(tmp_path)/1e6)

In [None]:
print_size_of_model(model)

Size (MB): 438.006649


In [None]:
# Dynamically quantize the weights of every nn.Linear module to int8.
# The result is bound to a new name; quantize_dynamic is called without
# `inplace`, so `model` is presumably left untouched — TODO confirm.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

In [None]:
print_size_of_model(quantized_model)

Size (MB): 181.481301


Now, let's check the performance of the quantized model on the test set.

In [None]:
print('Predicting labels for {:,} test sentences...'.format(len(test_input_ids)))
# Dynamically quantized Linear layers run on the CPU only, so evaluate there
# regardless of which device the full-precision model used.
cpu_device = torch.device('cpu')
quantized_model.to(cpu_device)
quantized_model.eval()

# Per-batch logits and gold labels, collected as NumPy arrays.
predictions , true_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(cpu_device) for t in batch)

  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
      outputs = quantized_model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  # Without labels, the first output element holds the logits.
  logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 2,500 test sentences...
    DONE.


In [None]:
print('Calculating Accuracy for each batch...')

# Count hits batch by batch but divide by the number of examples, so a shorter
# final batch does not get the same weight as a full one.
correct_predictions = 0
total_examples = 0
for batch_logits, batch_labels in zip(predictions, true_labels):
    correct_predictions += int(np.sum(np.argmax(batch_logits, axis=1) == batch_labels))
    total_examples += len(batch_labels)
avg_val_accuracy = correct_predictions / total_examples
print("  Accuracy:", avg_val_accuracy)

Calculating Accuracy for each batch...
  Accuracy: 0.7325949367088608


As we can see, the accuracy hasn't degraded.

I used the following guide to quantize the BERT model: https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html. Unfortunately I didn't manage to save and load the quantized model successfully with the provided code - either saving or making predictions using the loaded model failed. So in the service I had to load the full model and quantize it on start.

So, in this part of the project we tried several models for stance classification for Fake News detection. We used a Random Forest classifier and a fine-tuned BERT model. We used MLflow to track the results of our experiments and to choose the best model. We also used quantization to decrease the size of the model and make it cheaper to serve.