# BERT PRETRAINING ON POLITICAL TWEETS

This notebook resumes the pretraining of BERT with the Masked Language Modeling (MLM) objective on the [Election Tweets 2020 Dataset](https://www.kaggle.com/datasets/manchunhui/us-election-2020-tweets).

### PACKAGE INSTALLING

In [None]:
# Third-party packages used below: fasttext (language identification in
# process_dataset), tokenizers (WordPiece training in train_tokenizer) and
# transformers (BERT model, dataset, collator and Trainer).
!pip install fasttext
!pip install tokenizers -U
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 4.3 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.0-py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3164973 sha256=4a4fa02983dd44ed24bd8d637ccee446f24835c3f03489d5ed602baa520813f7
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.0-cp37-cp37m-manylinux

### DRIVE LINKING

In [None]:
# Mount Google Drive under /content/drive. The folders in the CONSTANTS cell
# are relative ("drive/MyDrive/..."), so they presumably rely on the working
# directory being /content -- TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### CONSTANTS

In [None]:
# Folder holding the raw CSV files ("csv_datasets/"), the fastText language
# identification model ("lid.176.ftz") and the generated text corpora.
data_folder = "drive/MyDrive/DeepLearning/Dataset/twitter_elections/"
# Folder handed to pretrain_bert as the Trainer output directory
# (checkpoints and final model), and used for the tokenizer file path.
model_folder = "drive/MyDrive/DeepLearning/Models/Pretrained_frozen/"

### IMPORTS

In [None]:
import os

import pandas 
import numpy as np

import re

import fasttext

import tokenizers
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, LineByLineTextDataset, BertModel, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

### CREATING THE DATASET 

In [None]:
def open_dataset(folder):
    """Collect the tweets from every CSV file found directly in ``folder``.

    Args:
        folder: Directory containing the CSV files. Each file must have a
            ``tweet`` column. A trailing path separator is accepted but not
            required.

    Returns:
        A list with the text of every non-missing tweet, file by file in
        alphabetical file-name order.
    """
    dataset = []

    # Sorted so the resulting corpus order does not depend on the order in
    # which the file system happens to list the directory.
    for f in sorted(os.listdir(folder)):
        path = os.path.join(folder, f)

        # Skip anything that is not a CSV file (e.g. the hidden
        # ".ipynb_checkpoints" directory that notebooks tend to create).
        if not f.lower().endswith('.csv') or not os.path.isfile(path):
            continue

        df = pandas.read_csv(path, lineterminator='\n')

        # Rows without text are parsed as NaN (a float); drop them so that
        # downstream string processing only ever sees strings.
        tweets = df['tweet'].dropna().to_list()

        dataset.extend(tweets)

    return dataset

def save_dataset(dataset, f_name):
    """Write ``dataset`` to the text file ``f_name``, one entry per line.

    Args:
        dataset: Sequence (list or NumPy array) of strings to store.
        f_name: Path of the output file; it is created or overwritten.
    """
    # The encoding is pinned to UTF-8: without it the file is written in the
    # platform's locale encoding, which makes the corpus machine dependent
    # and can fail on tweets containing non-ASCII characters.
    np.savetxt(f_name, dataset, fmt='%s', encoding='utf-8')

In [None]:
# Patterns used by the tweet cleaner, compiled once instead of per tweet.
_AMP_ENTITY_RE = re.compile(r"&amp;?")
_MENTION_RE = re.compile(r"@[A-Za-z0-9_]+")
_NOISE_RE = re.compile(r"[^\w\s.,:!?;'%]|#|http\S+")

# A word starting with one of these is a mention, a hashtag or a link.
_TRAILING_TAG_PREFIXES = ('@', '#', 'http')


def _remove_last_hml(tweet):
    """Drop the hashtags, mentions and links that close the tweet."""
    words = tweet.split(' ')

    # Walk backwards from the last word and delete tags until the first
    # regular word is met; empty strings (double spaces) are stepped over.
    for i in range(len(words) - 1, -1, -1):
        word = words[i]
        if not word:
            continue
        if not word.startswith(_TRAILING_TAG_PREFIXES):
            break
        del words[i]

    return " ".join(words)


def _has_too_many_mentions(tweet):
    """Tell whether '#' and '@' are too frequent for the tweet length."""
    num_words = len(tweet.split(' '))
    # One tag allowed per six words, at least 1 and at most 5.
    max_hash_mentions = max(1, min(num_words // 6, 5))
    return tweet.count('#') + tweet.count('@') > max_hash_mentions


def _clean_tweet(tweet):
    """Strip HTML ampersands, mentions, underscores, symbols and links."""
    # The optional ';' matters: ';' is a kept character, so removing only
    # "&amp" would leave a stray semicolon behind in the text.
    tweet = _AMP_ENTITY_RE.sub("", tweet)
    tweet = _MENTION_RE.sub("", tweet)
    tweet = tweet.replace("_", "")
    return _NOISE_RE.sub("", tweet)


def _is_english(model, tweet):
    """Tell whether the top label predicted by ``model`` is English."""
    labels, _ = model.predict(tweet)
    # Guard against an empty prediction instead of raising IndexError.
    return len(labels) > 0 and labels[0] == '__label__en'


def _process_tweet(tweet, model):
    """Return the cleaned tweet, or '' when the tweet must be discarded."""
    # Missing values read from a CSV arrive as NaN floats, not strings.
    if not isinstance(tweet, str):
        return ''

    tweet = tweet.replace("\n", " ")
    tweet = _remove_last_hml(tweet)

    if _has_too_many_mentions(tweet):
        return ''

    # Keep the two candidates as plain names before mentions are removed.
    tweet = tweet.replace('@realDonaldTrump', "Donald Trump")
    tweet = tweet.replace('@JoeBiden', 'Joe Biden')

    tweet = _clean_tweet(tweet)

    # Cheap length filter first, so the language model only runs on tweets
    # that can still be kept (and never on an empty string).
    words = tweet.split()
    if len(words) > 100 or len(words) < 4:
        return ''

    if not _is_english(model, tweet):
        return ''

    return " ".join(words)   # remove unnecessary spaces


def process_dataset(dataset, language_model_path, language_model=None):
    """Clean a collection of raw tweets and keep the usable English ones.

    Each tweet has its newlines replaced by spaces and its closing run of
    hashtags/mentions/links removed. It is discarded when it still contains
    too many '#'/'@', when it has fewer than 4 or more than 100 words after
    cleaning, or when the language model does not label it as English.

    Args:
        dataset: Iterable of raw tweets. Non-string items are discarded.
        language_model_path: Path of the fastText language identification
            model; only used when ``language_model`` is None.
        language_model: Optional already loaded model exposing
            ``predict(text) -> (labels, probabilities)``. Lets callers load
            the model once and reuse it across several datasets.

    Returns:
        The list of cleaned tweets, in their original relative order.
    """
    model = language_model
    if model is None:
        model = fasttext.load_model(language_model_path)

    processed = (_process_tweet(t, model) for t in dataset)
    return [t for t in processed if t != '']

In [None]:
def create_and_save_dataset():
    """Build the processed tweet corpus and store it as a text file.

    Loads the raw tweets from ``data_folder + "csv_datasets/"``, runs them
    through ``process_dataset`` with the language model located at
    ``data_folder + 'lid.176.ftz'`` and saves the outcome to
    ``data_folder + "twitter_dataset.txt"``.
    """
    raw_tweets = open_dataset(data_folder + "csv_datasets/")

    language_model_path = data_folder + 'lid.176.ftz'
    processed_tweets = process_dataset(raw_tweets, language_model_path)

    output_path = data_folder + "twitter_dataset.txt"
    save_dataset(processed_tweets, output_path)

In [None]:
def save_reduced_dataset(N):
    """Save a random sample of ``N`` tweets from the processed corpus.

    Reads ``data_folder + "twitter_dataset.txt"`` (one tweet per line),
    shuffles the tweets with NumPy's global random generator and writes the
    first ``N`` of them to ``data_folder + "twitter_dataset_red.txt"``.

    Args:
        N: Number of tweets to keep. When it exceeds the corpus size the
            whole (shuffled) corpus is saved.
    """
    # The file is read line by line rather than with np.loadtxt: recent
    # NumPy versions reject a newline as `delimiter`, loadtxt treats '#' as
    # the start of a comment, and it stores every line in a fixed-width
    # array as wide as the longest tweet.
    with open(data_folder + "twitter_dataset.txt", encoding='utf-8') as f:
        dataset = [line.rstrip('\n') for line in f]

    # Blank lines carry no tweet (np.loadtxt skipped them as well).
    dataset = [line for line in dataset if line]

    np.random.shuffle(dataset)
    save_dataset(dataset[0:N], data_folder + "twitter_dataset_red.txt")

### TOKENIZER TRAINING 
(not used)

In [None]:
def train_tokenizer(f_path):
    """Train a BERT WordPiece tokenizer on a text file and return it.

    Args:
        f_path: Path of the text file used as training corpus.

    Returns:
        The trained ``tokenizers.BertWordPieceTokenizer``.
    """
    training_options = {
        "vocab_size": 50000,
        "min_frequency": 3,
        "limit_alphabet": 1000,
        "special_tokens": ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    }

    wordpiece = tokenizers.BertWordPieceTokenizer(unk_token="[UNK]")
    wordpiece.train(files=[f_path], **training_options)

    return wordpiece

def save_tokenizer(tokenizer, f_path):
    """Store ``tokenizer`` at ``f_path`` through its own ``save`` method."""
    destination = f_path
    tokenizer.save(destination)

In [None]:
# Corpus to train the tokenizer on, and where the trained tokenizer would be
# stored. NOTE(review): no visible cell passes these to train_tokenizer /
# save_tokenizer, in line with the "(not used)" remark of this section.
f_path_load = data_folder + "twitter_dataset.txt"
f_path_save = model_folder + "tokenizer_red.json"

### BERT TRAINING

In [None]:
def freeze_bert_layers(model, num_to_freeze):
    """Disable gradients for the first ``num_to_freeze`` encoder layers.

    Args:
        model: Model exposing ``named_parameters()`` whose encoder layer
            parameters are named ``bert.encoder.layer.<index>.<...>``.
        num_to_freeze: Number of encoder layers to freeze, counted from
            layer 0. With 0 nothing is frozen.
    """
    # The trailing dot is essential: without it the prefix of layer 1
    # ('bert.encoder.layer.1') would also match layers 10, 11, ... and
    # silently freeze more layers than requested.
    prefixes_to_freeze = tuple(
        'bert.encoder.layer.{}.'.format(i) for i in range(0, num_to_freeze)
    )

    for name, param in model.named_parameters():
        if name.startswith(prefixes_to_freeze):
            param.requires_grad = False

def count_parameters(model):
    """Return how many scalar parameters of ``model`` require gradients."""
    trainable_total = 0

    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_total += parameter.numel()

    return trainable_total

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose AdamW optimizer only receives the trainable parameters.

    Parameters with ``requires_grad == False`` (e.g. frozen encoder layers)
    are left out of the optimizer. The remaining ones are split in two
    groups: weights, decayed with a factor of 0.001, and biases / LayerNorm
    weights, which are not decayed.
    """

    def __init__(self, model, args, data_collator, train_dataset):
        """Forward the given components to ``Trainer``.

        Args:
            model: Model to train.
            args: ``TrainingArguments`` of the run.
            data_collator: Collator used to build the batches.
            train_dataset: Dataset to train on.
        """
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            # Use the argument: the previous code read a global variable
            # named `dataset` and ignored what the caller passed in.
            train_dataset=train_dataset
        )

    def create_optimizer(self):
        """Create the optimizer (if not yet set) and return it."""
        # Same contract as Trainer.create_optimizer: keep an optimizer that
        # is already in place and hand the optimizer back to the caller.
        if self.optimizer is not None:
            return self.optimizer

        param_optimizer = [
            (n, p) for n, p in self.model.named_parameters()
            if p.requires_grad
        ]

        # Biases and LayerNorm weights are conventionally exempt from weight
        # decay. The previous list named "LayerNorm.bias", which "bias"
        # already covers, so LayerNorm weights were being decayed.
        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(
                        nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(
                        nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        # Honour the learning rate configured in the TrainingArguments
        # rather than falling back to AdamW's own default of 1e-3.
        self.optimizer = AdamW(
            optimizer_parameters, lr=self.args.learning_rate)

        return self.optimizer


def pretrain_bert(model, 
                  dataset, 
                  tokenizer,
                  model_folder,  
                  epochs=5, 
                  resume_from_checkpoint=False):
    """Run masked-language-model training on ``model`` and save the result.

    Args:
        model: Model to train.
        dataset: Training dataset handed to the ``Trainer``.
        tokenizer: Tokenizer used by the collator to mask and pad batches.
        model_folder: Output directory for the checkpoints and for the
            final model.
        epochs: Number of training epochs.
        resume_from_checkpoint: Forwarded unchanged to ``Trainer.train``.
    """
    # 15% of the tokens of every batch are selected for masking.
    mlm_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )

    # A checkpoint is written every 10,000 steps; at most two are kept.
    run_arguments = TrainingArguments(
        output_dir=model_folder,
        overwrite_output_dir=False,
        num_train_epochs=epochs,
        per_device_train_batch_size=32,
        save_steps=10_000,
        save_total_limit=2,
    )

    mlm_trainer = Trainer(
        model=model,
        args=run_arguments,
        data_collator=mlm_collator,
        train_dataset=dataset,
    )

    mlm_trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    mlm_trainer.save_model(model_folder)

In [None]:
# Reuse the stock tokenizer of bert-base-uncased (the tokenizer training
# section of this notebook is not used).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Build the training dataset from the reduced corpus file.
# NOTE(review): block_size=128 presumably caps the tokenized length of each
# line -- confirm against the transformers documentation.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=data_folder + "twitter_dataset_red.txt",
    block_size=128)

print('No. of lines: ', len(dataset))

# Start from the pretrained bert-base-uncased weights and freeze part of
# the encoder before continuing the pretraining.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
freeze_bert_layers(model, 2)

# Report the total and the still-trainable parameter counts.
print('No of parameters: ', model.num_parameters())
print('No of trainable parameters: ', count_parameters(model))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]



No. of lines:  500000


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


No of parameters:  109514298
No of trainable parameters:  81162810


In [None]:
# Continue the pretraining for a total of 3 epochs. resume_from_checkpoint
# is forwarded to Trainer.train; the cell output shows the run restarting
# from a checkpoint stored in model_folder.
pretrain_bert(model, 
              dataset, 
              tokenizer, 
              model_folder, 
              epochs=3, 
              resume_from_checkpoint=True)

Loading model from drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-20000.
You are resuming training from a checkpoint trained with 4.22.1 of Transformers but your current version is 4.22.2. This is not recommended and could yield to errors or unwanted behaviors.
***** Running training *****
  Num examples = 500000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 46875
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 20000
  Will skip the first 1 epochs then the first 4375 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/4375 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
20500,2.3145
21000,2.305
21500,2.3141
22000,2.3284
22500,2.3046
23000,2.3202
23500,2.288
24000,2.2963
24500,2.2885
25000,2.2979


Saving model checkpoint to drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-30000
Configuration saved in drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-30000/config.json
Model weights saved in drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-40000
Configuration saved in drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-40000/config.json
Model weights saved in drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-40000/pytorch_model.bin
Deleting older checkpoint [drive/MyDrive/DeepLearning/Models/Pretrained_frozen/checkpoint-20000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to drive/MyDrive/D