# Task 3: Classification of COVID19 tweets containing symptoms

In [1]:
# Install the third-party packages used by this notebook (-q keeps pip's own output short).
# NOTE(review): versions are unpinned, and `contractions` / `imbalanced-learn` are not
# imported in any cell shown here — confirm they are still required.
!pip install -q transformers contractions imbalanced-learn ekphrasis

[K     |████████████████████████████████| 4.0 MB 10.9 MB/s 
[K     |████████████████████████████████| 80 kB 8.6 MB/s 
[K     |████████████████████████████████| 895 kB 46.1 MB/s 
[K     |████████████████████████████████| 77 kB 7.9 MB/s 
[K     |████████████████████████████████| 596 kB 67.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 8.4 MB/s 
[K     |████████████████████████████████| 287 kB 70.8 MB/s 
[K     |████████████████████████████████| 106 kB 37.9 MB/s 
[K     |████████████████████████████████| 45 kB 1.9 MB/s 
[K     |████████████████████████████████| 53 kB 1.6 MB/s 
[?25h  Building wheel for ekphrasis (setup.py) ... [?25l[?25hdone


## 1. Import all the necessary libraries and data files

In [19]:
import shutil
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from transformers import BertModel, BertTokenizerFast
from transformers import RobertaTokenizerFast, RobertaModel

# Suppress warnings and let pandas print cell text without truncating it.
warnings.filterwarnings(action="ignore")
pd.set_option("display.max_colwidth", None)

In [4]:
train_filename = "train.tsv"
val_filename = "valid.tsv"

In [5]:
# Read the tab-separated train and validation splits into DataFrames.
train, validation = (
    pd.read_csv(path, delimiter="\t") for path in (train_filename, val_filename)
)

In [6]:
print(f"Shape of training data is {train.shape} and validation data is {validation.shape}")

Shape of training data is (9067, 3) and validation data is (500, 3)


In [7]:
# Train top 5 rows
train.head().style.set_caption("Task 3: Train dataset")

Unnamed: 0,tweet_id,tweet,label
0,13729,A growing number of Covid-19 patients whose symptoms were initially mild are now facing mysterious long-term neurological problems https://t.co/If2SgRduuw,Lit-News_mentions
1,12399,"Medical experts advise that symptoms of the novel coronavirus include fever, shortness of breath, and stinky smelly pits and feet 😳🤪",Lit-News_mentions
2,20056,"@drdavidsamadi Hubby/I:same symptoms n November 2019 after a weekend trip 2 Vegas where bus loads of Chinese tourists.1 day fever,3 days sore throat,several weeks of fatigue.He's healthy,I'm not: diabetes,hypertension,obese, respiratory issues @ 53. No meds/pneumonia,we believe was COVID-19",Nonpersonal_reports
3,10175,"1/x In the April 11 BC briefing Dr. Bonnie Henry had mentioned that there's now reports of neurologic complications after COVID-19 infection, even during recovery. There's now anecdotal reporting of neurological manifestations from WUHAN patients in JAMA:https://t.co/7spTyk7l2M",Lit-News_mentions
4,12179,Major study PHOSP-COVID investigates health impacts of #COVID19 on hospitalised patients including #mentalhealth & neurological problems. Find out more https://t.co/JLTrz0BA7f @OxfordHealthNHS https://t.co/jZ2kPyPqmS,Lit-News_mentions


## 2. Prepare the data - Clean & Prepare for Model

In [8]:
# Remove the tweet_id column from both splits (modifies each DataFrame in place).
for split in (train, validation):
    split.drop(columns="tweet_id", inplace=True)

In [9]:
# Referred from: https://github.com/cbaziotis/ekphrasis

# Tweet pre-processing pipeline (ekphrasis): normalises noisy tokens to placeholder
# tags, annotates stylistic markers, unpacks hashtags/contractions and tokenises.
text_processor = TextPreProcessor(
    # terms that will be normalized (each entry listed once)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],

    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    # NOTE(review): lowercase=True lowercases every token, while the encoder loaded
    # later in this notebook is a *cased* BioBERT checkpoint — confirm this is intended.
    tokenizer=SocialTokenizer(lowercase=True, emojis=False).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


In [10]:
def _clean_tweet(raw_tweet):
    """Pre-process one raw tweet with ``text_processor`` and join its tokens with single spaces."""
    return " ".join(text_processor.pre_process_doc(raw_tweet))


# Store the cleaned text next to the raw tweet in both splits.
train['clean_tweets'] = train.tweet.map(_clean_tweet)
validation['clean_tweets'] = validation.tweet.map(_clean_tweet)

In [12]:
# Train top 5 rows after pre-processing
train[['label', 'clean_tweets']].head()

Unnamed: 0,label,clean_tweets
0,Lit-News_mentions,a growing number of covid - <number> patients whose symptoms were initially mild are now facing mysterious long - term neurological problems <url>
1,Lit-News_mentions,"medical experts advise that symptoms of the novel coronavirus include fever , shortness of breath , and stinky smelly pits and feet 😳 🤪"
2,Nonpersonal_reports,"<user> hubby / i : same symptoms n <date> after a weekend trip <number> vegas where bus loads of chinese tourists . <number> day fever , <number> days sore throat , several weeks of fatigue . he ' s healthy , i am not : diabetes , hypertension , obese , respiratory issues @ <number> . no meds / pneumonia , we believe was <allcaps> covid </allcaps> - <number>"
3,Lit-News_mentions,"<number> /x in the <date> bc briefing dr . bonnie henry had mentioned that there ' s now reports of neurologic complications after <allcaps> covid </allcaps> - <number> infection , even during recovery . there ' s now anecdotal reporting of neurological manifestations from <allcaps> wuhan </allcaps> patients in <allcaps> jama </allcaps> : <url>"
4,Lit-News_mentions,major study <allcaps> phosp </allcaps> - <allcaps> covid </allcaps> investigates health impacts of <hashtag> covid 19 </hashtag> on hospitalised patients including <hashtag> mental health </hashtag> & neurological problems . find out more <url> <user> <url>


In [13]:
# Define BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')

Downloading:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

In [14]:
# Encode the cleaned tweets of each split as PyTorch tensors: every split is padded
# to its own longest sequence and truncated at 128 tokens.
train_enc_bio = tokenizer.batch_encode_plus(
    train["clean_tweets"].tolist(),
    max_length=128,
    truncation=True,
    padding="longest",
    return_tensors="pt",
)
valid_enc_bio = tokenizer.batch_encode_plus(
    validation["clean_tweets"].tolist(),
    max_length=128,
    truncation=True,
    padding="longest",
    return_tensors="pt",
)

In [15]:
train_enc_bio.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [16]:
train_enc_bio.input_ids.shape, train_enc_bio.token_type_ids.shape, train_enc_bio.attention_mask.shape

(torch.Size([9067, 128]), torch.Size([9067, 128]), torch.Size([9067, 128]))

In [17]:
BATCH_SIZE = 32
N_EPOCHS = 5

In [18]:
def get_dataloader(encoding, target, shuffle=True, batch_size=None):
    """Bundle tokenizer output and labels into a batched ``DataLoader``.

    Args:
        encoding: Object exposing ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` tensors as attributes.
        target: Tensor of labels, one entry per encoded example.
        shuffle: When True (the default, matching the previous behaviour) examples
            are drawn with ``RandomSampler``; when False a ``SequentialSampler``
            keeps dataset order, so predictions line up with the source rows.
        batch_size: Examples per batch. ``None`` falls back to the module-level
            ``BATCH_SIZE`` constant.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, token_type_ids, attention_mask,
        target)`` batches.
    """
    if batch_size is None:
        # Resolved at call time so a later change to BATCH_SIZE is picked up.
        batch_size = BATCH_SIZE

    data = TensorDataset(encoding.input_ids, encoding.token_type_ids, encoding.attention_mask, target)
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

In [20]:
# Learn the label -> integer mapping from the training split only, then apply the
# same mapping to both splits.
le = LabelEncoder().fit(train["label"])
train["label"] = le.transform(train["label"])
validation["label"] = le.transform(validation["label"])

In [21]:
le.classes_

array(['Lit-News_mentions', 'Nonpersonal_reports', 'Self_reports'],
      dtype=object)

In [22]:
# Turn the integer-encoded labels into tensors, then pair them with the encodings.
train_labels = torch.tensor(train["label"].tolist())
valid_labels = torch.tensor(validation["label"].tolist())

train_dataloader_bio = get_dataloader(train_enc_bio, train_labels)
valid_dataloader_bio = get_dataloader(valid_enc_bio, valid_labels)

In [23]:
# Peek at the first training batch only and print the shape of each of its tensors.
for batch in train_dataloader_bio:
    input_ids, type_ids, attn_mask, target = batch
    print(*(tensor.shape for tensor in batch))
    break

torch.Size([32, 128]) torch.Size([32, 128]) torch.Size([32, 128]) torch.Size([32])


## 3. Model Building - BioBERT

In [24]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [26]:
class BioBERTclassifier(nn.Module):
    """Sequence classifier: a transformer encoder followed by one linear layer.

    The encoder's ``pooler_output`` is projected to ``n_classes`` logits; no
    softmax is applied here.
    """

    def __init__(self, transformer, n_classes=3):
        """Wrap ``transformer`` and attach the classification head.

        Args:
            transformer: Encoder module that is called with ``input_ids``,
                ``attention_mask`` and ``token_type_ids`` and whose output
                contains a ``"pooler_output"`` entry.
            n_classes: Number of output logits (default 3, as before).
        """
        super().__init__()
        self.transformer = transformer
        # Take the head's input width from the encoder config when available, so the
        # class also works with encoders that are not 768-wide; otherwise keep 768.
        hidden_size = getattr(getattr(transformer, "config", None), "hidden_size", 768)
        self.linear_layer = nn.Linear(hidden_size, n_classes)

    def forward(self, ip_ids, type_ids, attn_mask):
        """Return class logits for a batch.

        Args:
            ip_ids: Token ids, forwarded as ``input_ids``.
            type_ids: Segment ids, forwarded as ``token_type_ids``.
            attn_mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            Output of the linear head applied to the encoder's ``pooler_output``.
        """
        op = self.transformer(input_ids=ip_ids,
                              attention_mask=attn_mask,
                              token_type_ids=type_ids)
        return self.linear_layer(op["pooler_output"])

In [27]:
def count_parameter(model):
    """Return the total number of elements across ``model``'s parameters that require gradients."""
    trainable = 0
    for para in model.parameters():
        if para.requires_grad:
            trainable += para.numel()
    return trainable

In [28]:
transformer = AutoModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = BioBERTclassifier(transformer).to(device)
print(f"The model has {count_parameter(model):,} trainable parameters.")

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The model has 108,312,579 trainable parameters.


In [29]:
# for name, param in model.named_parameters():
#     if "pooler" in name or "linear" in name:#or "layer.11" in name or "layer.10" in name or "linear" in name:
#         param.requires_grad = True
#     else:
#         param.requires_grad = False
#   print(name, param.shape, param.requires_grad)

# print(f'The model has {count_parameter(model):,} trainable parameters')

In [30]:
# Define the loss function and the optimizer
criterion = torch.nn.CrossEntropyLoss()
optim = torch.optim.AdamW(model.parameters(), lr = 2e-5)

#### Train BioBERT Model

In [31]:
def train_model(model, dataloader, clip=1.0):
    """Train ``model`` for one pass over ``dataloader``.

    Uses the module-level ``optim`` (optimizer), ``criterion`` (loss) and
    ``device`` objects.

    Args:
        model: Module called as ``model(input_ids, type_ids, attn_mask)``.
        dataloader: Iterable of ``(input_ids, type_ids, attn_mask, y)`` batches.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Tuple ``(mean loss per batch, micro-averaged F1 over all batches)``.
    """
    model.train()

    epoch_loss = 0
    batch_num = 0
    pred, target = [], []

    # Wrap the dataloader itself rather than enumerate(...): tqdm can then read its
    # length and display a total and ETA instead of a bare iteration counter.
    for batch in tqdm(dataloader):
        input_ids, type_ids, attn_mask, y = tuple(row.to(device) for row in batch)

        optim.zero_grad()
        output = model(input_ids, type_ids, attn_mask)
        loss = criterion(output, y)
        loss.backward()

        # Clip gradients before the optimizer step.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim.step()

        epoch_loss += loss.item()
        batch_num += 1
        pred.extend(torch.argmax(output, -1).tolist())
        target.extend(y.tolist())

    return epoch_loss/batch_num, f1_score(target, pred, average='micro')

def evaluate(model, dataloader):
    """Score ``model`` on ``dataloader`` without computing gradients.

    Uses the module-level ``criterion`` and ``device`` objects.

    Returns:
        Tuple ``(mean loss per batch, micro-averaged F1, predictions, targets)``,
        where the last two are flat Python lists in the order batches were seen.
    """
    model.eval()

    running_loss, n_batches = 0, 0
    pred, target = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, type_ids, attn_mask, y = tuple(row.to(device) for row in batch)

            output = model(input_ids, type_ids, attn_mask)
            running_loss += criterion(output, y).item()
            n_batches += 1

            pred += torch.argmax(output, -1).tolist()
            target += y.tolist()

    mean_loss = running_loss / n_batches
    micro_f1 = f1_score(target, pred, average='micro')
    return mean_loss, micro_f1, pred, target

In [32]:
best_valid_loss = float('inf')
total_train_loss, total_valid_loss = list(), list()

In [33]:
# One training pass and one validation pass per epoch. The weights with the lowest
# validation loss so far are kept separately; a numbered checkpoint is written every epoch.
for epoch in tqdm(range(N_EPOCHS)):
    epoch_no = epoch + 1

    train_loss, train_f1_score = train_model(model, train_dataloader_bio)
    total_train_loss.append(train_loss)

    valid_loss, valid_f1_score, pred, target = evaluate(model, valid_dataloader_bio)
    total_valid_loss.append(valid_loss)

    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        # Keep the predictions of the best epoch for the report cells below.
        best_pred, best_target = pred, target
        torch.save(model.state_dict(), "model_least_loss.pt")
        print("\nBest Model Saved!!\n")

    torch.save(model.state_dict(), f"model_checkpoint_bio{epoch_no}.pt")
    print("\nCheckpoint Model Saved!\n")

    print(f"Epoch: {epoch_no:02}")
    print(f"Train Total Loss: {train_loss:.3f} | Train F1 Score: {train_f1_score:.3f}")
    print(f"Valid Total Loss: {valid_loss:.3f} | Valid F1 Score: {valid_f1_score:.3f}")
    print("-" * 20)

  0%|          | 0/5 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  1.13it/s][A
2it [00:01,  1.40it/s][A
3it [00:02,  1.50it/s][A
4it [00:02,  1.54it/s][A
5it [00:03,  1.58it/s][A
6it [00:03,  1.60it/s][A
7it [00:04,  1.61it/s][A
8it [00:05,  1.62it/s][A
9it [00:05,  1.62it/s][A
10it [00:06,  1.63it/s][A
11it [00:06,  1.64it/s][A
12it [00:07,  1.64it/s][A
13it [00:08,  1.64it/s][A
14it [00:08,  1.64it/s][A
15it [00:09,  1.63it/s][A
16it [00:10,  1.63it/s][A
17it [00:10,  1.63it/s][A
18it [00:11,  1.63it/s][A
19it [00:11,  1.63it/s][A
20it [00:12,  1.62it/s][A
21it [00:13,  1.59it/s][A
22it [00:13,  1.59it/s][A
23it [00:14,  1.60it/s][A
24it [00:15,  1.60it/s][A
25it [00:15,  1.61it/s][A
26it [00:16,  1.62it/s][A
27it [00:16,  1.61it/s][A
28it [00:17,  1.62it/s][A
29it [00:18,  1.63it/s][A
30it [00:18,  1.62it/s][A
31it [00:19,  1.62it/s][A
32it [00:19,  1.62it/s][A
33it [00:20,  1.63it/s][A
34it [00:21,  1.63it/s][A
35it [00:21,  1.63it/s][A
36it


Best Model Saved!!



 20%|██        | 1/5 [03:13<12:54, 193.53s/it]


Checkpoint Model Saved!

Epoch: 01
Train Total Loss: 0.223 | Train F1 Score: 0.926
Valid Total Loss: 0.066 | Valid F1 Score: 0.976
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.35it/s][A
2it [00:01,  1.44it/s][A
3it [00:02,  1.47it/s][A
4it [00:02,  1.48it/s][A
5it [00:03,  1.49it/s][A
6it [00:04,  1.48it/s][A
7it [00:04,  1.49it/s][A
8it [00:05,  1.48it/s][A
9it [00:06,  1.48it/s][A
10it [00:06,  1.48it/s][A
11it [00:07,  1.48it/s][A
12it [00:08,  1.48it/s][A
13it [00:08,  1.47it/s][A
14it [00:09,  1.47it/s][A
15it [00:10,  1.47it/s][A
16it [00:10,  1.47it/s][A
17it [00:11,  1.46it/s][A
18it [00:12,  1.46it/s][A
19it [00:12,  1.47it/s][A
20it [00:13,  1.47it/s][A
21it [00:14,  1.47it/s][A
22it [00:14,  1.47it/s][A
23it [00:15,  1.47it/s][A
24it [00:16,  1.46it/s][A
25it [00:17,  1.46it/s][A
26it [00:17,  1.46it/s][A
27it [00:18,  1.45it/s][A
28it [00:19,  1.45it/s][A
29it [00:19,  1.45it/s][A
30it [00:20,  1.45it/s][A
31it [00:21,  1.45it/s][A
32it [00:21,  1.45it/s][A
33it [00:22,  1.45it/s][A
34it [00:23,  1.44it/s][A
35it [00:23,  1.44it/s][A
36it [00:24,  1.44it/s][A
37it [00:25,  


Best Model Saved!!



 40%|████      | 2/5 [06:35<09:55, 198.35s/it]


Checkpoint Model Saved!

Epoch: 02
Train Total Loss: 0.104 | Train F1 Score: 0.969
Valid Total Loss: 0.054 | Valid F1 Score: 0.984
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.37it/s][A
2it [00:01,  1.43it/s][A
3it [00:02,  1.46it/s][A
4it [00:02,  1.46it/s][A
5it [00:03,  1.46it/s][A
6it [00:04,  1.46it/s][A
7it [00:04,  1.46it/s][A
8it [00:05,  1.47it/s][A
9it [00:06,  1.47it/s][A
10it [00:06,  1.47it/s][A
11it [00:07,  1.47it/s][A
12it [00:08,  1.47it/s][A
13it [00:08,  1.47it/s][A
14it [00:09,  1.46it/s][A
15it [00:10,  1.46it/s][A
16it [00:10,  1.46it/s][A
17it [00:11,  1.47it/s][A
18it [00:12,  1.47it/s][A
19it [00:12,  1.47it/s][A
20it [00:13,  1.46it/s][A
21it [00:14,  1.46it/s][A
22it [00:15,  1.46it/s][A
23it [00:15,  1.46it/s][A
24it [00:16,  1.45it/s][A
25it [00:17,  1.45it/s][A
26it [00:17,  1.45it/s][A
27it [00:18,  1.45it/s][A
28it [00:19,  1.45it/s][A
29it [00:19,  1.45it/s][A
30it [00:20,  1.45it/s][A
31it [00:21,  1.45it/s][A
32it [00:21,  1.45it/s][A
33it [00:22,  1.45it/s][A
34it [00:23,  1.45it/s][A
35it [00:24,  1.45it/s][A
36it [00:24,  1.45it/s][A
37it [00:25,  


Checkpoint Model Saved!

Epoch: 03
Train Total Loss: 0.070 | Train F1 Score: 0.978
Valid Total Loss: 0.055 | Valid F1 Score: 0.980
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.50it/s][A
2it [00:01,  1.46it/s][A
3it [00:02,  1.46it/s][A
4it [00:02,  1.45it/s][A
5it [00:03,  1.45it/s][A
6it [00:04,  1.45it/s][A
7it [00:04,  1.45it/s][A
8it [00:05,  1.45it/s][A
9it [00:06,  1.45it/s][A
10it [00:06,  1.45it/s][A
11it [00:07,  1.45it/s][A
12it [00:08,  1.45it/s][A
13it [00:08,  1.45it/s][A
14it [00:09,  1.46it/s][A
15it [00:10,  1.46it/s][A
16it [00:10,  1.46it/s][A
17it [00:11,  1.46it/s][A
18it [00:12,  1.45it/s][A
19it [00:13,  1.45it/s][A
20it [00:13,  1.46it/s][A
21it [00:14,  1.45it/s][A
22it [00:15,  1.45it/s][A
23it [00:15,  1.45it/s][A
24it [00:16,  1.45it/s][A
25it [00:17,  1.45it/s][A
26it [00:17,  1.45it/s][A
27it [00:18,  1.45it/s][A
28it [00:19,  1.45it/s][A
29it [00:19,  1.45it/s][A
30it [00:20,  1.45it/s][A
31it [00:21,  1.45it/s][A
32it [00:22,  1.45it/s][A
33it [00:22,  1.45it/s][A
34it [00:23,  1.45it/s][A
35it [00:24,  1.45it/s][A
36it [00:24,  1.45it/s][A
37it [00:25,  


Checkpoint Model Saved!

Epoch: 04
Train Total Loss: 0.047 | Train F1 Score: 0.987
Valid Total Loss: 0.103 | Valid F1 Score: 0.978
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.50it/s][A
2it [00:01,  1.45it/s][A
3it [00:02,  1.45it/s][A
4it [00:02,  1.46it/s][A
5it [00:03,  1.46it/s][A
6it [00:04,  1.46it/s][A
7it [00:04,  1.46it/s][A
8it [00:05,  1.46it/s][A
9it [00:06,  1.45it/s][A
10it [00:06,  1.45it/s][A
11it [00:07,  1.45it/s][A
12it [00:08,  1.45it/s][A
13it [00:08,  1.45it/s][A
14it [00:09,  1.45it/s][A
15it [00:10,  1.45it/s][A
16it [00:11,  1.45it/s][A
17it [00:11,  1.45it/s][A
18it [00:12,  1.45it/s][A
19it [00:13,  1.45it/s][A
20it [00:13,  1.45it/s][A
21it [00:14,  1.45it/s][A
22it [00:15,  1.45it/s][A
23it [00:15,  1.45it/s][A
24it [00:16,  1.45it/s][A
25it [00:17,  1.45it/s][A
26it [00:17,  1.45it/s][A
27it [00:18,  1.45it/s][A
28it [00:19,  1.45it/s][A
29it [00:19,  1.45it/s][A
30it [00:20,  1.45it/s][A
31it [00:21,  1.45it/s][A
32it [00:22,  1.45it/s][A
33it [00:22,  1.45it/s][A
34it [00:23,  1.45it/s][A
35it [00:24,  1.45it/s][A
36it [00:24,  1.45it/s][A
37it [00:25,  


Checkpoint Model Saved!

Epoch: 05
Train Total Loss: 0.026 | Train F1 Score: 0.992
Valid Total Loss: 0.074 | Valid F1 Score: 0.978
--------------------





In [34]:
print(classification_report(best_target, best_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       247
           1       0.98      0.97      0.98       180
           2       0.99      0.97      0.98        73

    accuracy                           0.98       500
   macro avg       0.98      0.98      0.98       500
weighted avg       0.98      0.98      0.98       500



In [35]:
f1_score(best_target, best_pred, average='micro')

0.984

In [36]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [37]:
from glob import glob

In [38]:
for filepath in glob("*.pt"):
    print(filepath)

model_checkpoint_sci3.pt
model_checkpoint_sci4.pt
model_checkpoint_sci1.pt
model_checkpoint_sci5.pt
model_least_loss.pt
model_checkpoint_sci2.pt


In [41]:
# !cp -r model_least_loss.pt /content/gdrive/My\ Drive/Colab\ Notebooks/NLP\ Final\ Project/

In [42]:
# Copy every saved ``*.pt`` file from the working directory into the mounted Drive folder.
# shutil.copy raises if a copy fails, so a failed upload cannot go unnoticed, and it
# needs no shell quoting of the file name.
DRIVE_MODEL_DIR = "/content/gdrive/My Drive/Colab Notebooks/NLP Final Project/"

for filepath in glob("*.pt"):
    shutil.copy(filepath, DRIVE_MODEL_DIR)
    # NOTE(review): pause between files; presumably gives the Drive mount time to
    # flush each large file — confirm it is still needed.
    time.sleep(10)