# Task 3: Classification of COVID19 tweets containing symptoms

In [5]:
!pip install -q transformers contractions imbalanced-learn ekphrasis

## 1. Import all the necessary libraries and data files

In [6]:
import numpy as np
import pandas as pd

import warnings
import torch
import torch.nn as nn
import time

from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from transformers import BertModel, BertTokenizerFast
# from transformers import RobertaTokenizerFast, RobertaModel
from transformers import AutoTokenizer, AutoModel
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm
from sklearn.metrics import f1_score

warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=None

In [7]:
# Paths of the tab-separated train / validation splits read by the next cell
train_filename, val_filename = "train.tsv", "valid.tsv"

In [8]:
# Read both tab-separated splits into DataFrames (train first, then validation)
train, validation = (
    pd.read_csv(path, sep="\t") for path in (train_filename, val_filename)
)

In [9]:
# Report the (rows, columns) shape of both splits
print(f"Shape of training data is {train.shape} and validation data is {validation.shape}")

Shape of training data is (9067, 3) and validation data is (500, 3)


In [10]:
# Preview the first five training rows, rendered with a caption through the pandas Styler
train.head().style.set_caption("Task 3: Train dataset")

Unnamed: 0,tweet_id,tweet,label
0,13729,A growing number of Covid-19 patients whose symptoms were initially mild are now facing mysterious long-term neurological problems https://t.co/If2SgRduuw,Lit-News_mentions
1,12399,"Medical experts advise that symptoms of the novel coronavirus include fever, shortness of breath, and stinky smelly pits and feet 😳🤪",Lit-News_mentions
2,20056,"@drdavidsamadi Hubby/I:same symptoms n November 2019 after a weekend trip 2 Vegas where bus loads of Chinese tourists.1 day fever,3 days sore throat,several weeks of fatigue.He's healthy,I'm not: diabetes,hypertension,obese, respiratory issues @ 53. No meds/pneumonia,we believe was COVID-19",Nonpersonal_reports
3,10175,"1/x In the April 11 BC briefing Dr. Bonnie Henry had mentioned that there's now reports of neurologic complications after COVID-19 infection, even during recovery. There's now anecdotal reporting of neurological manifestations from WUHAN patients in JAMA:https://t.co/7spTyk7l2M",Lit-News_mentions
4,12179,Major study PHOSP-COVID investigates health impacts of #COVID19 on hospitalised patients including #mentalhealth & neurological problems. Find out more https://t.co/JLTrz0BA7f @OxfordHealthNHS https://t.co/jZ2kPyPqmS,Lit-News_mentions


## 2. Prepare the data - Clean & Prepare for Model

In [11]:
# Drop the tweet identifier column; it is not referenced by any later cell.
# Re-binding (instead of `inplace=True`) together with errors="ignore" makes
# this cell idempotent: running it a second time no longer raises KeyError.
train = train.drop(columns=["tweet_id"], errors="ignore")
validation = validation.drop(columns=["tweet_id"], errors="ignore")

In [12]:
# Tweet normalisation pipeline, adapted from the ekphrasis README:
# https://github.com/cbaziotis/ekphrasis

text_processor = TextPreProcessor(
    # entity types that are replaced by a placeholder tag such as <url>,
    # <user> or <number> ('url' was listed twice; one entry is sufficient)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],

    # token properties that are wrapped in annotation tags,
    # e.g. <hashtag> ... </hashtag>, <allcaps> ... </allcaps>
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=False,  # no spell correction for elongated words

    # tokenizer callable: takes a string and returns a list of tokens;
    # SocialTokenizer is configured here to lowercase its output
    tokenizer=SocialTokenizer(lowercase=True, emojis=False).tokenize,

    # list of dictionaries used to replace tokens extracted from the text
    # with other expressions; more than one dictionary may be passed
    dicts=[emoticons]
)

Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


In [13]:
# Run every raw tweet through the ekphrasis pipeline and join the returned
# tokens back into a single space-separated string.
train['clean_tweets'] = train.tweet.map(
    lambda raw_tweet: " ".join(text_processor.pre_process_doc(raw_tweet))
)
validation['clean_tweets'] = validation.tweet.map(
    lambda raw_tweet: " ".join(text_processor.pre_process_doc(raw_tweet))
)

In [14]:
# First five training rows after pre-processing: label next to the cleaned tweet text
train[['label', 'clean_tweets']].head()

Unnamed: 0,label,clean_tweets
0,Lit-News_mentions,a growing number of covid - <number> patients whose symptoms were initially mild are now facing mysterious long - term neurological problems <url>
1,Lit-News_mentions,"medical experts advise that symptoms of the novel coronavirus include fever , shortness of breath , and stinky smelly pits and feet 😳 🤪"
2,Nonpersonal_reports,"<user> hubby / i : same symptoms n <date> after a weekend trip <number> vegas where bus loads of chinese tourists . <number> day fever , <number> days sore throat , several weeks of fatigue . he ' s healthy , i am not : diabetes , hypertension , obese , respiratory issues @ <number> . no meds / pneumonia , we believe was <allcaps> covid </allcaps> - <number>"
3,Lit-News_mentions,"<number> /x in the <date> bc briefing dr . bonnie henry had mentioned that there ' s now reports of neurologic complications after <allcaps> covid </allcaps> - <number> infection , even during recovery . there ' s now anecdotal reporting of neurological manifestations from <allcaps> wuhan </allcaps> patients in <allcaps> jama </allcaps> : <url>"
4,Lit-News_mentions,major study <allcaps> phosp </allcaps> - <allcaps> covid </allcaps> investigates health impacts of <hashtag> covid 19 </hashtag> on hospitalised patients including <hashtag> mental health </hashtag> & neurological problems . find out more <url> <user> <url>


In [15]:

# Training hyper-parameters: mini-batch size and number of passes over the training set
BATCH_SIZE, N_EPOCHS = 2, 5

In [16]:
# Load the tokenizer that ships with the COVID-Twitter-BERT checkpoint
# (the same checkpoint name is used for the model further down)
tokenizer = AutoTokenizer.from_pretrained("digitalepidemiologylab/covid-twitter-bert")

Downloading:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [17]:
# Tokenize train and validation data.
# The tokenizer is called directly: `batch_encode_plus` is deprecated in favour
# of `__call__`, which accepts a list of strings and the same keyword arguments.
# Each split is padded to its own longest sequence and truncated at 128 tokens.
encode_kwargs = dict(padding="longest", truncation=True, max_length=128, return_tensors="pt")
train_enc_ct = tokenizer(train.clean_tweets.to_list(), **encode_kwargs)
valid_enc_ct = tokenizer(validation.clean_tweets.to_list(), **encode_kwargs)

In [18]:
# Show which tensors the tokenizer returned for the training split
train_enc_ct.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [19]:
# The three encoded tensors are expected to share one (num_tweets, seq_len) shape
train_enc_ct.input_ids.shape, train_enc_ct.token_type_ids.shape, train_enc_ct.attention_mask.shape

(torch.Size([9067, 128]), torch.Size([9067, 128]), torch.Size([9067, 128]))

In [20]:
def get_dataloader_rob(encoding, target, shuffle=True, batch_size=None):
    """Wrap a tokenizer encoding and its labels in a ``DataLoader``.

    Args:
        encoding: object exposing ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` tensors (e.g. the tokenizer output).
        target: tensor of labels whose first dimension matches the encoding
            tensors.
        shuffle: when True (default, the original behaviour) samples are drawn
            with ``RandomSampler``; pass False to iterate in dataset order
            with ``SequentialSampler``, e.g. for evaluation.
        batch_size: samples per batch; ``None`` (default) falls back to the
            notebook-level ``BATCH_SIZE``.

    Returns:
        A ``DataLoader`` yielding
        ``(input_ids, token_type_ids, attention_mask, target)`` batches.
    """
    if batch_size is None:
        # Looked up lazily so an explicit batch_size never needs the global.
        batch_size = BATCH_SIZE

    data = TensorDataset(encoding.input_ids, encoding.token_type_ids,
                         encoding.attention_mask, target)
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

In [21]:
# Map the string class names to integer ids. The encoder is fitted on the
# training labels only and then re-used unchanged for the validation labels.
# NOTE: the `label` columns are overwritten, so re-running this cell would
# re-fit the encoder on already-encoded integers.
le = LabelEncoder()
train["label"] = le.fit_transform(train["label"])
validation["label"] = le.transform(validation["label"])

In [22]:
# Class names in encoded order: position i holds the string label for integer id i
list(le.classes_)

['Lit-News_mentions', 'Nonpersonal_reports', 'Self_reports']

In [23]:
# Pair each encoded split with its integer labels. Both loaders come from
# get_dataloader_rob, so the validation loader is randomly sampled as well.
train_dataloader_ct = get_dataloader_rob(train_enc_ct, torch.tensor(train.label.tolist()))
valid_dataloader_ct = get_dataloader_rob(valid_enc_ct, torch.tensor(validation.label.tolist()))

In [24]:
# Pull a single batch and print the shape of each of its four tensors
# (input ids, token type ids, attention mask, labels) as a quick sanity check.
for batch in train_dataloader_ct:
    input_ids, type_ids, attn_mask, target = batch
    print(*(tensor.shape for tensor in batch))
    break

torch.Size([2, 128]) torch.Size([2, 128]) torch.Size([2, 128]) torch.Size([2])


## 3. Model Building - COVID-Twitter-BERT

In [25]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [26]:
class CTBERTclassifier(nn.Module):
    """Transformer encoder followed by a single linear classification layer.

    Args:
        transformer: encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and
            must return a mapping containing ``"pooler_output"``.
        num_labels: number of output classes (default 3, the original
            hard-coded value).

    The input width of the linear layer is read from
    ``transformer.config.hidden_size`` when available, so encoders of other
    sizes work too; it falls back to 1024, the previously hard-coded width.
    """

    def __init__(self, transformer, num_labels=3):
        super().__init__()
        self.transformer = transformer
        config = getattr(transformer, "config", None)
        hidden_size = getattr(config, "hidden_size", 1024)
        # Attribute name kept as `linear_layer` so saved state_dicts still load.
        self.linear_layer = nn.Linear(hidden_size, num_labels)

    def forward(self, ip_ids, type_ids, attn_mask):
        """Return the class logits computed from the encoder's pooled output."""
        op = self.transformer(input_ids=ip_ids,
                              token_type_ids=type_ids,
                              attention_mask=attn_mask)
        return self.linear_layer(op["pooler_output"])

In [27]:
def count_parameter(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [28]:
# Load the pretrained encoder, wrap it with the classification head and move
# the whole model to the selected device before reporting its size.
transformer_ct = AutoModel.from_pretrained("digitalepidemiologylab/covid-twitter-bert")
model_ct = CTBERTclassifier(transformer_ct)
model_ct = model_ct.to(device)
print(f"The model has {count_parameter(model_ct):,} trainable parameters.")

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at digitalepidemiologylab/covid-twitter-bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The model has 335,144,963 trainable parameters.


In [29]:
# Loss function and optimizer shared by the training and evaluation functions
criterion_ct = nn.CrossEntropyLoss()
optim_ct = torch.optim.AdamW(params=model_ct.parameters(), lr=2e-5)

In [30]:
def train_model_ct(model, dataloader, clip=1.0):
    """Run one training epoch over ``dataloader``.

    Relies on the notebook-level ``optim_ct`` (optimizer), ``criterion_ct``
    (loss) and ``device``.

    Args:
        model: module called as ``model(input_ids, type_ids, attn_mask)`` and
            returning per-class scores.
        dataloader: iterable of ``(input_ids, type_ids, attn_mask, labels)``
            tensor batches.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``(mean loss per batch, micro-averaged F1 over all samples seen)``.

    Raises:
        ValueError: if ``dataloader`` yields no batches (previously an opaque
            ZeroDivisionError).
    """
    model.train()

    epoch_loss = 0
    batch_num = 0
    pred, target = [], []

    # Iterate the loader itself rather than enumerate(...): tqdm can then read
    # len(dataloader) and display a total / ETA instead of a bare counter.
    for batch in tqdm(dataloader):
        input_ids, type_ids, attn_mask, y = (row.to(device) for row in batch)

        optim_ct.zero_grad()
        output = model(input_ids, type_ids, attn_mask)
        loss = criterion_ct(output, y)
        loss.backward()

        # Clip before the optimizer step to bound the size of the update.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim_ct.step()

        epoch_loss += loss.item()
        batch_num += 1
        pred.extend(torch.argmax(output, -1).tolist())
        target.extend(y.tolist())

    if batch_num == 0:
        raise ValueError("train_model_ct: dataloader yielded no batches")

    return epoch_loss/batch_num, f1_score(target, pred, average='micro')

def evaluate_ct(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Relies on the notebook-level ``criterion_ct`` (loss) and ``device``.

    Args:
        model: module called as ``model(input_ids, type_ids, attn_mask)`` and
            returning per-class scores.
        dataloader: iterable of ``(input_ids, type_ids, attn_mask, labels)``
            tensor batches.

    Returns:
        ``(mean loss per batch, micro-averaged F1, predictions, targets)``.
        ``predictions`` and ``targets`` are plain lists in the order the
        loader yielded the samples, aligned element by element.

    Raises:
        ValueError: if ``dataloader`` yields no batches (previously an opaque
            ZeroDivisionError).
    """
    model.eval()

    epoch_loss = 0
    batch_num = 0
    pred, target = [], []

    # A single no_grad context for the whole pass: no autograd graph is needed.
    with torch.no_grad():
        for batch in dataloader:
            input_ids, type_ids, attn_mask, y = (row.to(device) for row in batch)

            output = model(input_ids, type_ids, attn_mask)
            loss = criterion_ct(output, y)

            epoch_loss += loss.item()
            batch_num += 1
            pred.extend(torch.argmax(output, -1).tolist())
            target.extend(y.tolist())

    if batch_num == 0:
        raise ValueError("evaluate_ct: dataloader yielded no batches")

    return epoch_loss/batch_num, f1_score(target, pred, average='micro'), pred, target

In [31]:
# Bookkeeping for the run: lowest validation loss so far and per-epoch loss histories
best_valid_loss_ct = float("inf")
total_train_loss_ct = []
total_valid_loss_ct = []

In [32]:
# Main training loop: one training pass and one validation pass per epoch.
# NOTE(review): in the recorded run only epoch 1 improved the validation loss
# (0.184); from epoch 2 on the train loss sits near 1.11 and train F1 near
# 0.45, i.e. training collapsed. Possibly the learning rate / tiny batch size
# without warm-up — TODO investigate before trusting later checkpoints.
for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_f1_score = train_model_ct(model_ct, train_dataloader_ct)
    total_train_loss_ct.append(train_loss)

    valid_loss, valid_f1_score, pred, target = evaluate_ct(model_ct, valid_dataloader_ct)
    total_valid_loss_ct.append(valid_loss)

    # Keep the weights and predictions of the epoch with the lowest validation
    # loss; best_pred / best_target feed the classification report afterwards.
    if valid_loss < best_valid_loss_ct:
        best_valid_loss_ct = valid_loss
        best_pred, best_target = pred, target
        torch.save(model_ct.state_dict(), "model_least_loss_rob.pt")
        print("\nBest Model Saved!!\n")
    
    # A checkpoint is written every epoch regardless of quality. The file name
    # uses the 0-based epoch index, while the log line below prints epoch + 1.
    torch.save(model_ct.state_dict(), "model_checkpoint_rob" + str(epoch) + ".pt")
    print("Checkpoint Model Saved!\n")

    print(f"Epoch: {epoch+1:02}")
    print(f"Train Total Loss: {train_loss:.3f} | Train F1 Score: {train_f1_score:.3f}")
    print(f"Valid Total Loss: {valid_loss:.3f} | Valid F1 Score: {valid_f1_score:.3f}")
    print("-"*20)

  0%|          | 0/5 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  1.38it/s][A
2it [00:01,  2.11it/s][A
3it [00:01,  2.54it/s][A
4it [00:01,  2.80it/s][A
5it [00:01,  2.96it/s][A
6it [00:02,  3.07it/s][A
7it [00:02,  3.16it/s][A
8it [00:02,  3.21it/s][A
9it [00:03,  3.24it/s][A
10it [00:03,  3.29it/s][A
11it [00:03,  3.31it/s][A
12it [00:04,  3.31it/s][A
13it [00:04,  3.32it/s][A
14it [00:04,  3.32it/s][A
15it [00:04,  3.33it/s][A
16it [00:05,  3.32it/s][A
17it [00:05,  3.33it/s][A
18it [00:05,  3.33it/s][A
19it [00:06,  3.33it/s][A
20it [00:06,  3.32it/s][A
21it [00:06,  3.33it/s][A
22it [00:07,  3.32it/s][A
23it [00:07,  3.33it/s][A
24it [00:07,  3.33it/s][A
25it [00:07,  3.32it/s][A
26it [00:08,  3.31it/s][A
27it [00:08,  3.30it/s][A
28it [00:08,  3.31it/s][A
29it [00:09,  3.32it/s][A
30it [00:09,  3.33it/s][A
31it [00:09,  3.33it/s][A
32it [00:10,  3.33it/s][A
33it [00:10,  3.33it/s][A
34it [00:10,  3.33it/s][A
35it [00:10,  3.33it/s][A
36it


Best Model Saved!!



 20%|██        | 1/5 [24:41<1:38:47, 1481.97s/it]

Checkpoint Model Saved!

Epoch: 01
Train Total Loss: 0.297 | Train F1 Score: 0.946
Valid Total Loss: 0.184 | Valid F1 Score: 0.964
--------------------



0it [00:00, ?it/s][A
1it [00:00,  2.52it/s][A
2it [00:00,  2.88it/s][A
3it [00:01,  3.00it/s][A
4it [00:01,  3.06it/s][A
5it [00:01,  3.10it/s][A
6it [00:01,  3.12it/s][A
7it [00:02,  3.14it/s][A
8it [00:02,  3.14it/s][A
9it [00:02,  3.13it/s][A
10it [00:03,  3.16it/s][A
11it [00:03,  3.15it/s][A
12it [00:03,  3.13it/s][A
13it [00:04,  3.13it/s][A
14it [00:04,  3.10it/s][A
15it [00:04,  3.10it/s][A
16it [00:05,  3.11it/s][A
17it [00:05,  3.11it/s][A
18it [00:05,  3.11it/s][A
19it [00:06,  3.12it/s][A
20it [00:06,  3.13it/s][A
21it [00:06,  3.12it/s][A
22it [00:07,  3.12it/s][A
23it [00:07,  3.13it/s][A
24it [00:07,  3.12it/s][A
25it [00:08,  3.12it/s][A
26it [00:08,  3.11it/s][A
27it [00:08,  3.10it/s][A
28it [00:09,  3.12it/s][A
29it [00:09,  3.12it/s][A
30it [00:09,  3.12it/s][A
31it [00:09,  3.11it/s][A
32it [00:10,  3.22it/s][A
33it [00:10,  3.19it/s][A
34it [00:10,  3.17it/s][A
35it [00:11,  3.16it/s][A
36it [00:11,  3.14it/s][A
37it [00:11,  

Checkpoint Model Saved!

Epoch: 02
Train Total Loss: 1.109 | Train F1 Score: 0.448
Valid Total Loss: 1.122 | Valid F1 Score: 0.360
--------------------



0it [00:00, ?it/s][A
1it [00:00,  2.47it/s][A
2it [00:00,  2.81it/s][A
3it [00:01,  2.95it/s][A
4it [00:01,  3.00it/s][A
5it [00:01,  3.05it/s][A
6it [00:02,  3.07it/s][A
7it [00:02,  3.07it/s][A
8it [00:02,  3.10it/s][A
9it [00:02,  3.10it/s][A
10it [00:03,  3.11it/s][A
11it [00:03,  3.12it/s][A
12it [00:03,  3.10it/s][A
13it [00:04,  3.11it/s][A
14it [00:04,  3.12it/s][A
15it [00:04,  3.11it/s][A
16it [00:05,  3.11it/s][A
17it [00:05,  3.12it/s][A
18it [00:05,  3.12it/s][A
19it [00:06,  3.12it/s][A
20it [00:06,  3.13it/s][A
21it [00:06,  3.13it/s][A
22it [00:07,  3.11it/s][A
23it [00:07,  3.12it/s][A
24it [00:07,  3.11it/s][A
25it [00:08,  3.10it/s][A
26it [00:08,  3.10it/s][A
27it [00:08,  3.11it/s][A
28it [00:09,  3.10it/s][A
29it [00:09,  3.11it/s][A
30it [00:09,  3.11it/s][A
31it [00:10,  3.11it/s][A
32it [00:10,  3.11it/s][A
33it [00:10,  3.12it/s][A
34it [00:11,  3.12it/s][A
35it [00:11,  3.11it/s][A
36it [00:11,  3.12it/s][A
37it [00:11,  

Checkpoint Model Saved!

Epoch: 03
Train Total Loss: 1.116 | Train F1 Score: 0.449
Valid Total Loss: 1.052 | Valid F1 Score: 0.494
--------------------



0it [00:00, ?it/s][A
1it [00:00,  2.78it/s][A
2it [00:00,  2.98it/s][A
3it [00:00,  3.05it/s][A
4it [00:01,  3.07it/s][A
5it [00:01,  3.09it/s][A
6it [00:01,  3.09it/s][A
7it [00:02,  3.13it/s][A
8it [00:02,  3.12it/s][A
9it [00:02,  3.13it/s][A
10it [00:03,  3.13it/s][A
11it [00:03,  3.14it/s][A
12it [00:03,  3.14it/s][A
13it [00:04,  3.14it/s][A
14it [00:04,  3.15it/s][A
15it [00:04,  3.14it/s][A
16it [00:05,  3.13it/s][A
17it [00:05,  3.12it/s][A
18it [00:05,  3.11it/s][A
19it [00:06,  3.12it/s][A
20it [00:06,  3.12it/s][A
21it [00:06,  3.12it/s][A
22it [00:07,  3.12it/s][A
23it [00:07,  3.11it/s][A
24it [00:07,  3.12it/s][A
25it [00:08,  3.13it/s][A
26it [00:08,  3.12it/s][A
27it [00:08,  3.13it/s][A
28it [00:08,  3.13it/s][A
29it [00:09,  3.14it/s][A
30it [00:09,  3.12it/s][A
31it [00:09,  3.12it/s][A
32it [00:10,  3.12it/s][A
33it [00:10,  3.11it/s][A
34it [00:10,  3.09it/s][A
35it [00:11,  3.09it/s][A
36it [00:11,  3.07it/s][A
37it [00:11,  

Checkpoint Model Saved!

Epoch: 04
Train Total Loss: 1.123 | Train F1 Score: 0.446
Valid Total Loss: 1.060 | Valid F1 Score: 0.494
--------------------



0it [00:00, ?it/s][A
1it [00:00,  2.33it/s][A
2it [00:00,  2.76it/s][A
3it [00:01,  2.92it/s][A
4it [00:01,  2.98it/s][A
5it [00:01,  3.02it/s][A
6it [00:02,  3.05it/s][A
7it [00:02,  3.09it/s][A
8it [00:02,  3.09it/s][A
9it [00:02,  3.11it/s][A
10it [00:03,  3.11it/s][A
11it [00:03,  3.13it/s][A
12it [00:03,  3.11it/s][A
13it [00:04,  3.11it/s][A
14it [00:04,  3.12it/s][A
15it [00:04,  3.13it/s][A
16it [00:05,  3.13it/s][A
17it [00:05,  3.13it/s][A
18it [00:05,  3.12it/s][A
19it [00:06,  3.12it/s][A
20it [00:06,  3.11it/s][A
21it [00:06,  3.11it/s][A
22it [00:07,  3.10it/s][A
23it [00:07,  3.11it/s][A
24it [00:07,  3.10it/s][A
25it [00:08,  3.09it/s][A
26it [00:08,  3.11it/s][A
27it [00:08,  3.11it/s][A
28it [00:09,  3.12it/s][A
29it [00:09,  3.12it/s][A
30it [00:09,  3.13it/s][A
31it [00:10,  3.11it/s][A
32it [00:10,  3.12it/s][A
33it [00:10,  3.13it/s][A
34it [00:11,  3.14it/s][A
35it [00:11,  3.13it/s][A
36it [00:11,  3.14it/s][A
37it [00:11,  

Checkpoint Model Saved!

Epoch: 05
Train Total Loss: 1.113 | Train F1 Score: 0.449
Valid Total Loss: 1.129 | Valid F1 Score: 0.494
--------------------





In [33]:
# Per-class precision / recall / F1 for the epoch with the lowest validation loss
print(classification_report(y_true=best_target, y_pred=best_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       247
           1       0.99      0.95      0.97       180
           2       1.00      0.88      0.93        73

    accuracy                           0.96       500
   macro avg       0.98      0.94      0.96       500
weighted avg       0.97      0.96      0.96       500



In [34]:
# Micro-averaged F1 of the best epoch; for single-label multi-class
# predictions this equals the accuracy shown in the report above.
# (f1_score is already imported in the top import cell; this import is redundant.)
from sklearn.metrics import f1_score
f1_score(y_true=best_target, y_pred=best_pred, average="micro")

0.964

In [35]:
from google.colab import drive

In [37]:
# Mount Google Drive under /content/gdrive so the checkpoints can be copied there
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [38]:
from glob import glob

In [40]:
import shutil
from glob import glob

# Copy every saved checkpoint (*.pt in the working directory) to Google Drive.
# shutil.copy replaces the `!cp -r $filepath ...` shell escape: the unquoted
# shell variable broke on file names containing spaces, `-r` is meaningless for
# single files, and a failed copy now raises instead of passing silently.
drive_dir = "/content/gdrive/My Drive/Colab Notebooks/"
for filepath in glob("*.pt"):
    shutil.copy(filepath, drive_dir)
    # Pause kept from the original cell; presumably gives the Drive mount time
    # to flush each large file — TODO confirm it is still needed.
    time.sleep(10)

In [None]:
# !cp -r model_least_loss_rob.pt /content/gdrive/My\ Drive/Colab\ Notebooks/NLP\ Final\ Project/

In [None]:
# with open('/content/gdrive/My Drive/', 'w') as handle:
#     handle.write()