# Task 3: Classification of COVID19 tweets containing symptoms

In [1]:
# Install the third-party packages used by this notebook (-q keeps pip's own output short).
# NOTE(review): versions are unpinned, and `contractions` / `imbalanced-learn` are not
# imported in any of the cells visible here — confirm they are still needed.
!pip install -q transformers contractions imbalanced-learn ekphrasis

[K     |████████████████████████████████| 4.0 MB 4.3 MB/s 
[K     |████████████████████████████████| 80 kB 6.8 MB/s 
[K     |████████████████████████████████| 77 kB 6.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 38.2 MB/s 
[K     |████████████████████████████████| 880 kB 49.3 MB/s 
[K     |████████████████████████████████| 596 kB 46.7 MB/s 
[K     |████████████████████████████████| 287 kB 54.2 MB/s 
[K     |████████████████████████████████| 106 kB 50.1 MB/s 
[K     |████████████████████████████████| 45 kB 2.9 MB/s 
[K     |████████████████████████████████| 53 kB 1.7 MB/s 
[K     |████████████████████████████████| 96 kB 5.8 MB/s 
[?25h  Building wheel for ekphrasis (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask 1.1.4 requires click<

## 1. Import all the necessary libraries and data files

In [4]:
import numpy as np
import pandas as pd

import warnings
import torch
import torch.nn as nn
import time

from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from transformers import BertModel, BertTokenizerFast
# from transformers import RobertaTokenizerFast, RobertaModel
from transformers import AutoTokenizer, AutoModel
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm
from sklearn.metrics import f1_score

# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")
# Never truncate wide text columns, so full tweets are visible in DataFrame output.
pd.set_option("display.max_colwidth", None)

In [5]:
# Input files for the two splits, given relative to the notebook's working directory.
train_filename = "train.tsv"
val_filename = "valid.tsv"

In [10]:
# Load data
# The training file is read with explicit column names, whereas the validation
# file takes its column names from its own first row.
TRAIN_COLUMNS = ["tweet_id", "user_id", "tweet", "label"]
train = pd.read_csv(train_filename, sep="\t", names=TRAIN_COLUMNS)
validation = pd.read_csv(val_filename, sep="\t")

In [11]:
# Report the (rows, columns) shape of each split as a quick check that loading worked.
print(f"Shape of training data is {train.shape} and validation data is {validation.shape}")

Shape of training data is (6465, 4) and validation data is (716, 4)


In [12]:
# Train top 5 rows, rendered through the pandas Styler so the table carries a caption.
# NOTE(review): the caption says "Task 4" while the notebook title says "Task 3" — confirm which is intended.
train.head().style.set_caption("Task 4: Train dataset")

Unnamed: 0,tweet_id,user_id,tweet,label
0,1239172732690014208,2391447188,We’re parking at the airport and my mom rolled down the window to speak to an attendant and my dad immediately said “we have the coronavirus sir”,0
1,1223737201030246402,1200539436167159809,I really didn’t expect this will go wide this way. I hope safety & health for all people of #Chine & whole world. We are just trying to show some support & respect to them as much we can especially doctors who bravely facing the dirty #coronaVirus.,0
2,1239385333319389185,838382730,"For those who believe they are immortal and continue to go out to the park without paying attention to the order to remain at home, these are the x-rays of a 28-year-old boy intubated in the ICU in my hospital for #coronavirus. Hint: the lungs are black, white is pneumonia",1
3,1236209435241938945,780855138,My flight from Jordan back to the US stops in Paris 😂 will I be quarantined? Stay tuned to find out 😂😂 #coronavirus,0
4,1233855551605440514,337103373,I went to the movies and the air was on. Now I'm out to eat and Olive Garden has the air on. I see these establishments are doing their best to fight the coronavirus.,0


## 2. Prepare the data - Clean & Prepare for Model

In [13]:
# Drop unwanted columns
# Reassigning (instead of dropping in place) together with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op rather than a KeyError
# for the already-removed column.
train = train.drop(columns=["tweet_id"], errors="ignore")
validation = validation.drop(columns=["tweet_id"], errors="ignore")

In [14]:
# Referred from: https://github.com/cbaziotis/ekphrasis
# Builds the ekphrasis pre-processing pipeline; its output is what later gets
# stored in the `clean_tweets` column and passed to the transformer tokenizer.

text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' appears twice in this list; the repeat looks redundant.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # Tokenizer callable: it should take a string as input and return a list of
    # tokens. SocialTokenizer is used here (configured with lowercase=True and
    # emojis=False); any callable with the same contract can be passed instead.
    tokenizer=SocialTokenizer(lowercase=True, emojis=False).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. More than one dictionary can be passed.
    dicts=[emoticons]
)

Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


In [15]:
def _clean(tweet):
    """Run one raw tweet through ``text_processor`` and re-join its tokens with single spaces."""
    return " ".join(text_processor.pre_process_doc(tweet))

# Apply the same cleaning to both splits and keep the result next to the raw text.
train['clean_tweets'] = list(map(_clean, train.tweet))
validation['clean_tweets'] = list(map(_clean, validation.tweet))

In [16]:
# Train top 5 rows after pre-processing (label shown next to the cleaned tweet text)
train[['label', 'clean_tweets']].head()

Unnamed: 0,label,clean_tweets
0,0,we ’ re parking at the airport and my mom rolled down the window to speak to an attendant and my dad immediately said “ we have the coronavirus sir ”
1,0,i really didn ’ t expect this will go wide this way . i hope safety & health for all people of <hashtag> chine </hashtag> & whole world . we are just trying to show some support & respect to them as much we can especially doctors who bravely facing the dirty <hashtag> corona virus </hashtag> .
2,1,"for those who believe they are immortal and continue to go out to the park without paying attention to the order to remain at home , these are the x - rays of a <number> - year - old boy intubated in the <allcaps> icu </allcaps> in my hospital for <hashtag> coronavirus </hashtag> . hint : the lungs are black , white is pneumonia"
3,0,my flight from jordan back to the us stops in paris 😂 will i be quarantined ? stay tuned to find out 😂 😂 <hashtag> coronavirus </hashtag>
4,0,i went to the movies and the air was on . now i am out to eat and olive garden has the air on . i see these establishments are doing their best to fight the coronavirus .


In [17]:

# Training hyper-parameters: BATCH_SIZE is the mini-batch size used when the
# DataLoaders are built; N_EPOCHS is the number of iterations of the training loop.
BATCH_SIZE = 2
N_EPOCHS = 5

In [18]:
# Define BERT tokenizer
# Loaded from the COVID-Twitter-BERT checkpoint on the Hugging Face hub — the same
# checkpoint name the encoder weights are loaded from further down.
tokenizer = AutoTokenizer.from_pretrained("digitalepidemiologylab/covid-twitter-bert")

Downloading:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [19]:
# Tokenize train and validation data
# Both splits share one set of options: pad to the longest sequence of the call,
# truncate to at most 128 tokens, and return PyTorch tensors.
_encode_options = dict(padding="longest", truncation=True, max_length=128, return_tensors="pt")
train_enc_ct = tokenizer.batch_encode_plus(train.clean_tweets.to_list(), **_encode_options)
valid_enc_ct = tokenizer.batch_encode_plus(validation.clean_tweets.to_list(), **_encode_options)

In [20]:
# List the entries the tokenizer produced for the training split.
train_enc_ct.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [21]:
# Compare the shapes of the three encoded tensors; they are expected to match.
train_enc_ct.input_ids.shape, train_enc_ct.token_type_ids.shape, train_enc_ct.attention_mask.shape

(torch.Size([6465, 128]), torch.Size([6465, 128]), torch.Size([6465, 128]))

In [22]:
def get_dataloader_rob(encoding, target, shuffle=True, batch_size=None):
    """Wrap tokenizer output and labels in a batched ``DataLoader``.

    Args:
        encoding: Tokenizer output exposing ``input_ids``, ``token_type_ids``
            and ``attention_mask`` tensors as attributes.
        target: Tensor of labels, one per encoded example.
        shuffle: When True (the default, which matches the previous behaviour)
            examples are drawn with a ``RandomSampler``. Pass False to iterate
            in dataset order with a ``SequentialSampler``, e.g. for evaluation.
        batch_size: Number of examples per batch. ``None`` (the default) falls
            back to the notebook-level ``BATCH_SIZE`` constant.

    Returns:
        A ``DataLoader`` whose batches unpack as
        ``input_ids, token_type_ids, attention_mask, target``.
    """
    data = TensorDataset(encoding.input_ids, encoding.token_type_ids, encoding.attention_mask, target)
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    if batch_size is None:
        # Resolved at call time so the notebook constant can be changed between calls.
        batch_size = BATCH_SIZE
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

In [23]:
# Transform the target variable
# The encoder is fitted on the training labels only; the resulting mapping is
# then applied to both splits.

le = LabelEncoder()
le.fit(train.label)
train["label"] = le.transform(train["label"])
validation["label"] = le.transform(validation["label"])

In [24]:
# Show the label values the encoder learned from the training split.
list(le.classes_)

[0, 1]

In [25]:
# Turn each split's encoded labels into a tensor, then pair them with the tokenised inputs.
train_labels = torch.tensor(train['label'].to_list())
valid_labels = torch.tensor(validation['label'].to_list())
train_dataloader_ct = get_dataloader_rob(train_enc_ct, train_labels)
valid_dataloader_ct = get_dataloader_rob(valid_enc_ct, valid_labels)

In [26]:
# Sanity check that the tensors returned by the dataloader are correct
# Only the first batch is pulled; the shape of each of its four tensors is printed.
batch = next(iter(train_dataloader_ct), None)
if batch is not None:
    input_ids, type_ids, attn_mask, target = batch
    print(input_ids.shape, type_ids.shape, attn_mask.shape, target.shape)

torch.Size([2, 128]) torch.Size([2, 128]) torch.Size([2, 128]) torch.Size([2])


## 3. Model Building - COVID-Twitter-BERT (CT-BERT)

In [27]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [28]:
class CTBERTclassifier(nn.Module):
    """Sequence classifier: a transformer encoder followed by one linear layer.

    The linear layer maps the encoder's ``pooler_output`` to class logits.

    Args:
        transformer: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and its
            result must be indexable by ``"pooler_output"``.
        num_classes: Number of output logits. Defaults to 2, the value that
            was previously hard-coded.
    """

    def __init__(self, transformer, num_classes=2):
        super().__init__()
        self.transformer = transformer
        # Size the head from the encoder's own config when it exposes one, so
        # encoders of a different width work too; 1024 (the previously
        # hard-coded width) stays as the fallback.
        hidden_size = getattr(getattr(transformer, "config", None), "hidden_size", 1024)
        self.linear_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, ip_ids, type_ids, attn_mask):
        """Return the class logits for a batch of encoded sequences.

        Args:
            ip_ids: Forwarded to the encoder as ``input_ids``.
            type_ids: Forwarded to the encoder as ``token_type_ids``.
            attn_mask: Forwarded to the encoder as ``attention_mask``.

        Returns:
            Output of the linear layer applied to the encoder's ``pooler_output``.
        """
        op = self.transformer(input_ids=ip_ids,
                              token_type_ids=type_ids,
                              attention_mask=attn_mask)
        return self.linear_layer(op["pooler_output"])

In [29]:
def count_parameter(model):
    """Return the total number of scalar values across the parameters of ``model`` that require gradients."""
    trainable_total = 0
    for weight in model.parameters():
        if weight.requires_grad:
            trainable_total += weight.numel()
    return trainable_total

In [30]:
# Load the pretrained encoder, wrap it in the classifier and move it to the selected device.
transformer_ct = AutoModel.from_pretrained("digitalepidemiologylab/covid-twitter-bert")
model_ct = CTBERTclassifier(transformer_ct)
model_ct = model_ct.to(device)
n_trainable = count_parameter(model_ct)
print(f"The model has {n_trainable:,} trainable parameters.")

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at digitalepidemiologylab/covid-twitter-bert were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The model has 335,143,938 trainable parameters.


In [31]:
# Define the loss function and the optimizer:
# cross-entropy over the classifier's logits, and AdamW over all parameters of model_ct.
criterion_ct = torch.nn.CrossEntropyLoss()
optim_ct = torch.optim.AdamW(model_ct.parameters(), lr = 2e-5)

In [32]:
def train_model_ct(model, dataloader, clip=1.0):
    """Run one training epoch and report the mean batch loss and micro-F1.

    Relies on the notebook-level ``optim_ct``, ``criterion_ct`` and ``device``.

    Args:
        model: Classifier called as ``model(input_ids, type_ids, attn_mask)``.
        dataloader: Iterable of ``(input_ids, type_ids, attn_mask, y)`` batches.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``(mean_loss, micro_f1)``. ``mean_loss`` is the sum of the batch losses
        divided by the number of batches. ``micro_f1`` is computed over every
        prediction made during the epoch.
    """
    model.train()

    epoch_loss = 0
    batch_num = 0
    pred, target = [], []

    # Wrap the dataloader itself rather than enumerate(...): tqdm can then read
    # its length and show a total and ETA instead of a bare iteration counter.
    for batch in tqdm(dataloader):
        input_ids, type_ids, attn_mask, y = (row.to(device) for row in batch)

        optim_ct.zero_grad()
        output = model(input_ids, type_ids, attn_mask)
        loss = criterion_ct(output, y)
        loss.backward()

        # Clip before the optimizer step so the update uses the clipped gradients.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim_ct.step()

        epoch_loss += loss.item()
        batch_num += 1
        pred.extend(torch.argmax(output, -1).tolist())
        target.extend(y.tolist())

    return epoch_loss/batch_num, f1_score(target, pred, average='micro')

def evaluate_ct(model, dataloader):
    """Score ``model`` on ``dataloader`` with gradient tracking disabled.

    Relies on the notebook-level ``criterion_ct`` and ``device``.

    Args:
        model: Classifier called as ``model(input_ids, type_ids, attn_mask)``.
        dataloader: Iterable of ``(input_ids, type_ids, attn_mask, y)`` batches.

    Returns:
        ``(mean_loss, micro_f1, pred, target)``. ``mean_loss`` is the sum of the
        batch losses divided by the number of batches. ``pred`` and ``target``
        are flat lists of predicted and true labels in iteration order.
    """
    model.eval()

    running_loss, n_batches = 0, 0
    pred, target = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, type_ids, attn_mask, y = (row.to(device) for row in batch)
            logits = model(input_ids, type_ids, attn_mask)

            running_loss += criterion_ct(logits, y).item()
            n_batches += 1
            pred += logits.argmax(dim=-1).tolist()
            target += y.tolist()

    return running_loss/n_batches, f1_score(target, pred, average='micro'), pred, target

In [33]:
# Start the "best so far" loss at +inf so that any finite validation loss replaces it.
best_valid_loss_ct = float("inf")
# Per-epoch loss histories, filled in by the training loop.
total_train_loss_ct = []
total_valid_loss_ct = []

In [34]:
for epoch in tqdm(range(N_EPOCHS)):
    # One pass over the training data, then one pass over the validation data.
    train_loss, train_f1_score = train_model_ct(model_ct, train_dataloader_ct)
    total_train_loss_ct.append(train_loss)

    valid_loss, valid_f1_score, pred, target = evaluate_ct(model_ct, valid_dataloader_ct)
    total_valid_loss_ct.append(valid_loss)

    # Keep the weights and validation predictions of the lowest-loss epoch seen so far.
    if best_valid_loss_ct > valid_loss:
        best_valid_loss_ct = valid_loss
        best_pred, best_target = pred, target
        torch.save(model_ct.state_dict(), "model_least_loss_rob.pt")
        print("\nBest Model Saved!!\n")

    # A checkpoint is written every epoch regardless; its file name carries the
    # zero-based loop index, whereas the printed epoch number below is one-based.
    torch.save(model_ct.state_dict(), f"model_checkpoint_rob{epoch}.pt")
    print("Checkpoint Model Saved!\n")

    print(
        f"Epoch: {epoch+1:02}",
        f"Train Total Loss: {train_loss:.3f} | Train F1 Score: {train_f1_score:.3f}",
        f"Valid Total Loss: {valid_loss:.3f} | Valid F1 Score: {valid_f1_score:.3f}",
        "-"*20,
        sep="\n",
    )

  0%|          | 0/5 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  1.15it/s][A
2it [00:01,  1.40it/s][A
3it [00:02,  1.50it/s][A
4it [00:02,  1.56it/s][A
5it [00:03,  1.59it/s][A
6it [00:03,  1.61it/s][A
7it [00:04,  1.62it/s][A
8it [00:05,  1.63it/s][A
9it [00:05,  1.64it/s][A
10it [00:06,  1.64it/s][A
11it [00:06,  1.64it/s][A
12it [00:07,  1.65it/s][A
13it [00:08,  1.64it/s][A
14it [00:08,  1.65it/s][A
15it [00:09,  1.65it/s][A
16it [00:09,  1.65it/s][A
17it [00:10,  1.65it/s][A
18it [00:11,  1.65it/s][A
19it [00:11,  1.65it/s][A
20it [00:12,  1.65it/s][A
21it [00:12,  1.65it/s][A
22it [00:13,  1.65it/s][A
23it [00:14,  1.65it/s][A
24it [00:14,  1.65it/s][A
25it [00:15,  1.65it/s][A
26it [00:16,  1.65it/s][A
27it [00:16,  1.64it/s][A
28it [00:17,  1.64it/s][A
29it [00:17,  1.65it/s][A
30it [00:18,  1.64it/s][A
31it [00:19,  1.65it/s][A
32it [00:19,  1.65it/s][A
33it [00:20,  1.64it/s][A
34it [00:20,  1.64it/s][A
35it [00:21,  1.64it/s][A
36it


Best Model Saved!!



 20%|██        | 1/5 [34:02<2:16:10, 2042.68s/it]

Checkpoint Model Saved!

Epoch: 01
Train Total Loss: 0.575 | Train F1 Score: 0.871
Valid Total Loss: 0.469 | Valid F1 Score: 0.895
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.41it/s][A
2it [00:01,  1.53it/s][A
3it [00:01,  1.57it/s][A
4it [00:02,  1.60it/s][A
5it [00:03,  1.61it/s][A
6it [00:03,  1.62it/s][A
7it [00:04,  1.63it/s][A
8it [00:04,  1.63it/s][A
9it [00:05,  1.63it/s][A
10it [00:06,  1.63it/s][A
11it [00:06,  1.63it/s][A
12it [00:07,  1.63it/s][A
13it [00:08,  1.63it/s][A
14it [00:08,  1.63it/s][A
15it [00:09,  1.63it/s][A
16it [00:09,  1.63it/s][A
17it [00:10,  1.63it/s][A
18it [00:11,  1.63it/s][A
19it [00:11,  1.63it/s][A
20it [00:12,  1.63it/s][A
21it [00:12,  1.63it/s][A
22it [00:13,  1.63it/s][A
23it [00:14,  1.63it/s][A
24it [00:14,  1.63it/s][A
25it [00:15,  1.63it/s][A
26it [00:16,  1.63it/s][A
27it [00:16,  1.63it/s][A
28it [00:17,  1.63it/s][A
29it [00:17,  1.63it/s][A
30it [00:18,  1.64it/s][A
31it [00:19,  1.63it/s][A
32it [00:19,  1.63it/s][A
33it [00:20,  1.62it/s][A
34it [00:20,  1.62it/s][A
35it [00:21,  1.62it/s][A
36it [00:22,  1.63it/s][A
37it [00:22,  


Best Model Saved!!



 40%|████      | 2/5 [1:08:06<1:42:10, 2043.41s/it]

Checkpoint Model Saved!

Epoch: 02
Train Total Loss: 0.385 | Train F1 Score: 0.914
Valid Total Loss: 0.419 | Valid F1 Score: 0.899
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.37it/s][A
2it [00:01,  1.51it/s][A
3it [00:01,  1.57it/s][A
4it [00:02,  1.59it/s][A
5it [00:03,  1.60it/s][A
6it [00:03,  1.61it/s][A
7it [00:04,  1.62it/s][A
8it [00:05,  1.62it/s][A
9it [00:05,  1.62it/s][A
10it [00:06,  1.62it/s][A
11it [00:06,  1.62it/s][A
12it [00:07,  1.62it/s][A
13it [00:08,  1.62it/s][A
14it [00:08,  1.62it/s][A
15it [00:09,  1.62it/s][A
16it [00:09,  1.62it/s][A
17it [00:10,  1.62it/s][A
18it [00:11,  1.62it/s][A
19it [00:11,  1.62it/s][A
20it [00:12,  1.62it/s][A
21it [00:13,  1.62it/s][A
22it [00:13,  1.62it/s][A
23it [00:14,  1.62it/s][A
24it [00:14,  1.63it/s][A
25it [00:15,  1.63it/s][A
26it [00:16,  1.62it/s][A
27it [00:16,  1.62it/s][A
28it [00:17,  1.62it/s][A
29it [00:17,  1.62it/s][A
30it [00:18,  1.63it/s][A
31it [00:19,  1.63it/s][A
32it [00:19,  1.63it/s][A
33it [00:20,  1.62it/s][A
34it [00:21,  1.62it/s][A
35it [00:21,  1.62it/s][A
36it [00:22,  1.62it/s][A
37it [00:22,  

Checkpoint Model Saved!

Epoch: 03
Train Total Loss: 0.380 | Train F1 Score: 0.920
Valid Total Loss: 0.521 | Valid F1 Score: 0.866
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.38it/s][A
2it [00:01,  1.51it/s][A
3it [00:01,  1.56it/s][A
4it [00:02,  1.59it/s][A
5it [00:03,  1.60it/s][A
6it [00:03,  1.61it/s][A
7it [00:04,  1.62it/s][A
8it [00:05,  1.62it/s][A
9it [00:05,  1.62it/s][A
10it [00:06,  1.62it/s][A
11it [00:06,  1.63it/s][A
12it [00:07,  1.63it/s][A
13it [00:08,  1.63it/s][A
14it [00:08,  1.62it/s][A
15it [00:09,  1.62it/s][A
16it [00:09,  1.63it/s][A
17it [00:10,  1.63it/s][A
18it [00:11,  1.63it/s][A
19it [00:11,  1.63it/s][A
20it [00:12,  1.63it/s][A
21it [00:13,  1.62it/s][A
22it [00:13,  1.62it/s][A
23it [00:14,  1.63it/s][A
24it [00:14,  1.63it/s][A
25it [00:15,  1.62it/s][A
26it [00:16,  1.63it/s][A
27it [00:16,  1.63it/s][A
28it [00:17,  1.63it/s][A
29it [00:17,  1.63it/s][A
30it [00:18,  1.63it/s][A
31it [00:19,  1.63it/s][A
32it [00:19,  1.62it/s][A
33it [00:20,  1.62it/s][A
34it [00:21,  1.62it/s][A
35it [00:21,  1.62it/s][A
36it [00:22,  1.62it/s][A
37it [00:22,  

Checkpoint Model Saved!

Epoch: 04
Train Total Loss: 0.719 | Train F1 Score: 0.849
Valid Total Loss: 0.801 | Valid F1 Score: 0.830
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.39it/s][A
2it [00:01,  1.52it/s][A
3it [00:01,  1.56it/s][A
4it [00:02,  1.58it/s][A
5it [00:03,  1.60it/s][A
6it [00:03,  1.61it/s][A
7it [00:04,  1.62it/s][A
8it [00:05,  1.62it/s][A
9it [00:05,  1.62it/s][A
10it [00:06,  1.62it/s][A
11it [00:06,  1.62it/s][A
12it [00:07,  1.61it/s][A
13it [00:08,  1.62it/s][A
14it [00:08,  1.62it/s][A
15it [00:09,  1.62it/s][A
16it [00:09,  1.62it/s][A
17it [00:10,  1.62it/s][A
18it [00:11,  1.62it/s][A
19it [00:11,  1.62it/s][A
20it [00:12,  1.63it/s][A
21it [00:13,  1.63it/s][A
22it [00:13,  1.63it/s][A
23it [00:14,  1.62it/s][A
24it [00:14,  1.62it/s][A
25it [00:15,  1.62it/s][A
26it [00:16,  1.62it/s][A
27it [00:16,  1.63it/s][A
28it [00:17,  1.62it/s][A
29it [00:17,  1.62it/s][A
30it [00:18,  1.62it/s][A
31it [00:19,  1.62it/s][A
32it [00:19,  1.62it/s][A
33it [00:20,  1.62it/s][A
34it [00:21,  1.63it/s][A
35it [00:21,  1.62it/s][A
36it [00:22,  1.62it/s][A
37it [00:22,  

Checkpoint Model Saved!

Epoch: 05
Train Total Loss: 0.747 | Train F1 Score: 0.841
Valid Total Loss: 0.790 | Valid F1 Score: 0.830
--------------------





In [35]:
# Per-class report for the validation predictions kept from the epoch with the
# lowest validation loss (best_target / best_pred are set inside the training loop).
print(classification_report(best_target, best_pred))

              precision    recall  f1-score   support

           0       0.96      0.91      0.94       594
           1       0.66      0.84      0.74       122

    accuracy                           0.90       716
   macro avg       0.81      0.87      0.84       716
weighted avg       0.91      0.90      0.90       716



In [36]:
# Micro-averaged F1 for the best-epoch validation predictions.
# NOTE(review): f1_score is already imported in the notebook's import cell, so this import is redundant.
from sklearn.metrics import f1_score
f1_score(best_target, best_pred, average='micro')

0.8994413407821229

In [37]:
from google.colab import drive

In [41]:
# Mount Google Drive under /content/gdrive so saved model files can be copied there.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [39]:
from glob import glob

In [42]:
# from glob import glob
# for filepath in glob("*.pt"):
#     !cp -r $filepath /content/gdrive/My\ Drive/Colab\ Notebooks/
#     time.sleep(10)

In [46]:
# Copy one per-epoch checkpoint to Google Drive.
# NOTE(review): checkpoint files are named with the zero-based loop index, so
# "rob3" is the epoch printed as "Epoch: 04", not the lowest-validation-loss
# model (that one is saved as model_least_loss_rob.pt) — confirm this is the intended file.
!cp -r model_checkpoint_rob3.pt /content/gdrive/My\ Drive/Colab\ Notebooks/

In [None]:
# with open('/content/gdrive/My Drive/', 'w') as handle:
#     handle.write()