# Task 4: Classification of tweets self-reporting potential COVID19 cases

In [1]:
!pip install -q transformers contractions imbalanced-learn ekphrasis

[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4.0 MB 5.3 MB/s 
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 80 kB 6.6 MB/s 
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 880 kB 34.6 MB/s 
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 77 kB 5.6 MB/s 
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6.6 MB 32.8 MB/s 
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 596 kB 42.2 MB/s 
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 106 kB 39.3 MB/s 
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 287 kB 49.9 MB/s 
[K  

## 1. Import all the necessary libraries and data files

In [2]:
import numpy as np
import pandas as pd

import warnings
import torch
import torch.nn as nn
import time

from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertTokenizerFast
from transformers import RobertaTokenizerFast, RobertaModel
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm
from sklearn.metrics import f1_score

warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=None

In [3]:
train_filename = "train.tsv"
val_filename = "valid.tsv"

In [4]:
# Load data
train = pd.read_csv(train_filename, sep="\t", names=["tweet_id", "user_id", "tweet", "label"])
validation = pd.read_csv(val_filename, sep="\t")

In [5]:
print(f"Shape of training data is {train.shape} and validation data is {validation.shape}")

Shape of training data is (6465, 4) and validation data is (716, 4)


In [6]:
# Train top 5 rows
train.head().style.set_caption("Task 4: Train dataset")

Unnamed: 0,tweet_id,user_id,tweet,label
0,1239172732690014208,2391447188,We‚Äôre parking at the airport and my mom rolled down the window to speak to an attendant and my dad immediately said ‚Äúwe have the coronavirus sir‚Äù,0
1,1223737201030246402,1200539436167159809,I really didn‚Äôt expect this will go wide this way. I hope safety & health for all people of #Chine & whole world. We are just trying to show some support & respect to them as much we can especially doctors who bravely facing the dirty #coronaVirus.,0
2,1239385333319389185,838382730,"For those who believe they are immortal and continue to go out to the park without paying attention to the order to remain at home, these are the x-rays of a 28-year-old boy intubated in the ICU in my hospital for #coronavirus. Hint: the lungs are black, white is pneumonia",1
3,1236209435241938945,780855138,My flight from Jordan back to the US stops in Paris üòÇ will I be quarantined? Stay tuned to find out üòÇüòÇ #coronavirus,0
4,1233855551605440514,337103373,I went to the movies and the air was on. Now I'm out to eat and Olive Garden has the air on. I see these establishments are doing their best to fight the coronavirus.,0


## 2. Prepare the data - Clean & Prepare for Model

In [7]:
# Drop unwanted columns
train.drop(['tweet_id', 'user_id'], axis=1, inplace=True)
validation.drop(['tweet_id', 'user_id'], axis=1, inplace=True)

In [8]:
# Referred from: https://github.com/cbaziotis/ekphrasis

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


In [9]:
train['clean_tweets'] = [" ".join(text_processor.pre_process_doc(tweet)) for tweet in train.tweet]
validation['clean_tweets'] = [" ".join(text_processor.pre_process_doc(tweet)) for tweet in validation.tweet]

In [10]:
# Train top 5 rows after pre-processing
train[['label', 'clean_tweets']].head()

Unnamed: 0,label,clean_tweets
0,0,we ‚Äô re parking at the airport and my mom rolled down the window to speak to an attendant and my dad immediately said ‚Äú we have the coronavirus sir ‚Äù
1,0,i really didn ‚Äô t expect this will go wide this way . i hope safety & health for all people of <hashtag> chine </hashtag> & whole world . we are just trying to show some support & respect to them as much we can especially doctors who bravely facing the dirty <hashtag> corona virus </hashtag> .
2,1,"for those who believe they are immortal and continue to go out to the park without paying attention to the order to remain at home , these are the x - rays of a <number> - year - old boy intubated in the <allcaps> icu </allcaps> in my hospital for <hashtag> coronavirus </hashtag> . hint : the lungs are black , white is pneumonia"
3,0,my flight from jordan back to the us stops in paris üòÇ will i be quarantined ? stay tuned to find out üòÇ üòÇ <hashtag> coronavirus </hashtag>
4,0,i went to the movies and the air was on . now i am out to eat and olive garden has the air on . i see these establishments are doing their best to fight the coronavirus .


In [11]:
BATCH_SIZE = 32
N_EPOCHS = 5

In [12]:
# Define BERT tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [13]:
# Tokenize train and validation data
train_enc_rob = tokenizer.batch_encode_plus(train.clean_tweets.to_list(), padding="longest", truncation=True, max_length=128, return_tensors="pt")
valid_enc_rob = tokenizer.batch_encode_plus(validation.clean_tweets.to_list(), padding="longest", truncation=True, max_length=128, return_tensors="pt")

In [14]:
train_enc_rob.keys()

dict_keys(['input_ids', 'attention_mask'])

In [15]:
train_enc_rob.input_ids.shape, train_enc_rob.attention_mask.shape

(torch.Size([6465, 128]), torch.Size([6465, 128]))

In [16]:
def get_dataloader_rob(encoding, target):
    data = (TensorDataset(encoding.input_ids, encoding.attention_mask, target))
    sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)
    return dataloader

In [17]:
train_dataloader_rob = get_dataloader_rob(train_enc_rob, torch.tensor(train['label'].to_list()))
valid_dataloader_rob = get_dataloader_rob(valid_enc_rob, torch.tensor(validation['label'].to_list()))

In [18]:
# Sanity check that the tensors returned by the dataloader are correct
for batch in train_dataloader_rob:
    input_ids, attn_mask, target = batch
    print(input_ids.shape, attn_mask.shape, target.shape)
    break

torch.Size([32, 128]) torch.Size([32, 128]) torch.Size([32])


## 3. Model Building - Roberta

In [19]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [20]:
class ROBERTAclassifier(nn.Module):
    def __init__(self, transformer):
        super(ROBERTAclassifier, self).__init__()
        self.transformer = transformer
        self.linear_layer = nn.Linear(768, 2)
    
    def forward(self, ip_ids, attn_mask):
        op = self.transformer(input_ids=ip_ids,
                              attention_mask=attn_mask)
        return  self.linear_layer(op["pooler_output"])

In [21]:
def count_parameter(model):
    return sum(para.numel() for para in model.parameters() if para.requires_grad)

In [22]:
transformer_rob = RobertaModel.from_pretrained("roberta-base")
model_rob = ROBERTAclassifier(transformer_rob).to(device)
print(f"The model has {count_parameter(model_rob)} trainable parameters.")

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The model has 124647170 trainable parameters.


In [23]:
# Define optimizer 
criterion_rob = torch.nn.CrossEntropyLoss()
optim_rob = torch.optim.AdamW(model_rob.parameters(), lr = 2e-5)

In [None]:
def train_model_rob(model, dataloader, clip=1.0):
    model.train()

    epoch_loss = 0
    batch_num = 0
    pred, target = [], []

    for index, batch in tqdm(enumerate(dataloader)):
        batch = tuple(row.to(device) for row in batch)
        input_ids, attn_mask, y = batch

        optim_rob.zero_grad()
        output = model(input_ids, attn_mask)
        loss = criterion_rob(output, y)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim_rob.step()

        epoch_loss += loss.item()
        batch_num += 1
        pred.extend(torch.argmax(output, -1).tolist())
        target.extend(y.tolist())
    
    return epoch_loss/batch_num, f1_score(target, pred)

def evaluate_rob(model, dataloader):
    model.eval()

    epoch_loss = 0
    batch_num = 0
    pred, target = list(), list()

    for index, batch in enumerate(dataloader):
        batch = tuple(row.to(device) for row in batch)
        input_ids, attn_mask, y = batch
        
        with torch.no_grad():
            output = model(input_ids, attn_mask)
            loss = criterion_rob(output, y)
            
            epoch_loss += loss.item()
            batch_num += 1
            pred.extend(torch.argmax(output, -1).tolist())
            target.extend(y.tolist())
    
    return epoch_loss/batch_num, f1_score(target, pred), pred, target

In [None]:
best_valid_loss_rob = float('inf')
total_train_loss_rob, total_valid_loss_rob = list(), list()

In [None]:
for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_f1_score = train_model_rob(model_rob, train_dataloader_rob)
    total_train_loss_rob.append(train_loss)

    valid_loss, valid_f1_score, pred, target = evaluate_rob(model_rob, valid_dataloader_rob)
    total_valid_loss_rob.append(valid_loss)

    if valid_loss < best_valid_loss_rob:
        best_valid_loss_rob = valid_loss
        best_pred, best_target = pred, target
        torch.save(model_rob.state_dict(), "model_least_loss_rob.pt")
        print("\nBest Model Saved!!\n")
    
    torch.save(model_rob.state_dict(), "model_checkpoint_rob" + str(epoch) + ".pt")
    print("Checkpoint Model Saved!\n")

    print(f"Epoch: {epoch+1:02}")
    print(f"Train Total Loss: {train_loss:.3f} | Train F1 Score: {train_f1_score:.3f}")
    print(f"Valid Total Loss: {valid_loss:.3f} | Valid F1 Score: {valid_f1_score:.3f}")
    print("-"*20)

  0%|          | 0/5 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  1.15it/s][A
2it [00:01,  1.16it/s][A
3it [00:02,  1.31it/s][A
4it [00:03,  1.40it/s][A
5it [00:03,  1.45it/s][A
6it [00:04,  1.49it/s][A
7it [00:04,  1.51it/s][A
8it [00:05,  1.52it/s][A
9it [00:06,  1.53it/s][A
10it [00:06,  1.53it/s][A
11it [00:07,  1.54it/s][A
12it [00:08,  1.54it/s][A
13it [00:08,  1.54it/s][A
14it [00:09,  1.54it/s][A
15it [00:10,  1.54it/s][A
16it [00:10,  1.53it/s][A
17it [00:11,  1.51it/s][A
18it [00:12,  1.50it/s][A
19it [00:12,  1.50it/s][A
20it [00:13,  1.50it/s][A
21it [00:14,  1.50it/s][A
22it [00:14,  1.50it/s][A
23it [00:15,  1.48it/s][A
24it [00:16,  1.46it/s][A
25it [00:16,  1.48it/s][A
26it [00:17,  1.48it/s][A
27it [00:18,  1.49it/s][A
28it [00:18,  1.50it/s][A
29it [00:19,  1.50it/s][A
30it [00:20,  1.50it/s][A
31it [00:20,  1.50it/s][A
32it [00:21,  1.50it/s][A
33it [00:22,  1.50it/s][A
34it [00:22,  1.49it/s][A
35it [00:23,  1.49it/s][A
36it


Best Model Saved!!



 20%|‚ñà‚ñà        | 1/5 [02:33<10:15, 153.90s/it]

Checkpoint Model Saved!

Epoch: 01
Train Total Loss: 0.338 | Train F1 Score: 0.406
Valid Total Loss: 0.299 | Valid F1 Score: 0.591
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.34it/s][A
2it [00:01,  1.35it/s][A
3it [00:02,  1.36it/s][A
4it [00:02,  1.37it/s][A
5it [00:03,  1.37it/s][A
6it [00:04,  1.37it/s][A
7it [00:05,  1.37it/s][A
8it [00:05,  1.37it/s][A
9it [00:06,  1.37it/s][A
10it [00:07,  1.36it/s][A
11it [00:08,  1.36it/s][A
12it [00:08,  1.36it/s][A
13it [00:09,  1.35it/s][A
14it [00:10,  1.36it/s][A
15it [00:11,  1.36it/s][A
16it [00:11,  1.35it/s][A
17it [00:12,  1.35it/s][A
18it [00:13,  1.35it/s][A
19it [00:14,  1.34it/s][A
20it [00:14,  1.34it/s][A
21it [00:15,  1.34it/s][A
22it [00:16,  1.34it/s][A
23it [00:17,  1.34it/s][A
24it [00:17,  1.34it/s][A
25it [00:18,  1.33it/s][A
26it [00:19,  1.33it/s][A
27it [00:20,  1.32it/s][A
28it [00:20,  1.32it/s][A
29it [00:21,  1.32it/s][A
30it [00:22,  1.31it/s][A
31it [00:23,  1.32it/s][A
32it [00:23,  1.32it/s][A
33it [00:24,  1.32it/s][A
34it [00:25,  1.32it/s][A
35it [00:26,  1.32it/s][A
36it [00:26,  1.32it/s][A
37it [00:27,  


Best Model Saved!!



 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [05:13<07:50, 156.97s/it]

Checkpoint Model Saved!

Epoch: 02
Train Total Loss: 0.216 | Train F1 Score: 0.715
Valid Total Loss: 0.219 | Valid F1 Score: 0.748
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.32it/s][A
2it [00:01,  1.34it/s][A
3it [00:02,  1.36it/s][A
4it [00:02,  1.36it/s][A
5it [00:03,  1.36it/s][A
6it [00:04,  1.36it/s][A
7it [00:05,  1.37it/s][A
8it [00:05,  1.37it/s][A
9it [00:06,  1.37it/s][A
10it [00:07,  1.37it/s][A
11it [00:08,  1.37it/s][A
12it [00:08,  1.36it/s][A
13it [00:09,  1.36it/s][A
14it [00:10,  1.36it/s][A
15it [00:11,  1.36it/s][A
16it [00:11,  1.36it/s][A
17it [00:12,  1.36it/s][A
18it [00:13,  1.36it/s][A
19it [00:13,  1.35it/s][A
20it [00:14,  1.35it/s][A
21it [00:15,  1.35it/s][A
22it [00:16,  1.35it/s][A
23it [00:16,  1.35it/s][A
24it [00:17,  1.35it/s][A
25it [00:18,  1.35it/s][A
26it [00:19,  1.35it/s][A
27it [00:19,  1.35it/s][A
28it [00:20,  1.35it/s][A
29it [00:21,  1.35it/s][A
30it [00:22,  1.35it/s][A
31it [00:22,  1.34it/s][A
32it [00:23,  1.34it/s][A
33it [00:24,  1.34it/s][A
34it [00:25,  1.34it/s][A
35it [00:25,  1.34it/s][A
36it [00:26,  1.34it/s][A
37it [00:27,  

Checkpoint Model Saved!

Epoch: 03
Train Total Loss: 0.150 | Train F1 Score: 0.824
Valid Total Loss: 0.301 | Valid F1 Score: 0.722
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.44it/s][A
2it [00:01,  1.37it/s][A
3it [00:02,  1.36it/s][A
4it [00:02,  1.36it/s][A
5it [00:03,  1.36it/s][A
6it [00:04,  1.35it/s][A
7it [00:05,  1.35it/s][A
8it [00:05,  1.35it/s][A
9it [00:06,  1.35it/s][A
10it [00:07,  1.36it/s][A
11it [00:08,  1.35it/s][A
12it [00:08,  1.35it/s][A
13it [00:09,  1.36it/s][A
14it [00:10,  1.35it/s][A
15it [00:11,  1.36it/s][A
16it [00:11,  1.35it/s][A
17it [00:12,  1.35it/s][A
18it [00:13,  1.35it/s][A
19it [00:14,  1.35it/s][A
20it [00:14,  1.34it/s][A
21it [00:15,  1.34it/s][A
22it [00:16,  1.35it/s][A
23it [00:16,  1.35it/s][A
24it [00:17,  1.35it/s][A
25it [00:18,  1.35it/s][A
26it [00:19,  1.34it/s][A
27it [00:19,  1.35it/s][A
28it [00:20,  1.35it/s][A
29it [00:21,  1.33it/s][A
30it [00:22,  1.34it/s][A
31it [00:22,  1.34it/s][A
32it [00:23,  1.34it/s][A
33it [00:24,  1.34it/s][A
34it [00:25,  1.34it/s][A
35it [00:25,  1.35it/s][A
36it [00:26,  1.35it/s][A
37it [00:27,  

Checkpoint Model Saved!

Epoch: 04
Train Total Loss: 0.118 | Train F1 Score: 0.879
Valid Total Loss: 0.256 | Valid F1 Score: 0.728
--------------------



0it [00:00, ?it/s][A
1it [00:00,  1.44it/s][A
2it [00:01,  1.37it/s][A
3it [00:02,  1.36it/s][A
4it [00:02,  1.37it/s][A
5it [00:03,  1.36it/s][A
6it [00:04,  1.35it/s][A
7it [00:05,  1.36it/s][A
8it [00:05,  1.35it/s][A
9it [00:06,  1.35it/s][A
10it [00:07,  1.35it/s][A
11it [00:08,  1.36it/s][A
12it [00:08,  1.35it/s][A
13it [00:09,  1.35it/s][A
14it [00:10,  1.36it/s][A
15it [00:11,  1.36it/s][A
16it [00:11,  1.36it/s][A
17it [00:12,  1.36it/s][A
18it [00:13,  1.36it/s][A
19it [00:13,  1.36it/s][A
20it [00:14,  1.36it/s][A
21it [00:15,  1.36it/s][A
22it [00:16,  1.36it/s][A
23it [00:16,  1.36it/s][A
24it [00:17,  1.36it/s][A
25it [00:18,  1.36it/s][A
26it [00:19,  1.36it/s][A
27it [00:19,  1.36it/s][A
28it [00:20,  1.36it/s][A
29it [00:21,  1.36it/s][A
30it [00:22,  1.35it/s][A
31it [00:22,  1.35it/s][A
32it [00:23,  1.35it/s][A
33it [00:24,  1.35it/s][A
34it [00:25,  1.35it/s][A
35it [00:25,  1.35it/s][A
36it [00:26,  1.35it/s][A
37it [00:27,  

Checkpoint Model Saved!

Epoch: 05
Train Total Loss: 0.094 | Train F1 Score: 0.907
Valid Total Loss: 0.391 | Valid F1 Score: 0.712
--------------------





In [None]:
print(classification_report(best_target, best_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       594
           1       0.74      0.75      0.75       122

    accuracy                           0.91       716
   macro avg       0.85      0.85      0.85       716
weighted avg       0.91      0.91      0.91       716



In [None]:
!zip -r content.zip ../content/ 

  adding: ../content/ (stored 0%)
  adding: ../content/.config/ (stored 0%)
  adding: ../content/.config/.feature_flags_config.yaml (deflated 23%)
  adding: ../content/.config/gce (stored 0%)
  adding: ../content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: ../content/.config/.metricsUUID (stored 0%)
  adding: ../content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: ../content/.config/config_sentinel (stored 0%)
  adding: ../content/.config/logs/ (stored 0%)
  adding: ../content/.config/logs/2022.04.19/ (stored 0%)
  adding: ../content/.config/logs/2022.04.19/14.22.58.550404.log (deflated 86%)
  adding: ../content/.config/logs/2022.04.19/14.23.29.232715.log (deflated 54%)
  adding: ../content/.config/logs/2022.04.19/14.23.28.447853.log (deflated 55%)
  adding: ../content/.config/logs/2022.04.19/14.22.14.343189.log (deflated 91%)
  adding: ../content/.config/logs/2022.04.19/14.23.07.556024.log (deflated 54%)
  adding: ../content/.config/logs/2022.04.19/14.22.38.448952

In [None]:
from google.colab import drive

In [None]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from glob import glob

In [None]:
for filepath in glob("*.pt"):
    !cp -r $filepath /content/gdrive/My\ Drive/Colab\ Notebooks/
    time.sleep(10)

In [None]:
# !cp -r model_least_loss_rob.pt /content/gdrive/My\ Drive/Colab\ Notebooks/NLP\ Final\ Project/

In [None]:
# with open('/content/gdrive/My Drive/', 'w') as handle:
#     handle.write()

In [24]:
# Loading the saved model
output_model = 'model_least_loss_rob.pt'

model_test = ROBERTAclassifier(transformer_rob).to(device)
model_test.load_state_dict(torch.load(output_model, map_location=device))

<All keys matched successfully>

In [None]:
# Training set

# Set model to evaluation
# model_test.eval()

# y_pred_train, y_true_train = [], []
# train_indexes_list = []

# for index, batch in enumerate(valid_dataloader_rob):
#     batch = tuple(row.to(device) for row in batch)
#     input_ids, attn_mask, target, indexes = batch
    
#     with torch.no_grad():
#         output = model_test(input_ids, attn_mask)
        
#         y_pred_train.extend(torch.argmax(output, -1).tolist())
#         y_true_train.extend(target.tolist())
#         train_indexes_list.extend(indexes.tolist())

In [None]:
# print(f"F1-score: {f1_score(y_true_train, y_pred_train)}\n", f"Classification report: \n{classification_report(y_true_train, y_pred_train)}", sep='\n')

F1-score: 0.9996512033484479

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       739
           1       1.00      1.00      1.00      1433

    accuracy                           1.00      2172
   macro avg       1.00      1.00      1.00      2172
weighted avg       1.00      1.00      1.00      2172



In [27]:
# Validation set
model_test.eval()
y_pred_valid, y_true_valid = [], []
valid_indexes_list = []

for index, batch in enumerate(valid_dataloader_rob):
    batch = tuple(row.to(device) for row in batch)
    input_ids, attn_mask, target = batch
    
    with torch.no_grad():
        output = model_test(input_ids, attn_mask)
        
        y_pred_valid.extend(torch.argmax(output, -1).tolist())
        y_true_valid.extend(target.tolist())
        # valid_indexes_list.extend(indexes.tolist())

In [28]:
print(f"F1-score: {f1_score(y_true_valid, y_pred_valid)}\n", f"Classification report: \n{classification_report(y_true_valid, y_pred_valid)}", sep='\n')

F1-score: 0.7479674796747967

Classification report: 
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       594
           1       0.74      0.75      0.75       122

    accuracy                           0.91       716
   macro avg       0.85      0.85      0.85       716
weighted avg       0.91      0.91      0.91       716



In [29]:
validation['pred'] = y_pred_valid

In [30]:
pd.options.display.max_rows = None

In [31]:
validation[validation.label!=validation.pred]

Unnamed: 0,tweet,label,clean_tweets,pred
2,@Swanny1875 I've had a cough but there again I always get a cough. The doc said that if you're able to take a deep breath then it's definitely not coronavirus.,1,<user> i have had a cough but there again i always get a cough . the doc said that if you are able to take a deep breath then it ' s definitely not coronavirus .,0
14,"Dear Non-Quarantined nations: I‚Äôm a firm believer in the 5 second rule. As a kid I set my own broken fingers as they didn‚Äôt pass the doctor visit threshold. I‚Äôm no ü¶ã. Learn from our mistakes. Take #coronavirus seriously. Living through this is unbelievably scary. Love, Italy",0,"dear non - quarantined nations : i ‚Äô m a firm believer in the <number> second rule . as a kid i set my own broken fingers as they didn ‚Äô t pass the doctor visit threshold . i ‚Äô m no ü¶ã . learn from our mistakes . take <hashtag> coronavirus </hashtag> seriously . living through this is unbelievably scary . love , italy",1
18,"Someone told me u normally dont run a high fever with pneumonia plus I've had a headache the whole time. She said it's possible i have it just tested too soon. My worry, if you had coronavirus how would u know since they obviously have no tests? They asked her if she cond/",1,"someone told me u normally dont run a high fever with pneumonia plus i have had a headache the whole time . she said it ' s possible i have it just tested too soon . my worry , if you had coronavirus how would u know since they obviously have no tests ? they asked her if she cond /",0
21,"We are socially distanced, but in many ways, we are more connected than ever. Let's all reboot &amp; have a slow motion moment together. We got this. Read more on the blog. https://t.co/ppZoo8A7NS #coronavirus #covid19 #covid_19 #covid-19 #socialdistancing #socialdistancingworks https://t.co/0E1Su737O7",0,"we are socially distanced , but in many ways , we are more connected than ever . let us all reboot & have a slow motion moment together . we got this . read more on the blog . <url> <hashtag> coronavirus </hashtag> <hashtag> covid19 </hashtag> <hashtag> covid 19 </hashtag> <hashtag> co vid </hashtag> - <number> <hashtag> social distancing </hashtag> <hashtag> social distancing works </hashtag> <url>",1
22,i lowkey think i caught the #coronavirus at the #HoustonRodeo last night i been coughing up a storm today and i know i wasnt sick bc i was fine before the concert üòÇ,1,i lowkey think i caught the <hashtag> coronavirus </hashtag> at the <hashtag> houston rodeo </hashtag> last night i been coughing up a storm today and i know i wasnt sick bc i was fine before the concert üòÇ,0
24,Work in the travel industry while there is a virus outbreak is challenging and exhausting I can‚Äôt imagine working in hospital during these hard times #coronavirus,0,work in the travel industry while there is a virus outbreak is challenging and exhausting i can ‚Äô t imagine working in hospital during these hard times <hashtag> coronavirus </hashtag>,1
25,Today I came across my first customer who is self isolating due to coronavirus.. Thankfully I didn‚Äôt get too close or in the vicinity to be infected.. Seems she had been sent home to self isolate her family were appreciative of the visit I paid to them ..,1,today i came across my first customer who is self isolating due to coronavirus . <repeated> thankfully i didn ‚Äô t get too close or in the vicinity to be infected . <repeated> seems she had been sent home to self isolate her family were appreciative of the visit i paid to them . <repeated>,0
27,@SebGorka @RichardHaass To think this CoronaVirus from Wuhan China is not a bio warfare weapon we are being naive. Possibility exist that it started in a Wuhan Bio warfare lab in Wuhan China. Why did china keep our doctors out silence doctors reporters &amp; others? What are they hiding was it deliberate?,0,<user> <user> to think this coronavirus from wuhan china is not a bio warfare weapon we are being naive . possibility exist that it started in a wuhan bio warfare lab in wuhan china . why did china keep our doctors out silence doctors reporters & others ? what are they hiding was it deliberate ?,1
28,im soooo sick of hearing about the election &amp; we have 9 more months until it ends üò© by now everyone has picked their candidate &amp; there‚Äôs no changing their mind talk about something else please and not the coronavirus either,0,im so <elongated> sick of hearing about the election & we have <number> more months until it ends üò© by now everyone has picked their candidate & there ‚Äô s no changing their mind talk about something else please and not the coronavirus either,1
30,Uhhhh Erin Burnett needs a Coronavirus test. She can‚Äôt stop coughing omg this is awkward. I@ surprised Sanjay Gupta isn‚Äôt hiding under the desk,0,uh <elongated> erin burnett needs a coronavirus test . she can ‚Äô t stop coughing omg this is awkward . i @ surprised sanjay gupta isn ‚Äô t hiding under the desk,1


In [32]:
validation[validation.label==validation.pred]

Unnamed: 0,tweet,label,clean_tweets,pred
0,"I went to a buffet, a football game with 20,000 people, and karaoke today. If I don't have the #coronavirus after this, it don't exist or I'm already immune. Stay tuned!",0,"i went to a buffet , a football game with <number> people , and karaoke today . if i do not have the <hashtag> coronavirus </hashtag> after this , it do not exist or i am already immune . stay tuned !",0
1,people at the airport are going to think i have the damn Coronavirus... just wonderful,0,people at the airport are going to think i have the damn coronavirus . <repeated> just wonderful,0
3,Soon as I hear a cough I‚Äôm assuming it‚Äôs the Coronavirus back tf up !,0,soon as i hear a cough i ‚Äô m assuming it ‚Äô s the coronavirus back tf up !,0
4,"Can I self quarantine from people over obsessing about the Coronavirus!? Yes, we it Sharon! You read the Sun!",0,"can i self quarantine from people over obsessing about the coronavirus ? ! <repeated> yes , we it sharon ! you read the sun !",0
5,"Girl, 7, who lives in The Bronx is the YOUNGEST person in the US to be diagnosed with coronavirus https://t.co/cr1iYH7FUZ",0,"girl , <number> , who lives in the bronx is the <allcaps> youngest </allcaps> person in the us to be diagnosed with coronavirus <url>",0
6,When I went to buy a few bottles of hand sanitizer at Walmart 2 weeks ago there were only 3 on the shelf. I assumed they were other doomsday prepers. My husband thought I was being ridiculous. Now I‚Äôm seeing this on @eBay &amp; I‚Äôm happy with my decision. #handsanitizer #coronavirus https://t.co/RkfHjJDoc2,0,when i went to buy a few bottles of hand sanitizer at walmart <number> weeks ago there were only <number> on the shelf . i assumed they were other doomsday prepers . my husband thought i was being ridiculous . now i ‚Äô m seeing this on <user> & i ‚Äô m happy with my decision . <hashtag> hand sanitizer </hashtag> <hashtag> coronavirus </hashtag> <url>,0
7,"Exactly @JasonZocchi ! I have spent hours investigating this issue. It all started with my kids I self quarantined when they had all the symptoms, then a question from someone about where elderly people can get tested. Still no answers. #CoronavirusUSA #coronavirus #orleg",1,"exactly <user> ! i have spent hours investigating this issue . it all started with my kids i self quarantined when they had all the symptoms , then a question from someone about where elderly people can get tested . still no answers . <hashtag> coronavirus usa </hashtag> <hashtag> coronavirus </hashtag> <hashtag> or leg </hashtag>",1
8,"@specterm Not really. I have had the flu shot which should limit the extent of my illness, I know the treatment protocol for it, and it‚Äôs less likely to be ‚Äúhidden‚Äù so that I can take adequate precautions to protect my family and my newborn. The unknowns of coronavirus make me nervous.",0,"<user> not really . i have had the flu shot which should limit the extent of my illness , i know the treatment protocol for it , and it ‚Äô s less likely to be ‚Äú hidden ‚Äù so that i can take adequate precautions to protect my family and my newborn . the unknowns of coronavirus make me nervous .",0
9,If you been to China in th past month stay in your home and don‚Äôt come in contact with other people. I don‚Äôt need the coronavirus spreading to Nevada,0,if you been to china in th past month stay in your home and don ‚Äô t come in contact with other people . i don ‚Äô t need the coronavirus spreading to nevada,0
10,I got an international flight next month and this coronavirus is stressing me out!!! üò©,0,i got an international flight next month and this coronavirus is stressing me out ! <repeated> üò©,0


In [None]:
# training.loc[train_indexes_list, 'ADR'] = y_pred_train
# validation.loc[valid_indexes_list, 'ADR'] = y_pred_valid

In [None]:
# training.to_csv("training_data_with_ADR.csv")
# validation.to_csv("validation_data_with_ADR.csv")

In [None]:
# Cross-check that it is correctly mapped
print(f"F1-score: {f1_score(training.label, training.ADR)}\n", f"Classification report: \n{classification_report(training.label, training.ADR)}", sep='\n')

F1-score: 0.9996512033484479

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       739
           1       1.00      1.00      1.00      1433

    accuracy                           1.00      2172
   macro avg       1.00      1.00      1.00      2172
weighted avg       1.00      1.00      1.00      2172

