In [None]:
# Mount Google Drive inside the Colab VM and change into the project folder so
# that the relative paths used further down (e.g. 'Dataset/train.csv') resolve.
from google.colab import drive
drive.mount('/content/drive')
%cd  /content/drive/MyDrive/Colab_Notebooks/DL

Mounted at /content/drive
/content/drive/MyDrive/Colab_Notebooks/DL


# Import libraries

In [None]:
# Install the pinned versions of transformers and torch used by this notebook.
!pip install transformers==3.5.1
!pip install torch==1.4.0

In [None]:
import datetime
import os
import random
import time

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup, Adafactor

In [None]:
# Seed every random number generator the notebook touches (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
seed_val = 5
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Load data and tokenize

Steps:

- Tokenize all the text with BertTokenizer (check max len of tweets)

- Get tokens and masks from BertTokenizer. Put the data into Tensor dataset (input_ids, attention_masks, labels)

- Divide in train and validation. Create a DataLoader for each set

- Train the model

In [None]:
# Read the two CSV files from the Dataset folder (paths are relative to the
# working directory set in the first cell).
# `data` is the frame whose `text` / `target` columns are used below.
# NOTE(review): `pred` is not referenced by any later cell shown in this notebook.
data = pd.read_csv('Dataset/train.csv')
pred = pd.read_csv('Dataset/test.csv')
print(f'Number of labelled tweets: {data.shape[0]}\n')
# Show three randomly sampled rows as a quick sanity check of the columns.
display(data.sample(3))

Number of labelled tweets: 7613



Unnamed: 0,id,keyword,location,text,target
1014,1473,body%20bagging,,ÛÏ@MacDaddy_Leo: ?????? No Caption Needed ??....,1
7565,10814,wrecked,,Wrecked tired but not gonna be asleep before 3??,0
4406,6263,hijacking,,The ship has arrived safely. So it was quite u...,0


In [None]:
# Extract the tweet texts and their targets from the DataFrame as NumPy arrays.
# Both arrays keep the row order, so they can be sliced with the same index later.
sequences = data['text'].values
labels = data['target'].values

In [None]:
# Load the tokenizer that belongs to the 'bert-large-uncased' checkpoint.
# It maps words / word pieces to vocabulary IDs; padding, truncation and the
# attention mask are handled later, when the sequences are encoded.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# Show ONE example end to end so that the three printed lines describe the
# same tweet (the original printed the IDs of a different sentence).
sample_text = sequences[12]
sample_tokens = tokenizer.tokenize(sample_text)

print('Original: ', sample_text)
# Tokenizes sequences
print('Tokenized: ', sample_tokens)
# ID's for each token of the same sentence
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

In [None]:
# Hold out the last 10% of the labelled tweets as a test set. The split is
# positional (no shuffling); texts and labels are cut at the same index so
# they stay aligned.
idx = int(sequences.shape[0]*0.9)
train, test = sequences[:idx], sequences[idx:]
labels_training, labels_testing = labels[:idx], labels[idx:]

# Guard against texts and labels drifting out of sync.
assert len(train) == len(labels_training)
assert len(test) == len(labels_testing)

# Report the size of each split (the original printed the shape of the full
# DataFrame here instead of the training split).
print(train.shape)
print(test.shape)
print(labels_training.shape)
print(labels_testing.shape)

In [None]:
def tokenize(sequences, labels, max_length=84):
    """Encode texts with the module-level `tokenizer` and convert labels to a tensor.

    Args:
        sequences: iterable of strings to encode.
        labels: array-like of labels, one per sequence.
        max_length: number of tokens every sequence is padded or truncated
            to. Defaults to 84, the value previously hard-coded here.

    Returns:
        Tuple `(ids, masks, labels)`: the per-sequence `input_ids` and
        `attention_mask` tensors concatenated along dim 0, and `labels`
        converted with `torch.tensor`.
    """
    ids, masks = [], []
    for sequence in sequences:
        encoded = tokenizer.encode_plus(
            sequence,
            add_special_tokens=True,
            truncation='longest_first',
            max_length=max_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; both pad up to `max_length`.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Put it in the list of id's
        ids.append(encoded['input_ids'])
        # Same for the attention mask
        masks.append(encoded['attention_mask'])

    # Stack the per-sequence tensors into one tensor each
    ids = torch.cat(ids, dim=0)
    masks = torch.cat(masks, dim=0)

    labels = torch.tensor(labels)
    print(labels.shape)
    return ids, masks, labels

In [None]:
# Tokenize the sentences and get their IDs.
# Calls the `tokenize` helper defined above (the original called a
# non-existent `tokenize_map`) and passes / rebinds `labels_testing`, the name
# created by the split cell (the original used an undefined `labels_test`).
# After this cell both label variables are torch tensors.
train_ids, train_masks, labels_training = tokenize(train, labels_training)
test_ids, test_masks, labels_testing = tokenize(test, labels_testing)

In [None]:
# Bundle the encoded training split as (input_ids, attention_mask, label) triples.
# Uses the names produced by the tokenization cell; the original referenced
# `input_ids` / `attention_masks`, which are never defined.
dataset = TensorDataset(train_ids, train_masks, labels_training)

# split in 80% training, 20% val
# Sizes are derived from the dataset itself so they always sum to len(dataset),
# which random_split requires.
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size

# Randomize it
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(train_size,'training samples')
print(val_size,'validation samples')

# For batch size it's better 16 or 32
batch_size = 32

# Obtain dataloaders, for training random and for validation sequentially (order doesnt matter)
# The calls are wrapped in parentheses only: the original used `\` followed by
# trailing spaces, which is a SyntaxError.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)

# Held-out split: inputs only, kept in order so predictions line up with labels_testing.
prediction_data = TensorDataset(test_ids, test_masks)
test_dataloader = DataLoader(prediction_data, sampler=SequentialSampler(prediction_data), batch_size=batch_size)

# Training

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU,
# and report which one was chosen.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

if use_gpu:
    print('Number of GPUs:',torch.cuda.device_count(),', Name: ',torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Number of GPUs: 1 , Name:  Tesla T4


In [None]:
# Load the pre-trained 'bert-large-uncased' weights with a 2-label
# sequence-classification head; attention maps and hidden states are not
# returned. The model is then moved to the selected device.
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

In [None]:
# https://huggingface.co/docs/transformers/main_classes/optimizer_schedules

# optimizer = AdamW(model.parameters(),lr = 1e-4, eps = 1e-8)

# Adafactor is used instead of AdamW. relative_step, scale_parameter and
# warmup_init are all switched off, so the explicit lr below is the one applied
# (and later decayed by the scheduler).
adafactor_settings = dict(
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)
optimizer = Adafactor(model.parameters(), **adafactor_settings)

In [None]:
# Linear learning-rate decay without warm-up, stepped once per training batch:
# the schedule spans every batch of every epoch.
epochs = 3
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def get_acc(preds, labels):
    """Return the accuracy of the arg-max class of `preds` (along axis 1) against `labels`."""
    true_classes = labels.flatten()
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return accuracy_score(true_classes, predicted_classes)

def get_f1_score(preds, labels):
    """Return the F1 score of the arg-max class of `preds` (along axis 1) against `labels`."""
    true_classes = labels.flatten()
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(true_classes, predicted_classes)

def get_time(time):
    """Format a duration given in seconds as a `datetime.timedelta` string (h:mm:ss).

    The value is rounded to the nearest whole second first. Note that the
    parameter name shadows the `time` module inside this function.
    """
    whole_seconds = int(round(time))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Based on https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# and https://www.kaggle.com/code/datafan07/disaster-tweets-nlp-eda-bert-with-transformers
#
# Fine-tuning loop. Each epoch makes one pass over train_dataloader (with
# gradient clipping and an optimizer + scheduler step per batch), followed by
# one pass over validation_dataloader without gradients. One dict of
# statistics per epoch is appended to `training_stats`.

training_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs):

    # Training
    print('\nEpoch {:}/{:}'.format(epoch_i + 1, epochs))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 50 batches (skipping the very first one).
        if step % 50 == 0 and not step == 0:
            elapsed = get_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Forward
        # return_dict=False guarantees a (loss, logits) tuple, matching the
        # validation pass below, so the unpacking does not depend on the
        # library's default output type.
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels,
                             return_dict=False)

        total_train_loss += loss.item()

        # Backward
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = get_time(time.time() - t0)
    print('\nAverage training loss: {0:.2f}'.format(avg_train_loss))
    print('Training epcoh took: {:}'.format(training_time))

    # Validation
    print ('validation: ')
    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    # Logits and labels of every validation batch are collected so that the
    # metrics are computed ONCE over the whole validation set. Averaging
    # per-batch scores (as before) over-weights a short final batch, and a mean
    # of per-batch F1 scores is not the F1 score of the set.
    eval_logits = []
    eval_labels = []

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            (loss, logits) = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,
                                   labels=b_labels,return_dict=False)

        total_eval_loss += loss.item()
        eval_logits.append(logits.detach().cpu().numpy())
        eval_labels.append(b_labels.to('cpu').numpy())

    eval_logits = np.concatenate(eval_logits, axis=0)
    eval_labels = np.concatenate(eval_labels, axis=0)

    avg_val_accuracy = get_acc(eval_logits, eval_labels)
    print('  Accuracy: {0:.2f}'.format(avg_val_accuracy))

    avg_val_f1 = get_f1_score(eval_logits, eval_labels)
    print('  F1: {0:.2f}'.format(avg_val_f1))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = get_time(time.time() - t0)
    print('  Validation Loss: {0:.2f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(   {  'epoch': epoch_i + 1,
                                'Training Loss': avg_train_loss,
                                'Valid. Loss': avg_val_loss,
                                'Valid. Accur.': avg_val_accuracy,
                                'Val_F1' : avg_val_f1,
                                'Training Time': training_time,
                                'Validation Time': validation_time   } )
print('\nDone!')
print('Total training took {:} (h:mm:ss)'.format(get_time(time.time()-total_t0)))

In [None]:
# Evaluation
# Run the model in eval mode over test_dataloader and keep the logits of every
# batch as a NumPy array; gradients are disabled for the forward pass.
model.eval()
predictions = []

for batch in test_dataloader:
    # Each batch holds exactly two tensors: input ids and attention mask.
    b_ids, b_mask = (tensor.to(device) for tensor in batch)

    with torch.no_grad():
        outputs = model(b_ids, token_type_ids=None, attention_mask=b_mask)

    # No labels were passed, so the first output element holds the logits.
    predictions.append(outputs[0].detach().cpu().numpy())

# Results

In [None]:
# Show floats with 3 decimals. The fully qualified option name is used because
# the bare 'precision' pattern matches more than one option in newer pandas
# releases and then raises an OptionError.
pd.set_option('display.precision', 3)

# One row per epoch, indexed by epoch number. The table is bound to `df_stats`,
# the name the following cells read and extend; `results` is kept as an alias
# for the name used previously.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')
results = df_stats
df_stats

In [None]:
# Get the predictions
# Flatten the per-batch logit arrays into one list of rows, then take the index
# of the largest logit in each row as the predicted class.
flat_logits = []
for batch_logits in predictions:
    flat_logits.extend(batch_logits)
preds = np.argmax(flat_logits, axis=1).flatten()

In [None]:
# Accuracy on the held-out split: the fraction of predicted classes equal to
# the true labels. Uses `preds` from the previous cell (the original read an
# undefined `flat_predictions`). np.asarray accepts the labels whether they are
# a NumPy array or a CPU torch tensor.
test_labels = np.asarray(labels_testing)
df_stats['test_acc'] = float(np.mean(preds == test_labels))
# Record which optimizer / learning rate produced these numbers.
df_stats['opt,lr'] ='Adafactor,1e-3'
df_stats

In [None]:
# save the model
# model.save_pretrained("model/local-model-checkpoint_2")

# save the info
# The model export above is disabled, so nothing else creates the checkpoint
# folder; create it first so to_csv does not fail on a missing directory.
stats_path = 'model/local-model-checkpoint_5/df_stats.csv'
os.makedirs(os.path.dirname(stats_path), exist_ok=True)
df_stats.to_csv(stats_path, index=True, header=True)

In [None]:
# The code was inspired by https://www.kaggle.com/code/datafan07/disaster-tweets-nlp-eda-bert-with-transformers