# In [None]:
import random
import re
import time

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import transformers as ppb
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.utils.data import RandomSampler, random_split
from torch.utils.data.sampler import SequentialSampler
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig


def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float). It is rounded to the
            nearest whole second before formatting.

    Returns:
        The ``str()`` of the corresponding ``datetime.timedelta``, e.g.
        ``'0:01:05'`` for 65 seconds. Hours are not zero-padded.
    """
    # Local import keeps this helper self-contained: the name `datetime`
    # is not otherwise bound at module level.
    import datetime

    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))
    # Format as h:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Accuracy of the arg-max predictions measured against the labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: Array of per-class scores; the class is taken as the arg-max
            along axis 1.
        labels: Array of class ids (anything exposing ``.flatten()``); it is
            flattened before the comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

# Function to calculate the macro-averaged F1 of our predictions vs labels
def flat_fscore(preds, labels):
    """Return the macro-averaged F1 score of arg-max predictions vs labels.

    The score is computed with NumPy only: per-class F1 is
    ``2*TP / (2*TP + FP + FN)`` and the result is the unweighted mean over
    every class that occurs in either the labels or the predictions.

    Args:
        preds: Array of per-class scores; the predicted class is the arg-max
            along axis 1.
        labels: Array-like of class ids; it is flattened before scoring.

    Returns:
        The macro F1 as a Python float, or ``0.0`` when there are no samples.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()

    per_class_f1 = []
    for cls in np.union1d(labels_flat, pred_flat):
        predicted_as_cls = pred_flat == cls
        labelled_as_cls = labels_flat == cls
        true_pos = np.sum(predicted_as_cls & labelled_as_cls)
        # 2*TP + FP + FN equals (#predicted as cls) + (#labelled as cls).
        # It is never zero here because cls occurs in at least one of them.
        denom = np.sum(predicted_as_cls) + np.sum(labelled_as_cls)
        per_class_f1.append(2.0 * true_pos / denom)

    if not per_class_f1:
        return 0.0
    return float(np.mean(per_class_f1))
   

# Load the tweet dataset. NOTE: the path is absolute and machine-specific.
df = pd.read_csv("/home/joao/crisisLexT6.csv", encoding='utf-8')

print()
print('Number of sentences in the original dataset: {:,}\n'.format(df.shape[0]))

# Relabel the column titles to remove the leading white space
df = df.rename(columns={' tweet': 'sentence', ' label': 'label'})

# Encode the string labels as integers: on-topic -> 1, off-topic -> 0.
# The explicit cast avoids relying on pandas silently downcasting the result
# of `replace`. It also fails right here if any other label value is present.
df['label'] = df['label'].replace({'on-topic': 1, 'off-topic': 0}).astype('int64')

labels = df['label'].values
sentences = df['sentence']

# Drop the other columns: only the tweet text and its label are used
df = df[['sentence', 'label']]
print(df.keys())
print(df['label'].value_counts())

                        
### BERT

#### Text pre-processing
        


# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()
# Any name other than '/device:GPU:0' is treated as "no GPU" and aborts.
# NOTE(review): this hard failure sits in front of the CPU fallback below;
# confirm that requiring a TensorFlow-visible GPU is intended.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))


# Pick the PyTorch device: CUDA when available, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

def _clean_tweet_text(raw):
    """Normalise raw tweet text before tokenisation.

    Applies a fixed sequence of regular-expression substitutions, then
    lower-cases the text and strips leading/trailing whitespace.

    Args:
        raw: pandas Series of tweet strings.

    Returns:
        A new Series with the cleaned text; ``raw`` is not modified.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = [
        # URLs
        (r'http(\S)+', r''),
        # 'http ' followed by any three characters (the dots are regex
        # wildcards). NOTE(review): may have been meant as a literal
        # 'http ...' left by truncated links -- confirm.
        (r'http ...', r''),
        # Retweet markers together with the retweeted handle
        (r'(RT|rt)[ ]*@[ ]*[\S]+', r''),
        # Remaining @mentions
        (r'@[\S]+', r''),
        # An underscore and the non-space character after it, if any
        (r'_[\S]?', r''),
        # Collapse runs of two or more spaces. The quantifier must not
        # contain a space: '{2, }' would be matched as literal text.
        (r'[ ]{2,}', r' '),
        # Decode the HTML entities for &, < and >
        (r'&amp;?', r'and'),
        (r'&lt;', r'<'),
        (r'&gt;', r'>'),
        # Insert a space between words and adjacent punctuation marks
        (r'([\w\d]+)([^\w\d ]+)', r'\1 \2'),
        (r'([^\w\d ]+)([\w\d]+)', r'\1 \2'),
    ]
    cleaned = raw
    for pattern, replacement in substitutions:
        # regex=True is explicit because the default of `str.replace`
        # differs between pandas versions. With regex=False these patterns
        # would be searched for as literal text.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    # Lowercase and strip
    return cleaned.str.lower().str.strip()


# Store the cleaned text as a real column. Assigning to `df.ProcessedText`
# would only set an attribute on the DataFrame object.
df = df.assign(ProcessedText=_clean_tweet_text(df['sentence']))

sentences = df.ProcessedText

# Load the BERT tokenizer (once).
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenization: encode every sentence a single time, adding the `[CLS]` and
# `[SEP]` special tokens.
tokenized = sentences.apply(lambda text: tokenizer.encode(text, add_special_tokens=True))

# The longest encoded sentence sets the padded width (0 if there are none).
max_len = max((len(ids) for ids in tokenized), default=0)
print('Max sentence length: ', max_len)

# Padding: right-pad every id list with 0 up to max_len.
# NOTE(review): 0 is assumed to be the tokenizer's padding id -- confirm
# against tokenizer.pad_token_id.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
# Masking: 1 for real tokens, 0 for padding positions
attention_mask = np.where(padded != 0, 1, 0)



# Seed every RNG *before* the random split below. random_split draws from
# torch's global generator, so seeding afterwards would leave the
# train/validation partition different on every run.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Wrap the padded ids, attention masks and labels as tensors on `device`.
input_ids = torch.tensor(padded).to(device)
attention_mask = torch.tensor(attention_mask).to(device)
labels = torch.tensor(df["label"].values).to(device)

dataset = TensorDataset(input_ids, attention_mask, labels)

# Create a 90-10 train-validation split. Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

# Training batches are drawn in random order, validation batches sequentially.
batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

training_stats = []

epochs = 4

# Build the model first: the optimizer below is constructed from
# model.parameters(), so `model` must already exist at that point.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=True,
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Start the overall timer only once the model is ready, so the reported
# total does not include loading the pretrained weights.
total_t0 = time.time()

for epoch_i in range(epochs):  # For each epoch...
    # ---------------- Training ----------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()  # Measure how long the training epoch takes.
    total_train_loss = 0  # Reset the total loss for this epoch.
    model.train()  # Put the model into training mode.
    for step, batch in enumerate(train_dataloader):  # For each batch of training data...
        # `batch` contains three pytorch tensors:
        #   [0]: input ids   [1]: attention masks   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        if step % 40 == 0 and not step == 0:  # Progress update every 40 batches.
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        model.zero_grad()  # Clear any previously calculated gradients before the backward pass.
        # Forward pass. The result is indexed by position rather than
        # tuple-unpacked: positional indexing works for a plain tuple and for
        # the Transformers ModelOutput, whereas unpacking a ModelOutput would
        # iterate over its keys. With labels supplied, [0] is the loss.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_train_loss += loss.item()  # Accumulate the training loss over all of the batches.
        loss.backward()  # Backward pass to calculate the gradients.
        # Clip the norm of the gradients to 1.0 to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()  # Update parameters using the computed gradients.
    avg_train_loss = total_train_loss / len(train_dataloader)  # Average loss over all of the batches.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ---------------- Validation ----------------
    print("")
    print("Running Validation...")
    t0 = time.time()  # Restart the timer so the validation time excludes training.
    model.eval()  # Evaluation mode -- the dropout layers behave differently during evaluation.
    total_eval_loss = 0
    # Collect every batch's logits and labels so the metrics are computed once
    # over the whole validation set. Averaging per-batch scores would give a
    # smaller final batch the same weight as a full one.
    all_logits = []
    all_label_ids = []
    for batch in validation_dataloader:  # Unpack each validation batch from the dataloader.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():  # No compute graph needed: it is only used for backprop (training).
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs[0], outputs[1]
        total_eval_loss += loss.item()  # Accumulate the validation loss.
        all_logits.append(logits.detach().cpu().numpy())
        all_label_ids.append(b_labels.to('cpu').numpy())
    val_logits = np.concatenate(all_logits, axis=0)
    val_label_ids = np.concatenate(all_label_ids, axis=0)
    avg_val_accuracy = flat_accuracy(val_logits, val_label_ids)  # Accuracy over the full validation set.
    avg_val_fscore = flat_fscore(val_logits, val_label_ids)  # Macro F1 over the full validation set.
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    print("  F1-Score: {0:.2f}".format(avg_val_fscore))
    avg_val_loss = total_eval_loss / len(validation_dataloader)  # Average loss over all of the batches.
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    validation_time = format_time(time.time() - t0)  # How long the validation run took.
    print("  Validation took: {:}".format(validation_time))
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))