### another BERT embeddings visualization

In [None]:
#!pip install livelossplot
#!pip install emoji 

In [2]:
!nvidia-smi

Thu Sep 16 16:34:06 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 471.41       Driver Version: 471.41       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| 98%   47C    P2   307W / 450W |   6183MiB / 24576MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from transformers import BertForSequenceClassification,BertTokenizerFast,AdamW,logging
import torch

from livelossplot import PlotLosses

pd.options.display.max_colwidth = 1000
pd.set_option('display.expand_frame_repr', False)

import re,emoji
import imageio,glob

In [45]:
from transformers import CamembertTokenizer, CamembertModel, CamembertConfig, CamembertForSequenceClassification, CamembertForMultipleChoice


In [5]:
# manually seed RNGs for reproducibility of your results
torch.manual_seed(1)
print()




In [6]:
!test -d /tmp/tweeteval || git clone https://github.com/cardiffnlp/tweeteval /tmp/tweeteval

'test' n�est pas reconnu en tant que commande interne
ou externe, un programme ex�cutable ou un fichier de commandes.
Cloning into '/tmp/tweeteval'...


In [7]:
def load_df(text_path,label_path):
    with open(text_path,'rt') as fi:
        texts = fi.read().strip().split('\n')
    text_dfs = pd.Series(data=texts,name='text',dtype='str')
    labels_dfs = pd.read_csv(label_path,names=['label'],index_col=False).label
    ret_df = pd.concat([text_dfs,labels_dfs],axis=1)
    return ret_df

train_df = load_df('/tmp/tweeteval/datasets/hate/train_text.txt','/tmp/tweeteval/datasets/hate/train_labels.txt').head(1500)
val_df = load_df('/tmp/tweeteval/datasets/hate/val_text.txt','/tmp/tweeteval/datasets/hate/val_labels.txt').head(1500)
test_df = load_df('/tmp/tweeteval/datasets/hate/test_text.txt','/tmp/tweeteval/datasets/hate/test_labels.txt').head(1500)

In [8]:
train_df.head()

Unnamed: 0,text,label
0,@user nice new signage. Are you not concerned by Beatlemania -style hysterical crowds crongregating on you…,0
1,A woman who you fucked multiple times saying yo dick small is a compliment you know u hit that spot 😎,1
2,@user @user real talk do you have eyes or were they gouged out by a rapefugee?,1
3,your girlfriend lookin at me like a groupie in this bitch!,1
4,Hysterical woman like @user,0


In [10]:
def encode_urls(row):
    row.text = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))","HTTPURL", row.text)
    return row

def encode_mentions_hashtags(row):
    row.text = row.text.replace('@',' @')
    row.text = re.sub(r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)","@USER", row.text)
    row.text = row.text.replace('#',' ')
    return row

def encode_emojis(row):
    row.text = emoji.demojize(row.text)
    return row

def remove_extra_spaces(row):
    row.text = ' '.join(row.text.split())
    return row

def lower_text(row):
    row.text = row.text.lower()
    return row

def preprocess_data_df(df):
    df = df.apply(encode_urls,axis=1)
    df = df.apply(encode_mentions_hashtags,axis=1)
    df = df.apply(encode_emojis,axis=1)
    df = df.apply(remove_extra_spaces,axis=1)
    df = df.apply(lower_text,axis=1)
    return df

In [11]:
train_df = preprocess_data_df(train_df)
val_df = preprocess_data_df(val_df)
test_df = preprocess_data_df(test_df)

In [12]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
def get_bert_encoded_data_in_batches(df,batch_size = 0,max_seq_length = 50):
    global tokenizer
    data = [(row.text,row.label,) for _,row in df.iterrows()]
    sampler = torch.utils.data.sampler.SequentialSampler(data)
    batch_sampler = torch.utils.data.BatchSampler(sampler,batch_size=batch_size if batch_size > 0 else len(data), drop_last=False)
    for batch in batch_sampler:
        encoded_batch_data = tokenizer.batch_encode_plus([data[i][0] for i in batch],max_length = max_seq_length,pad_to_max_length=True,truncation=True)
        seq = torch.tensor(encoded_batch_data['input_ids'])
        mask = torch.tensor(encoded_batch_data['attention_mask'])
        yield (seq,mask),torch.LongTensor([data[i][1] for i in batch])


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [29]:
ft_cmb_model = CamembertModel.from_pretrained("model/model_out", output_attentions=True,  output_hidden_states=True)
ft_cmb_tokenizer = CamembertTokenizer.from_pretrained("model/model_out")

In [47]:
twt_cls_model5 =  CamembertForSequenceClassification.from_pretrained("twt_cls_model5/model_out", output_attentions=True,  output_hidden_states=True)

In [84]:
sentence_b = "les pontes commencent d'être déposées sur maïs grain en zones précoces et le seuil d'intervention sera atteint prochainement."
sentence_a = "En raison des fortes températures de ces derniers jours, le vol s'est nettement intensifié et sera très groupé."


In [85]:
inputs = ft_cmb_tokenizer.encode_plus(sentence_a, None, return_tensors='pt', add_special_tokens=True)


In [86]:

input_ids = inputs['input_ids']
cls_embeddings, cls_hidden_states, cls_all_layer_embeddings = twt_cls_model5(input_ids)

input_id_list = input_ids[0].tolist() # Batch index 0
tokens = ft_cmb_tokenizer.convert_ids_to_tokens(input_id_list)
ft_all_layer_embeddings = ft_cmb_model(input_ids)[-1]

In [88]:
len(tokens)

25

In [89]:
#Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads
#Tuple of torch.FloatTensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length).
cls_all_layer_embeddings[11].shape

torch.Size([1, 12, 25, 25])

In [97]:
#logits (torch.FloatTensor of shape (batch_size, config.num_labels)) – Classification (or regression if config.num_labels==1) scores (before SoftMax).
cls_embeddings.shape

torch.Size([1, 1])

In [95]:
#Tuple of torch.FloatTensor (one for the output of the embeddings + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size).
#Hidden-states of the model at the output of each layer plus the initial embedding outputs.
cls_hidden_states[11].shape

torch.Size([1, 25, 768])

In [82]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [14]:
EPOCHS = 5 #@param {type:"slider", min:0, max:10, step:1}
LEARNING_RATE = 0.00005 #@param [1e-5,5e-5,1e-4] {type:"raw"}
BATCH_SIZE = 16 #@param [8,16,32,64] {type:"raw"}
MAX_SEQ_LEN = 50   #@param [50,100,512] {type:"raw"}

In [15]:
dim_reducer = TSNE(n_components=2)
#dim_reducer = PCA(n_components=2)

def visualize_layerwise_embeddings(hidden_states,masks,ys,title,layers_to_visualize=[0,1,2,3,8,9,10,11]):
    global dim_reducer
    !mkdir -p /tmp/plots/{title}
    num_layers = len(layers_to_visualize)
    fig = plt.figure(figsize=(24,(num_layers/4)*6)) #each subplot of size 6x6
    ax = [fig.add_subplot(num_layers/4,4,i+1) for i in range(num_layers)]
    ys = ys.numpy().reshape(-1)
    for i,layer_i in enumerate(layers_to_visualize):#range(hidden_states):
        layer_hidden_states = hidden_states[layer_i]
        averaged_layer_hidden_states = torch.div(layer_hidden_states.sum(dim=1),masks.sum(dim=1,keepdim=True))
        layer_dim_reduced_vectors = dim_reducer.fit_transform(averaged_layer_hidden_states.numpy())
        df = pd.DataFrame.from_dict({'x':layer_dim_reduced_vectors[:,0],'y':layer_dim_reduced_vectors[:,1],'label':ys})
        df.label = df.label.astype(int)
        sns.scatterplot(data=df,x='x',y='y',hue='label',ax=ax[i])
        ax[i].set_title(f"layer {layer_i+1}")
    plt.savefig(f'plots/{title}',format='png',pad_inches=0)
    print()

In [None]:
visualize_layerwise_embeddings(cls_hidden_states, cls_all_layer_embeddings)

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(device)
print(model)

In [None]:
%%time
loss_function = torch.nn.NLLLoss()
optimizer = AdamW(lr=LEARNING_RATE,params=model.parameters())
liveloss = PlotLosses()
for epoch in range(EPOCHS+1):

    model.train(False)  #toggle model in eval mode
    with torch.no_grad():
        train_correct_preds,train_total_preds,train_total_loss = 0,0,0.0
        train_masks,train_ys = torch.zeros(0,MAX_SEQ_LEN),torch.zeros(0,1)
        train_hidden_states = None
        for x,y in get_bert_encoded_data_in_batches(train_df,BATCH_SIZE,MAX_SEQ_LEN):
            sent_ids,masks = x
            sent_ids = sent_ids.to(device)
            masks = masks.to(device)
            y = y.to(device)
            model_out = model(sent_ids,masks,output_hidden_states=True,return_dict=True)
            log_probs = torch.nn.functional.log_softmax(model_out.logits, dim=1)
            loss = loss_function(log_probs, y)
            hidden_states = model_out.hidden_states[1:]
            
            train_total_loss += (loss.detach() * y.shape[0])
            train_preds = torch.argmax(log_probs,dim=1)
            train_correct_preds += (train_preds == y).float().sum()
            train_total_preds += train_preds.shape[0]

            train_masks = torch.cat([train_masks,masks.cpu()])
            train_ys = torch.cat([train_ys,y.cpu().view(-1,1)])

            if type(train_hidden_states) == type(None):
                train_hidden_states = tuple(layer_hidden_states.cpu() for layer_hidden_states in hidden_states)
            else:
                train_hidden_states = tuple(torch.cat([layer_hidden_state_all,layer_hidden_state_batch.cpu()])for layer_hidden_state_all,layer_hidden_state_batch in zip(train_hidden_states,hidden_states))
        
        visualize_layerwise_embeddings(train_hidden_states,train_masks,train_ys,epoch,'train_data')

        train_acc = train_correct_preds.float() / train_total_preds
        train_loss = train_total_loss / train_total_preds
        logs['loss'] = train_loss.item()
        logs['acc'] = train_acc.item()
        #
        val_correct_preds,val_total_preds,val_total_loss = 0,0,0.0
        val_masks,val_ys = torch.zeros(0,MAX_SEQ_LEN),torch.zeros(0,1)
        val_hidden_states = None
        for x,y in get_bert_encoded_data_in_batches(val_df,BATCH_SIZE,MAX_SEQ_LEN):
            sent_ids,masks = x
            sent_ids = sent_ids.to(device)
            masks = masks.to(device)
            y = y.to(device)
            model_out = model(sent_ids,masks,output_hidden_states=True,return_dict=True)
            log_probs = torch.nn.functional.log_softmax(model_out.logits, dim=1)
            loss = loss_function(log_probs, y)
            hidden_states = model_out.hidden_states[1:]
            #logging logic
            val_total_loss += (loss.detach() * y.shape[0])
            val_preds = torch.argmax(log_probs,dim=1)
            val_correct_preds += (val_preds == y).float().sum()
            val_total_preds += val_preds.shape[0]

            val_masks = torch.cat([val_masks,masks.cpu()])
            val_ys = torch.cat([val_ys,y.cpu().view(-1,1)])

            if type(val_hidden_states) == type(None):
                val_hidden_states = tuple(layer_hidden_states.cpu() for layer_hidden_states in hidden_states)
            else:
                val_hidden_states = tuple(torch.cat([layer_hidden_state_all,layer_hidden_state_batch.cpu()])for layer_hidden_state_all,layer_hidden_state_batch in zip(val_hidden_states,hidden_states))
        
        visualize_layerwise_embeddings(val_hidden_states,val_masks,val_ys,epoch,'val_data')
        val_acc = val_correct_preds.float() / val_total_preds
        val_loss = val_total_loss / val_total_preds
        logs['val_loss'] = val_loss.item()
        logs['val_acc'] = val_acc.item()
    if epoch:   #no need to learning-curve plot on 0th epoch
        liveloss.update(logs)
        liveloss.send()