# RoBERTa - Modelo HuggingFace preentrenado TF 

Tratando de mejorar la primera implementación de BERT con DistilBERT, usaremos el modelo entrenado de RoBERTa y la librería transformers. El modelo preentrenado de RoBERTa elegido será la implementación de HuggingFace preentrenada en TensorFlow.

In [None]:
import pandas as pd
import numpy as np
import os
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS 
import re 
import string
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input,Flatten,Embedding,Lambda,Dropout,LSTM,Conv1D,Concatenate,Add
from tensorflow.keras.models import Model
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import keras.backend as K

In [None]:
!pip install transformers

In [None]:
from transformers import  RobertaConfig, TFRobertaModel
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel

In [None]:
# Load the competition training set into `train`; the trailing expression
# previews the first rows when run as a notebook cell.
train=pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
train.head()

In [None]:
#Referencia https://www.kaggle.com/parulpandey/eda-and-preprocessing-for-bert

def clean(tweet):
    """Return a lower-cased copy of *tweet* with markup and links removed.

    The value is first converted with ``str()``, so non-string inputs
    (e.g. ``None`` or numbers) are accepted and cleaned as their string form.

    Removed, in this order:
      1. anything between ``<`` and ``>`` (HTML-style tags),
      2. anything between ``[`` and ``]``,
      3. ``http://`` / ``https://`` URLs and ``www.`` links.

    Surrounding whitespace is left untouched, so removed spans may leave
    double spaces behind.
    """
    tweet = str(tweet).lower()

    # Raw strings keep the backslashes for the regex engine instead of
    # relying on Python passing unknown escapes such as "\[" through.
    tweet = re.sub(r'<.*?>', '', tweet)                  # HTML tags
    tweet = re.sub(r'\[.*?\]', '', tweet)                # text in square brackets
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)  # hyperlinks

    return tweet

In [None]:
# Drop incomplete rows, then trim leading/trailing whitespace from both
# text columns so later substring searches line up.
train.dropna(inplace=True)
for column in ("text", "selected_text"):
    train[column] = train[column].apply(str.strip)

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of all rows for validation, stratified by sentiment.
X_train, X_val, Y_train, Y_val = train_test_split(
    train,
    train['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=train['sentiment'],
)
# Second split: 20% of the remaining rows become the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=42,
    stratify=X_train['sentiment'],
)

# Renumber every split from 0 so rows can be addressed by position later on.
for frame in (X_train, X_val, X_test):
    frame.reset_index(inplace=True, drop=True)

Y_train, Y_val, Y_test = (
    labels.reset_index(drop=True) for labels in (Y_train, Y_val, Y_test)
)

print('X_train shape', X_train.shape, ' Y_train shape ', Y_train.shape)
print('X_val shape', X_val.shape, ' Y_val shape ', Y_val.shape)
print('X_test shape', X_test.shape, ' Y_test shape ', Y_test.shape)

In [None]:

# General settings: fixed sequence length and a byte-level BPE tokenizer
# built from the RoBERTa vocabulary/merges files.
MAX_LEN = 128
tokenizer = Tokenizer(
    BPE.from_file(
        '../input/tf-roberta/vocab-roberta-base.json',
        '../input/tf-roberta/merges-roberta-base.txt',
    )
)
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

In [None]:
# Encode the three sentiment words so their vocabulary ids can be inspected.
# resPos previously encoded an unrelated sentence even though it is printed
# as the id representation of "positive"; it now encodes the word itself.
resPos = tokenizer.encode('positive')
resNeg = tokenizer.encode('negative')
resNeu = tokenizer.encode('neutral')

In [None]:
# Show the token ids produced for each encoded sample.
for label, encoding in (('positive', resPos), ('negative', resNeg), ('neutral', resNeu)):
    print(f'Representación ID {label}: {encoding.ids}')

In [None]:
# Round-trip check: decode the ids in resPos back to text (displayed as the cell output).
tokenizer.decode(resPos.ids)

In [None]:
# Token id used for each sentiment label, taken from the RoBERTa vocabulary.
sentiment_id = {
    'positive': 1313,
    'negative': 2430,
    'neutral': 7974,
}

In [None]:

#Referencia: https://www.kaggle.com/cdeotte/tensorflow-roberta-0-705

def createInputData(data,tokenizer):
    """Build model inputs and start/end span labels for every row of *data*.

    Parameters
    ----------
    data : DataFrame with ``text``, ``selected_text`` and ``sentiment``
        columns. Rows are read by position, so any index is accepted.
    tokenizer : object whose ``encode(str)`` result exposes ``.ids`` and
        whose ``decode(list_of_ids)`` returns the matching text.

    Returns
    -------
    tuple of five int32 arrays, each of shape ``(len(data), MAX_LEN)``:
    ``(input_ids, attention_mask, token_type_ids, start_tokens, end_tokens)``.

    * ``input_ids`` holds ``[0] + token ids + [2, 2] + [sentiment id] + [2]``
      and keeps the fill value 1 elsewhere (presumably the <s>, </s> and
      <pad> ids of the RoBERTa vocabulary -- confirm against the vocab file).
    * ``attention_mask`` is 1 over that span and 0 elsewhere.
    * ``token_type_ids`` is all zeros.
    * ``start_tokens`` / ``end_tokens`` are one-hot at the first / last token
      that overlaps ``selected_text``, shifted by one for the leading 0 token.
      They stay all-zero when the selected text cannot be located.

    Raises
    ------
    ValueError
        If a row needs more than ``MAX_LEN`` positions.
    KeyError
        If a sentiment value is missing from ``sentiment_id``.
    """
    n_rows = data.shape[0]
    input_ids = np.ones((n_rows,MAX_LEN),dtype='int32')
    attention_mask = np.zeros((n_rows,MAX_LEN),dtype='int32')
    token_type_ids = np.zeros((n_rows,MAX_LEN),dtype='int32')
    start_tokens = np.zeros((n_rows,MAX_LEN),dtype='int32')
    end_tokens = np.zeros((n_rows,MAX_LEN),dtype='int32')

    # Positional access: does not depend on the frame having a 0..n-1 index.
    rows = zip(data['text'].tolist(),
               data['selected_text'].tolist(),
               data['sentiment'].tolist())

    for k, (text, selected, sentiment) in enumerate(rows):
        # Normalise whitespace; the leading space makes the first word
        # tokenise like every other word.
        text1 = " " + " ".join(text.split())
        text2 = " ".join(selected.split())

        # Encode the full text and lay out the model input.
        enc = tokenizer.encode(text1)
        used = len(enc.ids) + 5  # five extra positions for the special/sentiment tokens
        if used > MAX_LEN:
            raise ValueError(
                f"Row {k} needs {used} positions but MAX_LEN is {MAX_LEN}"
            )
        s_tok = sentiment_id[sentiment]
        input_ids[k,:used] = [0] + enc.ids + [2,2] + [s_tok] + [2]
        attention_mask[k,:used] = 1

        # Locate the selected span; without a match there is nothing to label
        # (a -1 index would otherwise mark unrelated characters).
        idx = text1.find(text2)
        if idx < 0 or not text2:
            continue

        # Mark the characters covered by the selected text, plus the space
        # right before it so the token carrying that space is included.
        chars = np.zeros(len(text1))
        chars[idx:idx+len(text2)] = 1
        if idx > 0 and text1[idx-1] == ' ':
            chars[idx-1] = 1

        # Character offsets of each token, rebuilt by decoding ids one by one.
        token_offsets = []
        pos = 0
        for token_id in enc.ids:
            piece = tokenizer.decode([token_id])
            token_offsets.append((pos, pos+len(piece)))
            pos += len(piece)

        # Tokens that overlap at least one marked character.
        target_idx = [i for i, (o1, o2) in enumerate(token_offsets)
                      if chars[o1:o2].sum() > 0]

        # +1 accounts for the leading 0 token in input_ids.
        if target_idx:
            start_tokens[k,target_idx[0]+1] = 1
            end_tokens[k,target_idx[-1]+1] = 1

    return (input_ids,attention_mask,token_type_ids,start_tokens,end_tokens)

In [None]:
# Convert the training, validation and test splits into RoBERTa inputs
# (ids, attention mask, token type ids) plus start/end span labels.
(X_tr1, X_tr2, X_tr3, Y_tr1, Y_tr2) = createInputData(X_train, tokenizer)
(X_val1, X_val2, X_val3, Y_val1, Y_val2) = createInputData(X_val, tokenizer)
(X_te1, X_te2, X_te3, Y_te1, Y_te2) = createInputData(X_test, tokenizer)

In [None]:
# Arquitectura de red neuronal con RoBERTa
def build_model():
    """Build the RoBERTa-based span-extraction network.

    Three int32 inputs of length ``MAX_LEN`` (``ids``, ``att_mask``,
    ``type_ids``) are fed to the pretrained ``roberta-base`` model. Two
    identical heads are applied to the first element of its output
    (presumably the per-token hidden states -- confirm against the
    transformers documentation): dropout, a 1-filter kernel-size-1
    convolution, flatten and softmax.

    Returns
    -------
    An uncompiled Keras ``Model`` with inputs ``[ids, att_mask, type_ids]``
    and outputs ``[out_1, out_2]``; the training cell fits them against the
    start and end labels respectively.
    """
    # Input shapes must be tuples: "(MAX_LEN)" is just an int, "(MAX_LEN,)"
    # is the one-element tuple Keras expects.
    ids = Input((MAX_LEN,), name='ids', dtype='int32')
    att_mask = Input((MAX_LEN,), name='att_mask', dtype='int32')
    type_ids = Input((MAX_LEN,), name='type_ids', dtype='int32')

    roberta_conf = RobertaConfig.from_pretrained('roberta-base')
    roberta_model = TFRobertaModel.from_pretrained('roberta-base', config=roberta_conf)

    bert_output = roberta_model([ids, att_mask, type_ids])

    # Head 1: one score per position, normalised with softmax.
    dropout1 = Dropout(0.1, name='dropout1')(bert_output[0])
    conv1d_1 = Conv1D(1, 1, kernel_initializer=tf.keras.initializers.glorot_uniform(seed=20), name='conv1d_1')(dropout1)
    flatten_1 = Flatten(name='flatten_1')(conv1d_1)
    out_1 = tf.keras.layers.Activation('softmax', name='activation1')(flatten_1)

    # Head 2: same structure with its own layers.
    dropout2 = Dropout(0.1, name='dropout2')(bert_output[0])
    conv1d_2 = Conv1D(1, 1, kernel_initializer=tf.keras.initializers.glorot_uniform(seed=20), name='conv1d_2')(dropout2)
    flatten_2 = Flatten(name='flatten_2')(conv1d_2)
    out_2 = tf.keras.layers.Activation('softmax', name='activation2')(flatten_2)

    model1 = Model(inputs=[ids, att_mask, type_ids], outputs=[out_1, out_2])

    return model1
  
# Instantiate the network once; the following cells operate on `model`.
model=build_model()

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Save a diagram of the model (with tensor shapes) to ./ModeloRobertaRN.png.
tf.keras.utils.plot_model(model, './ModeloRobertaRN.png',show_shapes=True)

In [None]:
import os

# Make sure the output directory for checkpoints and logs exists.
# exist_ok=True turns this into a no-op when it is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs('./model-roberta', exist_ok=True)

In [None]:
# TensorBoard settings for the RoBERTa run
from tensorflow.keras.callbacks import TensorBoard
# IPython magic: load the TensorBoard notebook extension.
%load_ext tensorboard
# Shell escape: remove the ./logs/ directory.
# NOTE(review): ./logs/ is wiped here, but the callback below writes to
# ./model-roberta -- confirm which directory is intended.
!rm -rf ./logs/ 

# The callback logs into the same directory that holds the checkpoints.
log_dir='./model-roberta'
tensorboard_callback = TensorBoard(log_dir=log_dir,histogram_freq=1, write_graph=True)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Checked once per epoch: save only the weights, and only when the
# monitored validation loss improves.
checkpoint = ModelCheckpoint(
    './model-roberta/roberta.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
    save_freq='epoch',
)

In [None]:
# Compile the model: Adam optimizer, categorical cross-entropy on both outputs.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
)

In [None]:
# Train the model, logging to TensorBoard and checkpointing the best weights.
callback = [tensorboard_callback, checkpoint]
model.fit(
    [X_tr1, X_tr2, X_tr3],
    [Y_tr1, Y_tr2],
    validation_data=([X_val1, X_val2, X_val3], [Y_val1, Y_val2]),
    batch_size=32,
    epochs=4,
    callbacks=callback,
)

In [None]:
# Run the model on the test-split inputs; its two outputs are unpacked
# into `start` and `end`.
start,end=model.predict([X_te1,X_te2,X_te3])

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    appearing in either string.

    Returns a float in [0, 1]. When neither string contains any word the
    two word sets are identical (both empty), so 1.0 is returned instead of
    dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())

    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only.
        return 1.0
    return len(a & b) / union_size

In [None]:

def find_selected_text(data,tokenizer,start,end):
    """Decode the predicted span for every row of *data*.

    Parameters
    ----------
    data : DataFrame with a ``text`` column. Rows are read by position, so
        any index is accepted.
    tokenizer : object whose ``encode(str)`` result exposes ``.ids`` and
        whose ``decode(list_of_ids)`` returns text.
    start, end : per-row score arrays; the arg-max of row *i* is taken as
        the predicted start / end position for tweet *i*.

    Returns
    -------
    list of str, one entry per row. When the predicted start lies after the
    predicted end the original tweet text is returned unchanged; otherwise
    the tokens between the two positions are decoded.
    """
    selected_text_list = []
    for row, text in enumerate(data['text'].tolist()):

        # Most likely start / end positions for this row.
        start_idx = int(np.argmax(start[row]))
        end_idx = int(np.argmax(end[row]))

        if start_idx > end_idx:
            # Inconsistent prediction: fall back to the whole tweet.
            predicted_text = text
        else:
            text1 = " " + " ".join(text.split())
            token_ids = tokenizer.encode(text1).ids
            # Positions are shifted by one relative to the token list, hence
            # the -1. Clamp at 0 so a predicted start of 0 does not wrap
            # around to the end of the list.
            first = max(start_idx - 1, 0)
            predicted_text = tokenizer.decode(token_ids[first:end_idx])

        selected_text_list.append(predicted_text)

    return selected_text_list
    


In [None]:
# Decode the predicted span for every test tweet and store it next to the
# labelled span.
selected_text = find_selected_text(X_test, tokenizer, start, end)
X_test['predicted_text'] = selected_text

# Score all rows with a single column assignment instead of writing cell by
# cell inside an iterrows() loop: faster, and independent of the index labels.
X_test['jaccard'] = [
    jaccard(true_span, predicted_span)
    for true_span, predicted_span in zip(X_test['selected_text'], X_test['predicted_text'])
]

X_test.head(10)

In [None]:
# Mean Jaccard score for each sentiment class of the test split.

pos_average = X_test.loc[X_test['sentiment'] == 'positive', 'jaccard'].mean()
print('Promedio del indice de jaccard para los sentimientos positivos  ', pos_average)

neg_average = X_test.loc[X_test['sentiment'] == 'negative', 'jaccard'].mean()
print('Promedio del indice de jaccard para los sentimientos negativos  ', neg_average)

neu_average = X_test.loc[X_test['sentiment'] == 'neutral', 'jaccard'].mean()
print('Promedio del indice de jaccard para los sentimientos neutrales  ', neu_average)

In [None]:
# Overall performance of the RoBERTa model: mean Jaccard score over the whole test split.
print(np.mean(X_test['jaccard']))