# Glove + RN

En esta etapa del proceso comenzaremos a implementar una serie de modelos para analizar su comportamiento ante la misma base de datos, de tal manera que podamos identificar las diferencias y capacidades de cada uno.

In [None]:
# Length (in tokens) used as `maxlen` when padding the token sequences.
seq_length = 33

# Number of dimensions of the GloVe embeddings.
GLOVE_DIM = 100

# Size of the groups (batches) used in training.
BATCH_SIZE = 512

# Number of training iterations (epochs).
NB_START_EPOCHS = 10

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input,Activation,Flatten,Embedding,GlobalAveragePooling1D,Dropout,LSTM,Conv1D
from tensorflow.keras.models import Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from wordcloud import WordCloud, STOPWORDS
import tensorflow as tf
import missingno as msno
from collections import defaultdict
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import json

%matplotlib inline

In [None]:
#Reference https://www.kaggle.com/parulpandey/eda-and-preprocessing-for-bert

def clean(tweet):
    """Return a lower-cased copy of *tweet* without markup and links.

    The input is converted with ``str()`` first, so non-string values
    (numbers, NaN, None) are accepted and cleaned as their string form.

    Removed, in this order:
      1. HTML-like tags (``<...>``, shortest match).
      2. Text enclosed in square brackets (``[...]``, shortest match).
      3. Hyperlinks starting with ``http://``, ``https://`` or ``www.``.

    Surrounding whitespace is left untouched, so removing a span can
    leave two consecutive spaces behind.
    """
    tweet = str(tweet).lower()

    # Raw strings keep the regex escapes (\[, \S, \.) away from Python's own
    # string-escape processing, which warns on unknown escapes.
    # Remove html tags
    tweet = re.sub(r'<.*?>', '', tweet)

    # Remove text in square brackets
    tweet = re.sub(r'\[.*?\]', '', tweet)

    # Remove hyperlinks
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)

    return tweet

In [None]:
# Load the train and test CSV files of the tweet-sentiment-extraction dataset
# from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
test = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
# Keep the sentiment column of the raw training frame under its own name.
target = train['sentiment']

In [None]:
# Work on a NaN-free copy of the raw training frame, add lower-cased/cleaned
# versions of the full tweet and of the selected span, and move the old index
# into a regular column so rows are numbered 0..n-1 again.
train_model = (
    train.copy()
    .dropna()
    .assign(
        clean_text=lambda frame: frame['text'].apply(clean),
        clean_selected_text=lambda frame: frame['selected_text'].apply(clean),
    )
    .reset_index()
)
train_model.head()

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of all rows as the validation set, stratified by
# sentiment. Features are (sentiment, clean_text, textID); the label is the
# cleaned selected text.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_model[['sentiment', 'clean_text', 'textID']],
    train_model['clean_selected_text'],
    test_size=0.2,
    random_state=42,
    stratify=train_model['sentiment'],
)

# Second split: hold out 20% of the remaining rows as the test set, again
# stratified by sentiment.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=42,
    stratify=X_train['sentiment'],
)

# Renumber every split from 0 so rows can be addressed positionally.
X_train, X_val, X_test = (
    part.reset_index(drop=True) for part in (X_train, X_val, X_test)
)
Y_train, Y_val, Y_test = (
    part.reset_index(drop=True) for part in (Y_train, Y_val, Y_test)
)

print('X_train Forma', X_train.shape, ' Y_train Forma ', Y_train.shape)
print('X_val Forma', X_val.shape, ' Y_val Forma ', Y_val.shape)
print('X_test Forma', X_test.shape, ' Y_test Forma ', Y_test.shape)

In [None]:
# Tokenizer with no character filtering (filters=''), so punctuation stays
# attached to the words; unknown words map to the 'UNK' token.
# The vocabulary is fitted on the training split only.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='UNK',filters='')
tokenizer.fit_on_texts(X_train['clean_text'].values+' '+X_train['sentiment'].values)

# Each model input is "<sentiment> <clean_text>", so position 0 carries the
# sentiment word and positions 1.. carry the words of the tweet.
X_train_clean_text = tokenizer.texts_to_sequences(X_train['sentiment'].values+' '+X_train['clean_text'].values)
X_val_clean_text = tokenizer.texts_to_sequences(X_val['sentiment'].values+' '+X_val['clean_text'].values)
X_test_clean_text = tokenizer.texts_to_sequences(X_test['sentiment'].values+' '+X_test['clean_text'].values)

# Pad AND truncate at the end. pad_sequences truncates at the front by
# default, which would drop the leading sentiment token and the first words
# of any over-long text and shift every remaining word away from the
# word-position labels, which count from the start of the tweet.
X_train_tokens = tf.keras.preprocessing.sequence.pad_sequences(X_train_clean_text,maxlen=seq_length,padding='post',truncating='post')
X_val_tokens = tf.keras.preprocessing.sequence.pad_sequences(X_val_clean_text,maxlen=seq_length,padding='post',truncating='post')
X_test_tokens = tf.keras.preprocessing.sequence.pad_sequences(X_test_clean_text,maxlen=seq_length,padding='post',truncating='post')

print('The shape of X_train_tokens ',X_train_tokens.shape)
print('The shape of X_val_tokens ',X_val_tokens.shape)
print('The shape of X_test_tokens ',X_test_tokens.shape)

Una vez tokenizados los datos, es necesario identificar el entero asignado a cada sentimiento

In [None]:
# First token id (column 0) of the first three padded training sequences.
# The sentiment word is prepended to every text before tokenizing, so this
# column normally holds the id assigned to the sentiment.
print(X_train_tokens.T[0][0:3])
# Sentiment labels of the same three rows, for comparison with the ids above.
print(X_train['sentiment'].values[0:3])

Con los posibles sentimientos identificados, es necesario crear un diccionario para poder recuperarlos o procesarlos en un futuro.

In [None]:
# Creación de indices

def get_start_end_index(X_data, Y_data, max_words=32):
    """Build one-hot start/end word-position targets for the selected text.

    For every row ``k`` the selected text ``Y_data[k]`` is located inside
    ``X_data['clean_text'][k]`` (both with whitespace normalised to single
    spaces). The index of the first and of the last word that overlaps the
    match are one-hot encoded.

    Args:
        X_data: Table with a ``'clean_text'`` column, addressable as
            ``X_data['clean_text'][k]`` for ``k`` in ``range(X_data.shape[0])``.
        Y_data: Selected texts, addressable as ``Y_data[k]``.
        max_words: Width of the one-hot vectors, i.e. the highest number of
            word positions that can be represented. Must be positive.

    Returns:
        Tuple ``(start_index, end_index)`` of int32 arrays with shape
        ``(X_data.shape[0], max_words)``. Each row contains exactly one 1.

    When the span cannot be represented, the row falls back to "whole
    window": start at position 0 and end at the last position. This applies
    when the selected text is empty, is not found in the text, or reaches a
    word at index ``max_words`` or beyond.
    """
    n_rows = X_data.shape[0]
    start_index = np.zeros((n_rows, max_words), dtype='int32')
    end_index = np.zeros((n_rows, max_words), dtype='int32')

    for k in range(n_rows):
        words = X_data['clean_text'][k].split()
        tx1 = " ".join(words)
        tx2 = " ".join(Y_data[k].split())

        # Character offset of the selected text inside the full text;
        # -1 means "not found" and must not be used as a slice bound.
        idx = tx1.find(tx2)

        # Indices of the words whose character range overlaps the match.
        vals = []
        if tx2 and idx >= 0:
            sel_end = idx + len(tx2)
            pos = 0
            for i, word in enumerate(words):
                word_end = pos + len(word)
                if pos < sel_end and word_end > idx:
                    vals.append(i)
                # Skip the single space that separates consecutive words.
                pos = word_end + 1

        # Word indices are increasing, so checking the last one is enough to
        # know that the whole span fits inside the one-hot width.
        if vals and vals[-1] < max_words:
            start_index[k, vals[0]] = 1
            end_index[k, vals[-1]] = 1
        else:
            start_index[k, 0] = 1
            end_index[k, -1] = 1
    return start_index, end_index

In [None]:
# Build the one-hot start (…_1) and end (…_2) word-position targets for the
# training, validation and test splits.
Y_tr_1,Y_tr_2=get_start_end_index(X_train,Y_train)
Y_val_1,Y_val_2=get_start_end_index(X_val,Y_val)
Y_te_1,Y_te_2=get_start_end_index(X_test,Y_test)

In [None]:
# Spot-check row 129 of the training split: cleaned text, selected text,
# padded token ids, and the one-hot start/end targets (one value per cell).
X_train['clean_text'][129]

In [None]:
Y_train[129]

In [None]:
X_train_tokens[129]

In [None]:
Y_tr_1[129]

In [None]:
Y_tr_2[129]

In [None]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} dictionary.
# Each line of the file is "<word> <v1> <v2> ...", split on whitespace.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('/kaggle/input/glove-global-vectors-for-word-representation/glove.twitter.27B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('%s Vectores de palabras encontrados.' % len(embeddings_index))

In [None]:
# Number of rows needed by the embedding table: one per tokenizer entry plus
# one extra row (pad_sequences fills with the value 0 by default).
vocab_size=len(tokenizer.word_index)+1
print(vocab_size)

In [None]:
# Embedding table: row `row` holds the GloVe vector of the token with that
# tokenizer id. Tokens without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, GLOVE_DIM))
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
import os
# Make sure the output directory for logs, checkpoints and plots exists.
# exist_ok=True makes the call idempotent and avoids the window between a
# separate existence check and the creation.
os.makedirs('./model-glove', exist_ok=True)

In [None]:
# Convolutional neural network

# Input: one padded sequence of `seq_length` token ids per sample.
input_layer=Input((seq_length,),name='input')
# Frozen embedding initialised from the GloVe matrix. The output width uses
# GLOVE_DIM (the same constant that sizes embedding_matrix) so the layer and
# its weights cannot drift apart.
embedding_layer=Embedding(vocab_size,GLOVE_DIM,weights=[embedding_matrix],input_length=seq_length,trainable=False)(input_layer)

# One 1-D convolution (6 filters, kernel size 2), dropout of 0.2, then flatten.
conv1d=Conv1D(6,2,kernel_initializer=tf.keras.initializers.glorot_uniform(seed=20),name='conv1d')(embedding_layer)
dropout=Dropout(0.2,name="drop_out")(conv1d)
flatten=Flatten(name='flatten')(dropout)

# Two softmax heads over 32 positions each: output1 for the start word and
# output2 for the end word.
# NOTE(review): 32 presumably has to match the width of the one-hot targets
# produced by get_start_end_index -- confirm if either value changes.
# NOTE(review): both heads use the same initializer seed (45), so they start
# from identical weights -- confirm this is intended.
output1=Dense(32,activation='softmax',kernel_initializer=tf.keras.initializers.glorot_uniform(seed=45),name='output1')(flatten)

output2=Dense(32,activation='softmax',kernel_initializer=tf.keras.initializers.glorot_uniform(seed=45),name='output2')(flatten)

simpleNeural=Model(inputs=[input_layer],outputs=[output1,output2])

simpleNeural.summary()

In [None]:
from tensorflow.keras.callbacks import TensorBoard
%load_ext tensorboard

# TensorBoard callback writing to ./model-glove/logs, with histograms
# computed every epoch (histogram_freq=1) and the model graph included.
log_dir='./model-glove/logs'
tensorboard_callback = TensorBoard(log_dir=log_dir,histogram_freq=1, write_graph=True)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Checkpoint callback: the file name embeds the epoch number and the
# validation loss, and a file is written only when val_loss improves
# (save_best_only=True).
filepath="./model-glove/weights-{epoch:02d}-{val_loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

In [None]:
# Compile with Adam (learning rate 0.001) and categorical cross-entropy;
# the single loss object is given for a model that has two outputs.
simpleNeural.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.CategoricalCrossentropy())

In [None]:
# Train on the padded token ids against the start/end one-hot targets,
# validating on the validation split after every epoch.
# NOTE(review): batch_size and epochs are literals here (32 and 20); the
# BATCH_SIZE and NB_START_EPOCHS constants defined at the top of the file are
# not used -- confirm which values are intended.
callback=[tensorboard_callback,checkpoint]
simpleNeural.fit(X_train_tokens,[Y_tr_1,Y_tr_2],batch_size=32,epochs=20,callbacks=callback,
           validation_data=(X_val_tokens,[Y_val_1,Y_val_2]))

In [None]:
# Render the model architecture, including layer shapes, to a PNG file.
tf.keras.utils.plot_model(simpleNeural, './model-glove/model.png',show_shapes=True)

In [None]:
# Metrica definida

def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains any word the
        union is empty and the ratio is undefined; the two (empty) word sets
        are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
# Run the model on the test tokens; the two outputs are the predicted
# start-position and end-position distributions, in that order.
st_idx,end_idx=simpleNeural.predict(X_test_tokens,batch_size=32,verbose=1)

In [None]:
def compute_jaccard(st_idx,end_idx,X,Y):
    """Score predicted start/end positions against the true selected text.

    For every sample the most probable start and end word positions are
    taken with ``argmax``, the words of the clean text from start to end
    (inclusive) are joined into the predicted span, and that span is compared
    with the true selected text using ``jaccard``.

    Args:
        st_idx: Per-sample scores over start positions (indexable by row).
        end_idx: Per-sample scores over end positions (indexable by row).
        X: Table with ``'clean_text'`` and ``'sentiment'`` columns, addressable
            as ``X[column][i]`` for ``i`` in ``range(len(st_idx))``.
        Y: True selected texts, addressable as ``Y[i]``.

    Returns:
        Tuple ``(mean_score, df)`` where ``mean_score`` is the mean Jaccard
        score over all samples and ``df`` is a DataFrame with one row per
        sample and the columns ``clean_text``, ``selected_text``,
        ``predicted``, ``jaccard`` and ``sentiment``.

    Note:
        When the predicted start lies after the predicted end, the slice is
        empty and the predicted span is the empty string.
    """
    columns=['clean_text','selected_text','predicted','jaccard','sentiment']
    rows=[]
    all_jaccard=[]
    for i,(start_scores,end_scores) in enumerate(zip(st_idx,end_idx)):
        initial=np.argmax(start_scores)
        final=np.argmax(end_scores)
        text=X['clean_text'][i]
        sent2=" ".join(text.split()[initial:final+1])
        sent1=Y[i]
        jaccard_score=jaccard(sent1,sent2)
        all_jaccard.append(jaccard_score)
        rows.append({
            'clean_text':text,
            'selected_text':sent1,
            'predicted':sent2,
            'jaccard':jaccard_score,
            'sentiment':X['sentiment'][i],
        })
    # Build the frame once from the collected rows; growing a DataFrame one
    # cell at a time reallocates it on every new row.
    df=pd.DataFrame(rows,columns=columns)
    return np.mean(np.array(all_jaccard)),df

In [None]:
# Mean Jaccard score on the test split, plus the per-sample results table.
score,df=compute_jaccard(st_idx,end_idx,X_test,Y_test)
print(score)