In [None]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from keras.layers import Dense, Flatten, Conv1D, Dropout, Input
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from transformers import *
import tokenizers

In [None]:
# Input locations (Kaggle dataset mounts) and the fixed sequence width
data_path = '../input/tweet-sentiment-extraction'  # train.csv / test.csv
path = '../input/roberta-base'                     # tokenizer vocab + merges
path2 = '../input/tf-roberta'                      # model config + pretrained weights
max_len = 128                                      # width of every encoded row

# Tokenizer assets
vocab_file = f'{path}/vocab.json'
merges_file = f'{path}/merges.txt'

# Model assets
config_file = f'{path2}/config-roberta-base.json'
pretrained_file = f'{path2}/pretrained-roberta-base.h5'

In [None]:
# Byte-level BPE tokenizer built from the RoBERTa vocab/merges files
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file,
    merges_file,
    lowercase=True,
    add_prefix_space=True,
)

# Map each sentiment label to the id of the first token its word encodes to;
# that id is later spliced into every model input row.
sentiment_id = {
    label: tokenizer.encode(label).ids[0]
    for label in ('positive', 'negative', 'neutral')
}

In [None]:
# Read the training set and discard every row that has a missing value
train_data = pd.read_csv(data_path + '/train.csv').dropna(axis=0)
train_data.head()

In [None]:
# Read the test set as-is (no rows are dropped here)
test_data = pd.read_csv(filepath_or_buffer=data_path + '/test.csv')
test_data.head()

In [None]:
#Tokenizing the training data (input data formatting for training)
#
# Each row of input_ids is laid out as
#     [0] + tweet tokens + [2, 2] + [sentiment token] + [2]
# and every remaining position keeps its initial value of 1 with attention_mask 0.
# (0 / 1 / 2 are presumably RoBERTa's <s> / <pad> / </s> ids -- confirm against the vocab.)
# start_mask / end_mask are one-hot rows marking the first / last token that
# overlaps selected_text, shifted by +1 for the leading [0] token.

# Rows were dropped earlier, so the index has gaps; .loc[i, ...] below needs 0..n-1.
# drop=True avoids inserting the old index as a new column, which also keeps
# this cell safe to re-run.
train_data = train_data.reset_index(drop=True)
tot_tw = train_data.shape[0]

# 5 positions per row are reserved for the special/sentiment tokens
max_tokens = max_len - 5

input_ids = np.ones((tot_tw, max_len), dtype='int32')
attention_mask = np.zeros((tot_tw, max_len), dtype='int32')
token_type_ids = np.zeros((tot_tw, max_len), dtype='int32')
start_mask = np.zeros((tot_tw, max_len), dtype='int32')
end_mask = np.zeros((tot_tw, max_len), dtype='int32')

for i in range(tot_tw):
    # Collapse whitespace runs; the tweet gets a single leading space
    set1 = " "+" ".join(train_data.loc[i,'text'].split())
    set2 = " ".join(train_data.loc[i,'selected_text'].split())

    # Character-level 0/1 mask of where selected_text sits inside the tweet
    set2_loc = np.zeros((len(set1)))
    idx = set1.find(set2)
    if idx >= 0:  # -1 (not found) would otherwise turn into wrap-around slices
        set2_loc[idx:idx+len(set2)]=1
        # Also mark the space right before the selection, so a token whose
        # offsets start on that space still counts as selected.
        # idx > 0 stops set1[idx-1] from wrapping to the last character.
        if idx > 0 and set1[idx-1]==" ":
            set2_loc[idx-1]=1

    enc_set1 = tokenizer.encode(set1)
    # Truncate so the row assignment below can never exceed max_len
    tok_ids = enc_set1.ids[:max_tokens]
    tok_offsets = enc_set1.offsets[:max_tokens]

    # Indices of the (kept) tokens whose character span overlaps the selection
    selected_text_token_idx = [
        k for k,(a,b) in enumerate(tok_offsets) if np.sum(set2_loc[a:b]) > 0
    ]

    senti_token = sentiment_id[train_data.loc[i,'sentiment']]
    n_used = len(tok_ids)+5
    input_ids[i,:n_used] = [0]+tok_ids+[2,2]+[senti_token]+[2]
    attention_mask[i,:n_used]=1

    # Rows with no overlapping token keep all-zero start/end masks
    if len(selected_text_token_idx) > 0:
        start_mask[i,selected_text_token_idx[0]+1]=1
        end_mask[i, selected_text_token_idx[-1]+1]=1

In [None]:
#Tokenizing the test data with the same row layout as the training data:
#     [0] + tweet tokens + [2, 2] + [sentiment token] + [2]
# Unused positions keep input id 1 and attention_mask 0.

tot_test_tw = test_data.shape[0]

# 5 positions per row are reserved for the special/sentiment tokens
max_tokens = max_len - 5

input_ids_t = np.ones((tot_test_tw,max_len), dtype='int32')
attention_mask_t = np.zeros((tot_test_tw,max_len), dtype='int32')
token_type_ids_t = np.zeros((tot_test_tw,max_len), dtype='int32')

for i in range(tot_test_tw):
    # Collapse whitespace runs; the tweet gets a single leading space
    set1 = " "+" ".join(test_data.loc[i,'text'].split())
    enc_set1 = tokenizer.encode(set1)
    # Truncate so the row assignment below can never exceed max_len
    tok_ids = enc_set1.ids[:max_tokens]

    s_token = sentiment_id[test_data.loc[i,'sentiment']]
    n_used = len(tok_ids)+5
    input_ids_t[i,:n_used]=[0]+tok_ids+[2,2]+[s_token]+[2]
    attention_mask_t[i,:n_used]=1

In [None]:
# Loss function
# Categorical cross-entropy with label smoothing
def custom_loss(y_true, y_pred):
    """Return the mean categorical cross-entropy with label smoothing of 0.20.

    y_pred is treated as probabilities, not logits (from_logits=False).
    The per-sample losses are averaged into a single scalar.
    """
    per_sample = tf.keras.losses.categorical_crossentropy(
        y_true,
        y_pred,
        from_logits=False,
        label_smoothing=0.20,
    )
    return tf.reduce_mean(per_sample)

In [None]:
#Function for building the model
def build_model():
    """Build and compile the RoBERTa model with two softmax output heads.

    The model takes three int32 inputs of length max_len (token ids,
    attention mask, token type ids), runs them through the pretrained
    RoBERTa model and feeds the first RoBERTa output into two separate heads
    of identical structure (Dropout -> Conv1D -> LeakyReLU -> Conv1D ->
    Dense(1) -> Flatten -> softmax).

    Returns:
        A tf.keras Model with inputs [ids, att, tok] and outputs [x1, x2],
        compiled with custom_loss and Adam (learning rate 3e-5).
    """

    def _softmax_head(hidden):
        # One output head; layers are created in the same order on every call
        h = tf.keras.layers.Dropout(0.05)(hidden)
        h = tf.keras.layers.Conv1D(128, 2, padding='same')(h)  # 128 filters, kernel size 2
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Conv1D(64, 2, padding='same')(h)
        h = tf.keras.layers.Dense(1)(h)
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    ids = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    att = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    tok = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    # Pretrained RoBERTa, configured from the local config file
    roberta_config = RobertaConfig.from_pretrained(config_file)
    roberta_model = TFRobertaModel.from_pretrained(pretrained_file, config=roberta_config)
    x = roberta_model(ids, attention_mask=att, token_type_ids=tok)

    # Both heads read the first RoBERTa output
    # (presumably the per-token hidden states -- confirm against transformers docs)
    x1 = _softmax_head(x[0])
    x2 = _softmax_head(x[0])

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1, x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss=custom_loss, optimizer=optimizer)

    return model

In [None]:
#Predicting on the test set with one model per stratified fold
# Each fold rebuilds the model, loads that fold's saved weights and predicts;
# the per-fold predictions are averaged into pred_start / pred_end.

pred_start= np.zeros((input_ids_t.shape[0],max_len))
pred_end= np.zeros((input_ids_t.shape[0],max_len))

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=777)
# Average over the actual number of folds rather than a repeated literal
n_folds = skf.n_splits

# idxT / idxV are only needed by the (disabled) training code below
for i,(idxT,idxV) in enumerate(skf.split(input_ids,train_data.sentiment.values)):
    print('--'*20)
    print('-- FOLD %i --'%(i+1))
    print('--'*20)
    K.clear_session()
    model = build_model()

    # Disabled training code, kept for reference (presumably how the weights
    # loaded below were produced). Target names adapted to this notebook's
    # start_mask / end_mask arrays.
    #
    # sv = tf.keras.callbacks.ModelCheckpoint(
    #     'roberta-%i.h5'%(i), monitor='val_loss', verbose=1, save_best_only=True,
    #     save_weights_only=True, mode='auto', save_freq='epoch')
    #
    # model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]],
    #     [start_mask[idxT,], end_mask[idxT,]],
    #     epochs=3, batch_size=32, verbose=1, callbacks=[sv],
    #     validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],
    #     [start_mask[idxV,], end_mask[idxV,]]))

    model.load_weights('../input/tweet-model-data/roberta-%i.h5'%(i))
    pred = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=1)
    pred_start = pred_start + (pred[0]/n_folds)
    pred_end = pred_end + (pred[1]/n_folds)

In [None]:
# Turn the averaged start/end predictions into a text span per test tweet
# and write the submission file.
selected_texts = []  # not named `all`, which would shadow the builtin
for k in range(input_ids_t.shape[0]):
    a = np.argmax(pred_start[k,])
    b = np.argmax(pred_end[k,])
    if a>b:
        # Start predicted after end: fall back to the whole tweet
        st = test_data.loc[k,'text']
    else:
        text1 = " "+" ".join(test_data.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        # Model position p maps to enc.ids[p-1] because every input row starts
        # with an extra [0] token. Clamp at 0 so a == 0 does not become a
        # slice start of -1, which would wrap around to the last token.
        st = tokenizer.decode(enc.ids[max(a-1, 0):b])
    selected_texts.append(st)
test_data['selected_text']=selected_texts
test_data[['textID','selected_text']].to_csv('submission.csv', index=False)
# Last expression of the cell, so the preview is actually displayed
test_data.head(20)