# Importing Necessities

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

#Credit: https://www.kaggle.com/cdeotte/tensorflow-roberta-0-705/notebook

#Goal of this notebook: How to tokenize the data, create question answer targets, and how to build a custom question answer head for RoBERTa
# in TensorFlow. Note that HuggingFace transformers don't have a TFRobertaForQuestionAnswering so we must make our own from TFRobertaModel.

# Here's a pro tip for people using TPU. Start each fold loop with-
# tf.tpu.experimental.initialize_tpu_system(tpu)
# This will prevent the TPU from running out of memory during 5 Fold.

#v5: got .706 score with max_len 192

In [None]:
import re
import string
import os
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
print('TF version',tf.__version__)

# Functions for reading the data

In [None]:
def load_data_of_train(path='/kaggle/input/tweet-sentiment-extraction/train.csv'):
    """Load the training tweets with guaranteed-string text columns.

    Rows whose ``text`` or ``selected_text`` is missing are dropped *before*
    the string cast. Casting first would turn a missing value into the
    literal string ``'nan'``, which a later ``dropna()`` can no longer see.

    Parameters
    ----------
    path : str, optional
        Location of the training CSV. Defaults to the Kaggle competition file.

    Returns
    -------
    pandas.DataFrame
        The CSV contents minus rows with missing text, with ``text`` and
        ``selected_text`` cast to ``str``. Original index labels are kept.
    """
    text_columns = ['text', 'selected_text']
    train = pd.read_csv(path).dropna(subset=text_columns)
    # Cast through a dtype mapping so a fresh frame is returned (no chained assignment).
    return train.astype({column: str for column in text_columns})

def load_data_of_test(path='/kaggle/input/tweet-sentiment-extraction/test.csv'):
    """Load the test tweets exactly as stored in the CSV.

    Parameters
    ----------
    path : str, optional
        Location of the test CSV. Defaults to the Kaggle competition file.

    Returns
    -------
    pandas.DataFrame
        The unmodified contents of the CSV (no rows dropped, no casts).
    """
    return pd.read_csv(path)

def load_data_of_submission(path='/kaggle/input/tweet-sentiment-extraction/sample_submission.csv'):
    """Load the sample submission file.

    Parameters
    ----------
    path : str, optional
        Location of the sample-submission CSV. Defaults to the Kaggle
        competition file.

    Returns
    -------
    pandas.DataFrame
        The unmodified contents of the CSV.
    """
    return pd.read_csv(path)

In [None]:
# Quick check of the datasets: load the training frame and preview its first rows.
train_data= load_data_of_train()
train_data.head()

In [None]:
# Distinct labels present in the sentiment column.
train_data['sentiment'].unique()

In [None]:
# Number of training rows per sentiment label.
train_data.sentiment.value_counts()

In [None]:
# Column dtypes and non-null counts before dropping missing values.
train_data.info()

In [None]:
# Remove, in place, every row of train_data that still contains a missing value.
train_data.dropna(inplace=True) #The dropna() function is used to remove missing values.

In [None]:
# Same summary again, to compare the row count after dropna().
train_data.info()

In [None]:
# Load the test tweets and the sample submission for inspection.
test_data = load_data_of_test()
sample_submission_data = load_data_of_submission() 

In [None]:
# Column dtypes and non-null counts of the test set.
test_data.info()

In [None]:
# Column dtypes and non-null counts of the sample submission.
sample_submission_data.info()

# The size of the train and test sets

In [None]:
# Report (rows, columns) for the train and test frames.
print("The size of Train is:",train_data.shape)
print("The size of Test is:",test_data.shape)

In [None]:
def jaccard(text1, text2):
    """Word-level Jaccard similarity between two strings.

    Both strings are split on whitespace into sets of distinct words; the
    score is ``|intersection| / |union|``. Matching is case-sensitive.

    Parameters
    ----------
    text1, text2 : str
        The texts to compare.

    Returns
    -------
    float
        A value in [0, 1]. Two texts that both contain no words are treated
        as identical and score 1.0 (previously this raised ZeroDivisionError).
    """
    words1 = set(text1.split())
    words2 = set(text2.split())
    if not words1 and not words2:
        # Empty union: avoid dividing by zero.
        return 1.0
    shared = words1 & words2
    # |A| + |B| - |A ∩ B| is the size of the union.
    return len(shared) / (len(words1) + len(words2) - len(shared))

In [None]:
jaccard_list = []

def calc_jaccard():
    """Compute the Jaccard score between text and selected_text for every training row.

    The scores are written into the module-level ``jaccard_list``, replacing
    whatever it held before. Rebuilding (instead of appending) keeps the list
    the same length as ``train_data`` when the calling cell is run again.

    Returns
    -------
    list of float
        ``jaccard_list`` itself, with one score per row of ``train_data`` in
        row order.
    """
    # Slice assignment mutates the existing list object in place.
    jaccard_list[:] = [
        jaccard(row.text, row.selected_text) for row in train_data.itertuples()
    ]
    return jaccard_list

In [None]:
# Score every training row and store the result as a new 'jaccard' column.
jac = calc_jaccard()
train_data['jaccard'] = jac

In [None]:
# Display the training frame, now including the 'jaccard' column.
train_data

In [None]:
# Histogram (no KDE curve) of the Jaccard scores of neutral tweets.
neutral_scores = train_data.loc[train_data['sentiment'] == 'neutral', 'jaccard']
plt.figure(figsize=(12, 6))
sns.distplot(neutral_scores, kde=False)

In [None]:
# Overlaid density plots of the Jaccard scores for positive (blue) and negative (red) tweets.
positive_scores = train_data.loc[train_data['sentiment'] == 'positive', 'jaccard']
negative_scores = train_data.loc[train_data['sentiment'] == 'negative', 'jaccard']

plt.figure(figsize=(12, 6))
positive_axes = sns.kdeplot(positive_scores, shade=True, color="b")
p1 = positive_axes.set_title('Jaccard Scores across different Sentiments')
p2 = sns.kdeplot(negative_scores, shade=True, color="r")
plt.legend(labels=['positive','negative'])

In [None]:
# Count whitespace-separated words per tweet, then keep the tweets with at most two words.
word_counts = train_data['text'].map(lambda tweet: len(str(tweet).split()))
train_data['no_words'] = word_counts
lessThanThree = train_data.loc[train_data['no_words'] <= 2]

In [None]:
# Show the very short tweets that are labelled negative.
is_negative = lessThanThree['sentiment'] == 'negative'
lessThanThree.loc[is_negative]

In [None]:
# Reload fresh copies of all three files for the modelling part of the notebook,
# so they do not carry the extra columns added to train_data during exploration.
final_train=load_data_of_train()
final_test=load_data_of_test()
final_sample_submission=load_data_of_submission()

# Display the freshly loaded training frame.
final_train

In [None]:
# Keep only training tweets with at least three whitespace-separated words.
# The index is not reset, so index labels no longer equal row positions afterwards.
final_train['no_words'] = final_train['text'].map(lambda tweet: len(str(tweet).split()))
has_enough_words = final_train['no_words'] >= 3
final_train = final_train.loc[has_enough_words]

In [None]:
# Display the filtered training frame.
final_train

In [None]:
# Fixed number of token positions in every model input row (used for the array
# widths and the Keras Input shapes below).
MAX_LEN = 96 #try max_len=192 for longer training otherwise use 96
# NOTE(review): PATH is not referenced anywhere else in the visible notebook.
PATH = ''
# Byte-level BPE tokenizer built from the RoBERTa-base vocab/merges files of the
# 'tf-roberta' input dataset; text is lowercased and a prefix space is added.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file='../input/tf-roberta/vocab-roberta-base.json', 
    merges_file='../input/tf-roberta/merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)
# The ids below can presumably be re-derived with the calls that follow — TODO confirm
# against the vocab file if it ever changes:
# tokenizer.encode('positive').ids
# tokenizer.encode('negative').ids
# tokenizer.encode('neutral').ids

# Token id used to represent each sentiment label in the model input.
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974} 

In [None]:
def definitions(Count, flag, max_len=None):
    """Allocate the fixed-size int32 arrays that hold one tokenized dataset.

    Parameters
    ----------
    Count : int
        Number of rows (one per tweet).
    flag : bool
        When truthy, also allocate the ``start_tokens`` and ``end_tokens``
        label arrays (the notebook passes True for training data and False
        for test data).
    max_len : int, optional
        Number of token positions per row. Defaults to the notebook-level
        ``MAX_LEN`` when omitted.

    Returns
    -------
    dict of str -> numpy.ndarray
        ``input_ids`` filled with ones, ``attention_mask`` and
        ``token_type_ids`` filled with zeros, plus zero-filled
        ``start_tokens`` / ``end_tokens`` when ``flag`` is truthy. Every
        array has shape ``(Count, max_len)`` and dtype int32.
    """
    if max_len is None:
        max_len = MAX_LEN
    shape = (Count, max_len)
    d = {
        # Filled with 1 — presumably RoBERTa's padding token id; TODO confirm against the vocab.
        'input_ids': np.ones(shape, dtype='int32'),
        'attention_mask': np.zeros(shape, dtype='int32'),
        'token_type_ids': np.zeros(shape, dtype='int32'),
    }
    if flag:
        d['start_tokens'] = np.zeros(shape, dtype='int32')
        d['end_tokens'] = np.zeros(shape, dtype='int32')
    return d

In [None]:
# Allocate the RoBERTa-format arrays for the training rows.
# In the attention mask, 1 marks a real token and 0 marks padding.
count_row = len(final_train)
data_definitions = definitions(count_row, True)
(input_ids,
 attention_mask,
 token_type_ids,
 start_tokens,
 end_tokens) = (
    data_definitions[array_name]
    for array_name in ('input_ids', 'attention_mask', 'token_type_ids',
                       'start_tokens', 'end_tokens')
)

In [None]:
# Fill the training arrays: one row per tweet, laid out as
#   [0] + text tokens + [2,2] + [sentiment token] + [2]
# and mark which token positions start and end the selected text.
#
# k is the DataFrame index label of the row and col holds the row's data.
# iterate is the positional row number used to index the pre-allocated arrays;
# it is tracked separately because final_train was filtered without resetting
# its index, so labels and positions differ.
# NOTE: the slice assignments below raise if a tweet needs more than MAX_LEN
# positions (len(enc.ids)+5 > MAX_LEN).
iterate=0

for k,col in final_train.iterrows():

    # FIND OVERLAP
    # Normalise whitespace in both strings, then mark (with 1s) the characters
    # of text1 that belong to the selected text.
    text1 = " "+" ".join(col['text'].split())
    text2 = " ".join(col['selected_text'].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx+len(text2)]=1
    # Also mark the space right before the selection, if there is one.
    if text1[idx-1]==' ': chars[idx-1] = 1
    enc = tokenizer.encode(text1)

    # ID_OFFSETS
    # Character span (start, end) of each token, rebuilt by decoding tokens one at a time.
    offsets = []; idx=0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx,idx+len(w)))
        idx += len(w)

    # START END TOKENS
    # Indices of the tokens that overlap at least one marked character.
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i)

    # Sentiment token id for this row, read from the row already in hand
    # (same lookup the test loop uses) instead of a second .loc lookup.
    s_tok = sentiment_id[col['sentiment']]
    input_ids[iterate][:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask[iterate][:len(enc.ids)+5] = 1
    if len(toks)>0:
        # +1 because position 0 of the row holds the leading [0] token.
        start_tokens[iterate][toks[0]+1] = 1
        end_tokens[iterate][toks[-1]+1] = 1
    iterate = iterate + 1

In the code above we join the two column values (text and sentiment) with the separator [2,2] rather than [2,0], because
the HuggingFace RoBERTa tokenizer lays out a pair of inputs as: [0] + ? + [2,2] + ? + [2]

In [None]:
# Tokenize the test data the same way as the training data (no start/end labels).
count_row = final_test.shape[0]

data_definitions = definitions(count_row,False)
input_ids_t = data_definitions['input_ids']
token_type_ids_t = data_definitions['token_type_ids']
attention_mask_t = data_definitions['attention_mask']

# row_pos is the positional row number used to index the arrays; k is the
# DataFrame index label. Using the position (as the training loop does) keeps
# this correct even if final_test's index is not 0..n-1.
for row_pos,(k,col) in enumerate(final_test.iterrows()):

    # INPUT_IDS
    text1 = " "+" ".join(col['text'].split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[col['sentiment']]
    # Row layout: [0] + text tokens + [2,2] + [sentiment token] + [2]
    input_ids_t[row_pos][:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask_t[row_pos][:len(enc.ids)+5] = 1

We use a pretrained RoBERTa base model and add a custom question-answer head. First, the tokens are fed into bert_model and we use its first output, i.e. x[0] below. These are the embeddings of all input tokens and have shape (batch_size, MAX_LEN, 768). Each head then applies dropout, tf.keras.layers.Conv1D(128, 2, padding='same'), LeakyReLU, tf.keras.layers.Conv1D(64, 2, padding='same') and Dense(1), which transforms the embeddings into shape (batch_size, MAX_LEN, 1). We then flatten this and apply softmax, so the final output x1 has shape (batch_size, MAX_LEN). It is trained against the one-hot encoding of the start token index (for selected_text), and x2 is the same for the end token index.

In [None]:
# build a RoBERTa model
def build_model():
    """Build and compile the RoBERTa model with separate start and end heads.

    The model takes three int32 inputs of length MAX_LEN (token ids, attention
    mask, token type ids), runs them through the pretrained RoBERTa base
    model, and feeds its first output into two identically structured heads.
    Each head ends in a softmax over the MAX_LEN flattened positions.

    Returns
    -------
    tf.keras.Model
        Model with inputs ``[ids, att, tok]`` and outputs ``[start, end]``,
        compiled with Adam (learning rate 3e-5) and binary cross-entropy.
    """
    input_shape = (MAX_LEN,)
    ids = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    att = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    tok = tf.keras.layers.Input(input_shape, dtype=tf.int32)

    config = RobertaConfig.from_pretrained('../input/tf-roberta/config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained('../input/tf-roberta/pretrained-roberta-base.h5',config=config)
    backbone_outputs = bert_model(ids,attention_mask=att,token_type_ids=tok)
    token_features = backbone_outputs[0]

    def _position_head(features):
        # Dropout -> Conv1D(128) -> LeakyReLU -> Conv1D(64) -> Dense(1) -> flatten -> softmax.
        # Layers are created in this exact order on every call.
        hidden = tf.keras.layers.Dropout(0.1)(features)
        hidden = tf.keras.layers.Conv1D(128, 2, padding='same')(hidden)
        hidden = tf.keras.layers.LeakyReLU()(hidden)
        hidden = tf.keras.layers.Conv1D(64, 2, padding='same')(hidden)
        hidden = tf.keras.layers.Dense(1)(hidden)
        hidden = tf.keras.layers.Flatten()(hidden)
        return tf.keras.layers.Activation('softmax')(hidden)

    # The start head is built before the end head, as in the original layout.
    start_probs = _position_head(token_features)
    end_probs = _position_head(token_features)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[start_probs, end_probs])
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    )
    return model

Uncomment below cell if you want to train the model.
Here we train with 5 Stratified KFolds (based on sentiment stratification). 
Each fold, the best model weights are saved and then reloaded before oof prediction and test prediction. 
Therefore you can run this code offline and upload your 5 fold models to a private Kaggle dataset. 
Then run this notebook and comment out the line model.fit(). 
Instead your notebook will load your model weights from offline training in the line model.load_weights(). 
Update this to have the correct path. Also make sure you change the KFold seed below to match your offline training. 
Then this notebook will proceed to use your offline models to predict oof and predict test.

Use the inference cell below if you don't want to run the training cell described above.
Otherwise, comment out or skip the cell below.
Here I am using my trained models with max len 192, obtained from the training cell above.

In [None]:
%%time
# Average the start/end predictions of the n_splits saved fold models over the test set.
n_splits = 5
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))
DISPLAY=1
# Iterate over exactly n_splits models so the /n_splits averaging below stays
# consistent with the number of models loaded.
for i in range(n_splits):
    print('#'*25)
    print('### MODEL %i'%(i+1))
    print('#'*25)

    # Start every fold from a fresh Keras session before rebuilding the model.
    K.clear_session()
    model = build_model()
    model.load_weights('/kaggle/input/model4/v4-roberta-%i.h5'%i)
#     model.load_weights('/kaggle/input/roberta-trained-model-by-prateekg/v5-roberta-%i.h5'%i)

    print('Predicting Test...')
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    # preds[0] holds the start outputs, preds[1] the end outputs.
    preds_start += preds[0]/n_splits
    preds_end += preds[1]/n_splits

In [None]:
# Make the submission texts: turn each row's predicted start/end positions back into a string.
# NOTE: the name `all` shadows the Python builtin; it is kept because the next
# cell reads the result under this name.
all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    # k is a row position, so select by position rather than by index label.
    tweet = final_test['text'].iloc[k]
    if a>b:
        # Start predicted after end: fall back to the whole tweet.
        st = tweet
    else:
        text1 = " "+" ".join(tweet.split())
        enc = tokenizer.encode(text1)
        # Model positions are shifted by one relative to enc.ids because of the
        # leading [0] token. Clamp at 0 so a predicted start of 0 does not wrap
        # around to index -1.
        st = tokenizer.decode(enc.ids[max(a-1, 0):b])
    all.append(st)

In [None]:
# Attach the predicted texts to the test frame and write the two submission
# columns to submission.csv (without the index column).
final_test['selected_text'] = all
final_test[['textID','selected_text']].to_csv('submission.csv',index=False)