# TensorFlow roBERTa + CNN head

# Load data and libraries

In [None]:
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers

In [None]:
def read_train():
    """Load the competition training table.

    Returns:
        pandas.DataFrame read from ``train.csv`` whose ``text`` and
        ``selected_text`` columns have been cast with ``astype(str)``.
    """
    frame = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
    # Both columns are sliced and tokenised as Python strings later on.
    for column in ('text', 'selected_text'):
        frame[column] = frame[column].astype(str)
    return frame

def read_test():
    """Load the competition test table.

    Returns:
        pandas.DataFrame read from ``test.csv`` with the ``text`` column cast
        via ``astype(str)``.
    """
    frame = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
    return frame.assign(text=frame['text'].astype(str))

def read_submission():
    """Load ``sample_submission.csv`` and return it unchanged as a DataFrame."""
    return pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
    
# Read the three competition tables once; the cells below index them by row position.
train_df, test_df, submission_df = read_train(), read_test(), read_submission()

In [None]:
# Seed both TensorFlow's and NumPy's global random generators with the same value.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    Returns:
        float in [0, 1]; ``0.5`` when neither string contains any word.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    if not words_a and not words_b:
        return 0.5
    shared = len(words_a & words_b)
    return float(shared) / (len(words_a) + len(words_b) - shared)

# Data preprocessing

In [None]:
# Fixed width of every id / mask / label array built in the cells below.
MAX_LEN = 96
# MAX_LEN = 120
# Directory holding the RoBERTa vocab, merges, config and pretrained weight files.
PATH = '../input/tf-roberta/'
# Byte-level BPE tokenizer; input is lower-cased and a prefix space is added.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'vocab-roberta-base.json', 
    merges_file=PATH+'merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)
# Integer id written at position 1 of each input row to encode the tweet's sentiment.
# NOTE(review): presumably the vocab ids of the three sentiment words — verify against the vocab file.
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

In [None]:
# Build the fixed-width training arrays.
#   input_ids       : pre-filled with 1 (the value PAD_ID is set to in the model cell)
#   attention_mask  : 1 on real tokens, 0 elsewhere
#   token_type_ids  : all zeros
#   start/end_tokens: one-hot rows marking the first / last token of selected_text
ct = train_df.shape[0]
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(ct):

    # FIND OVERLAP
    # Whitespace is normalised on both strings so the substring search lines up.
    text1 = " "+" ".join(train_df.loc[k,'text'].split())
    text2 = " ".join(train_df.loc[k,'selected_text'].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    # `find` returns -1 when the selection is not a substring; indexing with
    # that value would wrap around and mark unrelated characters, so only mark
    # when a real match exists. An empty selection matches at 0 and marks nothing.
    if idx >= 0:
        chars[idx:idx+len(text2)] = 1
        # Include the space before the selection so the first token (which
        # carries its leading space) overlaps the marked range.
        if idx > 0 and text1[idx-1] == ' ':
            chars[idx-1] = 1
    enc = tokenizer.encode(text1)

    # ID_OFFSETS
    # Character span of each token, rebuilt by decoding tokens one at a time.
    offsets = []
    pos = 0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((pos, pos+len(w)))
        pos += len(w)

    # START END TOKENS
    # Indices of tokens that overlap at least one marked character.
    toks = [i for i, (a, b) in enumerate(offsets) if np.sum(chars[a:b]) > 0]

    # Row layout: 0, sentiment id, text tokens, 2 — hence the +3 length and
    # the +2 shift applied to the label positions.
    s_tok = sentiment_id[train_df.loc[k,'sentiment']]
    input_ids[k,:len(enc.ids)+3] = [0, s_tok] + enc.ids + [2]
    attention_mask[k,:len(enc.ids)+3] = 1
    if len(toks)>0:
        start_tokens[k,toks[0]+2] = 1
        end_tokens[k,toks[-1]+2] = 1

In [None]:
# Test-set arrays: ids pre-filled with 1, mask and token types with 0.
ct = test_df.shape[0]
input_ids_t = np.ones((ct, MAX_LEN), dtype='int32')
attention_mask_t = np.zeros((ct, MAX_LEN), dtype='int32')
token_type_ids_t = np.zeros((ct, MAX_LEN), dtype='int32')

for k in range(ct):

    # INPUT_IDS
    # Collapse runs of whitespace and prepend a single space before encoding.
    text1 = " " + " ".join(test_df.loc[k, 'text'].split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[test_df.loc[k, 'sentiment']]
    # Row layout: 0, sentiment id, text tokens, 2.
    row_ids = [0, s_tok] + enc.ids + [2]
    input_ids_t[k, :len(row_ids)] = row_ids
    attention_mask_t[k, :len(row_ids)] = 1

# Model

In [None]:
# Id treated as padding: build_model counts entries equal to this value to find each row's length.
PAD_ID = 1
# Label-smoothing factor handed to categorical_crossentropy inside loss_fn.
LABEL_SMOOTHING = 0.15

def loss_fn(y_true, y_pred):
    """Label-smoothed categorical cross-entropy, averaged to a scalar.

    Args:
        y_true: target distribution; it is cut along axis 1 to the width of
            ``y_pred`` before the loss is computed.
        y_pred: predicted probabilities (``from_logits=False``).

    Returns:
        Scalar tensor: mean of the per-example cross-entropy.
    """
    # adjust the targets for sequence bucketing
    pred_width = tf.shape(y_pred)[1]
    trimmed_targets = y_true[:, :pred_width]
    per_example = tf.keras.losses.categorical_crossentropy(
        trimmed_targets, y_pred,
        from_logits=False, label_smoothing=LABEL_SMOOTHING)
    return tf.reduce_mean(per_example)

def build_model(alpha=0.30):
    """Build the RoBERTa + Conv1D span model in two flavours.

    Args:
        alpha: slope passed to both ``LeakyReLU`` layers.

    Returns:
        ``(model, padded_model)``. Both take ``[ids, att, tok]`` inputs of width
        ``MAX_LEN`` and share the same layers. ``model`` is compiled with
        ``loss_fn`` and Adam(2e-5) and emits start/end distributions over the
        trimmed sequence length; ``padded_model`` is not compiled and emits the
        same outputs zero-padded back to ``MAX_LEN``.
    """
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    # 1 where the id equals PAD_ID, 0 elsewhere.
    padding = tf.cast(tf.equal(ids, PAD_ID), tf.int32)

    # Non-pad length of each row; every input is cut to the longest row in the batch.
    lens = MAX_LEN - tf.reduce_sum(padding, -1)
    max_len = tf.reduce_max(lens)
    ids_ = ids[:, :max_len]
    att_ = att[:, :max_len]
    tok_ = tok[:, :max_len]

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    x = bert_model(ids_,attention_mask=att_,token_type_ids=tok_)
    
    # Start head. x[0] is the first output of the RoBERTa model
    # (presumably the per-token hidden states — TODO confirm against transformers docs).
    x1 = tf.keras.layers.Dropout(0.25)(x[0])
    x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU(alpha=alpha)(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    # End head: same layer stack as the start head, with its own layer instances.
    x2 = tf.keras.layers.Dropout(0.25)(x[0]) 
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU(alpha=alpha)(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5) 
    model.compile(loss=loss_fn, optimizer=optimizer)
    
    # this is required as `model.predict` needs a fixed size!
    # The trimmed softmax outputs are right-padded with zeros back to MAX_LEN.
    x1_padded = tf.pad(x1, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.)
    x2_padded = tf.pad(x2, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.)
    
    padded_model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1_padded,x2_padded])
    return model, padded_model

# Train
We skip this stage and load weights from already-trained models instead.

# Inference

In [None]:
import pickle
def load_weights(model, weight_fn):
    """Restore pickled weights into ``model``.

    Args:
        model: object exposing ``set_weights``.
        weight_fn: path to a file that holds a pickled weight list.

    Returns:
        The same ``model`` instance, after ``set_weights`` has been applied.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only point this at weight files from a trusted source.
    with open(weight_fn, 'rb') as handle:
        stored = pickle.load(handle)
    model.set_weights(stored)
    return model

In [None]:
%%time
n_splits = 10
DISPLAY = 1
# One (n_test_rows, MAX_LEN) probability map per fold, for the start and end heads.
preds_shape = (n_splits, input_ids_t.shape[0], MAX_LEN)
preds_start = np.zeros(preds_shape)
preds_end = np.zeros(preds_shape)

for i in range(n_splits):
    banner = '#'*25
    print(banner)
    print('### MODEL %i'% (i+1))
    print(banner)

    # Rebuild the graph for every fold, then restore that fold's weights
    # into the fixed-width model used for prediction.
    K.clear_session()
    model, padded_model = build_model()
    weight_path = '../input/tse-seed88888/v0-roberta-%i.h5' %(i)
    load_weights(padded_model, weight_path)
    print('Predicting Test...')
    preds = padded_model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    preds_start[i] = preds[0]
    preds_end[i] = preds[1]

In [None]:
# Weighted blend of fold predictions: slot j of the start average uses fold
# starts[j], slot j of the end average uses fold ends[j], both with weights[j].
answer = [0.57657865, 0.61908779, 0.61798954, 0.61765775, 0.41941072, 0.37508611]
weights = answer
starts = [5,7,6,2,8,0]
ends = [5,7,8,6,2,9]
preds_start_avg = np.average(preds_start[starts], axis=0, weights=weights)
preds_end_avg = np.average(preds_end[ends], axis=0, weights=weights)

In [None]:
def get_word_number(x):
    """Return how many whitespace-separated words ``x.selected_text`` contains."""
    words = x.selected_text.split()
    return len(words)

def get_intersection_and_union(st1, st2):
    """Return the shared words and the combined words of two strings.

    Both strings are lower-cased and split on whitespace, and duplicates are
    dropped. Word order is deterministic: the intersection follows the order
    of first appearance in ``st1``; the union lists ``st1``'s words first and
    then the words that only occur in ``st2``. (Joining a plain ``set`` made
    the order depend on string hash randomisation, so the same inputs could
    produce different strings in different interpreter sessions.)

    Returns:
        ``(intersection, union)`` — two space-joined strings.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    words1 = list(dict.fromkeys(st1.lower().split()))
    words2 = list(dict.fromkeys(st2.lower().split()))
    in_first, in_second = set(words1), set(words2)
    common = [w for w in words1 if w in in_second]
    combined = words1 + [w for w in words2 if w not in in_first]
    return ' '.join(common), ' '.join(combined)

def is_subset(st1, st2):
    """Return True when every lower-cased word of ``st1`` also occurs in ``st2``."""
    candidate_words = set(st1.lower().split())
    reference_words = set(st2.lower().split())
    return candidate_words <= reference_words

def get_the_word(st1, st2):
    """Return the first word of ``st2`` that contains ``st1`` as a substring.

    Matching is case-insensitive (both inputs are lower-cased) and the
    returned word is lower-cased. Returns ``None`` when no word matches.
    """
    needle = st1.lower()
    candidates = st2.lower().split()
    return next((word for word in candidates if needle in word), None)
        
def get_diff(st1, st2):
    """Return the words of ``st2`` that do not occur in ``st1``.

    Both strings are lower-cased and split on whitespace. The result keeps
    each remaining word once, in order of first appearance in ``st2``, so the
    output is reproducible (joining a ``set`` made the order depend on string
    hash randomisation).

    Returns:
        Space-joined string of the remaining words; ``''`` when none remain.
    """
    excluded = set(st1.lower().split())
    # dict.fromkeys de-duplicates while keeping first-seen order.
    remaining = [w for w in dict.fromkeys(st2.lower().split()) if w not in excluded]
    return ' '.join(remaining)

In [None]:
import re
def post_process(x):
    """Normalise repeated punctuation in a predicted span.

    * If the text starts with a dot, its leading run of dots is collapsed to a
      single ``.``.
    * If the text is exactly one whitespace-separated word, ``!!!!`` becomes
      ``!`` and ``???`` becomes ``?``; when it also ends with ``...`` every
      ``..`` is replaced by ``.`` and then every remaining ``...`` by ``.``.

    Texts with zero or several words are returned after the first step only.

    Args:
        x: predicted text.

    Returns:
        The cleaned string.
    """
    if x.startswith('.'):
        # Raw-string pattern and keyword `count`: the previous "\." literal is
        # an invalid escape sequence in a normal string, and passing count
        # positionally to re.sub is deprecated.
        x = re.sub(r"\.+", '.', x, count=1)
    if len(x.split()) != 1:
        return x
    x = x.replace('!!!!', '!')
    x = x.replace('???', '?')
    if x.endswith('...'):
        # Order matters: pairs are collapsed first, then any triple that remains.
        x = x.replace('..', '.')
        x = x.replace('...', '.')
    return x

In [None]:
# Decoded spans per test row:
#   all_st  - span from the best start/end candidates (with fallbacks below)
#   all_bak - span from the second-best start/end candidates
#   all_com - final, post-processed span written to the submission
# `all_st` was previously called `all`, which shadowed the Python builtin for
# the rest of the notebook.
all_st = []
all_bak = []
all_com = []
starts, starts_bak, ends, ends_bak = [], [], [], []
start_diffs = []
end_diffs = []

# Diagnostic counters reported in the following cells.
count = 0
count_abn = 0
count_abn_2 = 0
count_nm = 0
count_bak = 0
count_ori = 0
count_true = 0
count_zero = 0
length_true = []

# If both top-2 probability gaps (start and end) fall below these thresholds,
# the prediction is treated as uncertain and the union of both spans is used.
m, n = 0.14, 0.14


def _decode_span(ids, first, last):
    """Decode the text tokens between model positions ``first`` and ``last`` (inclusive).

    Each model input row is laid out as 0, sentiment id, text tokens, 2, so
    model position ``p`` corresponds to ``ids[p - 2]``. Positions 0 and 1 fall
    before the first text token; the slice bounds are clamped at 0 so that they
    cannot wrap around to the end of ``ids``.
    """
    lo = max(first - 2, 0)
    hi = max(last - 1, 0)
    return tokenizer.decode(ids[lo:hi])


for k in range(input_ids_t.shape[0]):
    # Best and second-best positions for the start and the end of the span.
    a, a_bak = np.argsort(preds_start_avg[k,])[::-1][:2]
    b, b_bak = np.argsort(preds_end_avg[k,])[::-1][:2]
    diff_start = abs(preds_start_avg[k,a_bak] - preds_start_avg[k,a])
    diff_end = abs(preds_end_avg[k,b_bak] - preds_end_avg[k,b])
    starts.append(a); starts_bak.append(a_bak); ends.append(b); ends_bak.append(b_bak)
    start_diffs.append(diff_start)
    end_diffs.append(diff_end)

    full_text = test_df.loc[k,'text']
    text1 = " "+" ".join(full_text.split())
    enc = tokenizer.encode(text1)

    if a > b:
        # Best start lies after best end: try to repair the span with the
        # second-best candidates before giving up and using the whole tweet.
        if a_bak <= b and a > b_bak:
            st = _decode_span(enc.ids, a_bak, b)
        elif a_bak > b and a <= b_bak:
            st = _decode_span(enc.ids, a, b_bak)
        elif a_bak <= b_bak:
            st = _decode_span(enc.ids, a_bak, b_bak)
        else:
            count_abn_2 += 1
            st = full_text
    else:
        st = _decode_span(enc.ids, a, b)

    all_st.append(st)

    # Backup span from the second-best pair; reuse `st` when that pair is inverted.
    if a_bak > b_bak:
        st_bak = st
    else:
        st_bak = _decode_span(enc.ids, a_bak, b_bak)

    all_bak.append(st_bak)

    st_int, st_uni = get_intersection_and_union(st, st_bak)
    if is_subset(st_bak, full_text) and is_subset(st, full_text):
        count_nm += 1
        if diff_start < m and diff_end < n:
            count_abn += 1
            st_com = st_uni
        else:
            st_com = st
    elif is_subset(st, full_text):
        count_ori += 1
        st_com = st
    else:
        # The decoded span contains a word that is not a whole word of the tweet.
        count_true += 1
        length_true.append(len(st))
        st_com = st
    st_com = post_process(st_com)
    all_com.append(st_com)

In [None]:
# from matplotlib import pyplot as plt
# from collections import Counter
# Counter(length_true).most_common(20)

In [None]:
# Percentages of test rows that fell into each branch of the decoding loop.
n_total = len(test_df)
# Rows where both probability gaps were below the thresholds (union span used).
print(f"Uncertain fraction: {100*count_abn/n_total: .2f}%")
# Rows where only the primary span was a word-subset of the tweet.
print(f"First Prob fraction: {100*count_ori/n_total: .2f}%")
# Rows where both the primary and the backup span were word-subsets of the tweet.
print(f"Legal First and Second Pred Fraction: {100*count_nm/n_total: .2f}%")
# Rows where the primary span was not a word-subset of the tweet.
print(f"First Prob not in original text fraction: {100*count_true/n_total: .2f}%")

In [None]:
# Quantiles of the top-2 probability gaps, restricted to non-neutral tweets.
mask_sentiment = test_df.sentiment != 'neutral'
start_gaps = np.array(start_diffs)[mask_sentiment]
end_gaps = np.array(end_diffs)[mask_sentiment]
print(np.quantile(start_gaps, 0.344))
print(np.quantile(end_gaps, 0.248))

In [None]:
# Attach the final spans, flag rows whose predicted words all occur in the
# tweet, and write the two submission columns to disk.
test_df['selected_text'] = all_com
test_df['is_subset'] = test_df.apply(
    lambda row: is_subset(row.selected_text, row.text), axis=1)
submission_columns = ['textID', 'selected_text']
test_df[submission_columns].to_csv('submission.csv', index=False)

In [None]:
# Raise the displayed column width to 200 characters before showing rows.
pd.set_option('max_colwidth', 200)
# test_df.query("sentiment != 'neutral' and is_subset == False").sample(25)
# Show 25 randomly sampled test rows with their predictions.
test_df.sample(25)

In [None]:
# Fraction of test rows whose predicted words are all present in the tweet.
test_df['is_subset'].eq(1).mean()