In [None]:
import pandas as pd
import numpy as np
import re
import string
import math
from nltk.tokenize import word_tokenize
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
# Show the TensorFlow version in the cell output.
print('TF version', tf.__version__)

In [None]:
# --- Training / cross-validation settings ---
n_splits = 5            # number of StratifiedKFold folds
EPOCHS = 3              # epochs passed to model.fit for every fold
BATCH_SIZE = 32
MAX_LEN = 96            # fixed width of every encoded row
PAD_ID = 1              # NOTE(review): presumably RoBERTa's <pad> id (id arrays are pre-filled with 1) - confirm
LABEL_SMOOTHING = 0.1   # forwarded to categorical_crossentropy in loss_fn

# --- Tokenizer ---
PATH = '../input/tf-roberta/'
vocab_file = PATH + 'vocab-roberta-base.json'
merges_file = PATH + 'merges-roberta-base.txt'
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file,
    merges_file,
    lowercase=True,
    add_prefix_space=True,
)

# Token id written into each encoded row for the tweet's sentiment label.
# NOTE(review): presumably the vocabulary ids of the three words - confirm
# against the vocab file.
sentiment_id = dict(positive=1313, negative=2430, neutral=7974)

In [None]:
def read_train():
    """Load the training CSV.

    Missing values are replaced with '' and the ``text`` and
    ``selected_text`` columns are cast to ``str``.

    Returns:
        pandas.DataFrame: the contents of train.csv.
    """
    frame = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
    frame = frame.fillna('')
    for column in ('text', 'selected_text'):
        frame[column] = frame[column].astype(str)
    return frame


def read_test():
    """Load the test CSV.

    Missing values are replaced with '' and the ``text`` column is cast
    to ``str``.

    Returns:
        pandas.DataFrame: the contents of test.csv.
    """
    frame = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
    frame = frame.fillna('')
    frame['text'] = frame['text'].astype(str)
    return frame


def read_submission():
    """Load the sample submission CSV with missing values replaced by ''.

    Returns:
        pandas.DataFrame: the contents of sample_submission.csv.
    """
    submission = pd.read_csv(
        '../input/tweet-sentiment-extraction/sample_submission.csv')
    return submission.fillna('')


def jaccard_improve(str1, str2):
    """Widen a selected span with neighbouring words from its source text.

    Both inputs are lower-cased. The first occurrence of ``str2`` in
    ``str1`` splits the text into the words before and the words after
    the span. If more words precede the span than follow it, up to three
    preceding words are prepended; otherwise up to two following words
    are appended.

    Args:
        str1: The full text.
        str2: The span expected to occur inside ``str1``.

    Returns:
        str: The lower-cased span joined with the chosen context words by
        a single space. When the span does not occur in the text, the
        lower-cased span is returned unchanged. When there are no context
        words on either side, the result keeps a trailing space.
    """
    text = str1.lower()
    span = str2.lower()
    index = text.find(span)
    if index == -1:
        # No occurrence: slicing with -1 would silently drop the last
        # character of the text and build meaningless context.
        return span

    words_before = text[:index].split()
    # Skip only the matched occurrence. Using str.replace here would also
    # delete every later repeat of the span from the trailing context.
    words_after = text[index + len(span):].split()

    if len(words_before) > len(words_after):
        return " ".join(words_before[-3:]) + " " + span
    return span + " " + " ".join(words_after[:2])


def loss_fn(y_true, y_pred):
    """Label-smoothed categorical cross-entropy reduced to a scalar.

    The targets are first cut along axis 1 to the width of the
    predictions (sequence bucketing), then compared with the predicted
    probabilities (``from_logits=False``) using ``LABEL_SMOOTHING``.

    Args:
        y_true: Target tensor; only its first ``y_pred.shape[1]`` columns
            are used.
        y_pred: Predicted probabilities.

    Returns:
        The mean of the cross-entropy values as a scalar tensor.
    """
    seq_len = tf.shape(y_pred)[1]
    trimmed_targets = y_true[:, :seq_len]
    per_example = tf.keras.losses.categorical_crossentropy(
        trimmed_targets,
        y_pred,
        from_logits=False,
        label_smoothing=LABEL_SMOOTHING,
    )
    return tf.reduce_mean(per_example)


def scheduler(epoch):
    """Return the learning rate for ``epoch``: 3e-5 scaled by 0.2 per epoch.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        float: ``3e-5 * 0.2 ** epoch``.
    """
    initial_rate = 3e-5
    decay_factor = 0.2
    return initial_rate * decay_factor ** epoch


def _span_head(sequence_output):
    """Turn per-token encoder features into a softmax over token positions.

    A fresh set of layers is created on every call, so each call yields
    an independent head.
    """
    h = tf.keras.layers.Dropout(0.1)(sequence_output)
    h = tf.keras.layers.Conv1D(768, 2, padding='same')(h)
    h = tf.keras.layers.LeakyReLU()(h)
    h = tf.keras.layers.Conv1D(64, 2, padding='same')(h)
    h = tf.keras.layers.Dense(1)(h)
    # (batch, MAX_LEN, 1) -> (batch, MAX_LEN) so the softmax runs over positions.
    h = tf.keras.layers.Flatten()(h)
    return tf.keras.layers.Activation('softmax')(h)


def build_model():
    """Build and compile the RoBERTa span-extraction model.

    The model takes three int32 inputs of width ``MAX_LEN`` (token ids,
    attention mask, token type ids) and produces two outputs, each a
    softmax over the ``MAX_LEN`` positions: one for the span start and one
    for the span end. Both heads read the first output of the encoder
    (presumably the last hidden states - confirm against the installed
    transformers version).

    Returns:
        tf.keras.Model: compiled with ``loss_fn`` and Adam (lr 3e-5).
    """
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(
        PATH+'pretrained-roberta-base.h5', config=config)

    x = bert_model(ids, attention_mask=att, token_type_ids=tok)

    # Start head is created before the end head so layer creation order
    # (and therefore auto-generated layer names) is the same as before.
    start_probs = _span_head(x[0])
    end_probs = _span_head(x[0])

    model = tf.keras.models.Model(
        inputs=[ids, att, tok], outputs=[start_probs, end_probs])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss=loss_fn, optimizer=optimizer)

    return model


def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings, case-insensitive.

    Each string is lower-cased and split on whitespace into a set of
    words.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: ``|A & B| / |A | B|``, or 0.5 when both word sets are empty.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    if not words_a and not words_b:
        return 0.5
    shared = words_a & words_b
    union_size = len(words_a) + len(words_b) - len(shared)
    return float(len(shared)) / union_size

In [None]:
# Load the three competition CSVs: training tweets, test tweets and the
# sample submission file.
train = read_train()
test = read_test()
submission_df = read_submission()

In [None]:
# Keep only the non-neutral tweets and renumber the rows from 0 so later
# cells can address them with .loc[k, ...] for k in range(len(frame)).
train_df = train[train['sentiment'] != "neutral"].reset_index(drop=True)
test_df = test[test['sentiment'] != "neutral"].reset_index(drop=True)
print(f"non neutral train data: {train_df.shape}")
print(f"non neutral test data: {test_df.shape}")

In [None]:
# Pre-allocate one fixed-width row per non-neutral training tweet.
ct = len(train_df)
train_shape = (ct, MAX_LEN)

input_ids = np.ones(train_shape, dtype='int32')        # filled with 1 by default
attention_mask = np.zeros(train_shape, dtype='int32')
token_type_ids = np.zeros(train_shape, dtype='int32')  # never written to below
start_tokens = np.zeros(train_shape, dtype='int32')    # one-hot span start labels
end_tokens = np.zeros(train_shape, dtype='int32')      # one-hot span end labels

for arr in (input_ids, attention_mask, token_type_ids, start_tokens, end_tokens):
    print(arr.shape)

In [None]:
# Encode every training tweet and derive its start/end token labels.
for k in range(train_df.shape[0]):

    # FIND OVERLAP
    # Collapse runs of whitespace; the text gets one leading space.
    text1 = " "+" ".join(train_df.loc[k, 'text'].split())
    text2 = " ".join(train_df.loc[k, 'selected_text'].split())
    idx = text1.find(text2)
    # chars[i] == 1 marks characters of text1 that belong to the selected span.
    chars = np.zeros((len(text1)))
    if idx >= 0:
        chars[idx:idx+len(text2)] = 1
        # Also mark the space directly in front of the span. The idx > 0
        # guard stops text1[idx-1] from wrapping around to the end of the
        # string; together with the idx >= 0 check above, a span that is
        # not found no longer marks an unrelated character.
        if idx > 0 and text1[idx-1] == ' ':
            chars[idx-1] = 1
    enc = tokenizer.encode(text1)

    # ID_OFFSETS
    # Character range of each token, rebuilt from the length of each
    # token's decoded text.
    offsets = []
    idx = 0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx, idx+len(w)))
        idx += len(w)

    # START END TOKENS
    # A token belongs to the span if any of its characters is marked.
    toks = []
    for i, (a, b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm > 0:
            toks.append(i)

    s_tok = sentiment_id[train_df.loc[k, 'sentiment']]
    # Row layout: 0, sentiment id, 2, 2, <text tokens>, 2.
    input_ids[k, :len(enc.ids)+5] = [0] + [s_tok] + [2, 2] + enc.ids + [2]
    attention_mask[k, :len(enc.ids)+5] = 1
    if len(toks) > 0:
        # NOTE(review): text token i sits at column i+4 of input_ids, but
        # the labels are written at column i+1. The decode cells slice
        # enc.ids[a-1:b], which matches this +1 labelling, so the two must
        # be changed together. Confirm the 3-column shift is intended.
        start_tokens[k, toks[0]+1] = 1
        end_tokens[k, toks[-1]+1] = 1

In [None]:
# Encode the non-neutral test tweets with the same row layout as training.
ct = test_df.shape[0]
test_shape = (ct, MAX_LEN)
input_ids_t = np.ones(test_shape, dtype='int32')
attention_mask_t = np.zeros(test_shape, dtype='int32')
token_type_ids_t = np.zeros(test_shape, dtype='int32')

for k in range(ct):
    # Collapse whitespace and add one leading space before tokenizing.
    text1 = " "+" ".join(test_df.loc[k, 'text'].split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[test_df.loc[k, 'sentiment']]

    # Row layout: 0, sentiment id, 2, 2, <text tokens>, 2.
    row = [0, s_tok, 2, 2] + enc.ids + [2]
    input_ids_t[k, :len(row)] = row
    attention_mask_t[k, :len(row)] = 1

In [None]:
# Stratified K-fold training: for each fold, fine-tune a fresh model, keep
# the weights with the best validation loss, predict the held-out rows and
# score them with word-level Jaccard.
jac = []  # one mean Jaccard score per fold
VER = 'v3'  # prefix of the checkpoint file names
DISPLAY = 1  # USE display=1 FOR INTERACTIVE
# Out-of-fold start/end probabilities, one row per training example.
oof_start = np.zeros((input_ids.shape[0], MAX_LEN))
oof_end = np.zeros((input_ids.shape[0], MAX_LEN))

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=777)
for fold, (idxT, idxV) in enumerate(skf.split(input_ids, train_df.sentiment.values)):

    print('#'*25)
    print('### FOLD %i' % (fold+1))
    print('#'*25)

    # Drop the previous fold's graph before building a fresh model.
    K.clear_session()
    model = build_model()

    reduce_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

    weights_path = '%s-roberta-%i.h5' % (VER, fold)
    sv = tf.keras.callbacks.ModelCheckpoint(
        weights_path, monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')

    hist = model.fit([input_ids[idxT, ], attention_mask[idxT, ], token_type_ids[idxT, ]],
                     [start_tokens[idxT, ], end_tokens[idxT, ]],
                     epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=DISPLAY,
                     callbacks=[sv, reduce_lr],
                     validation_data=([input_ids[idxV, ], attention_mask[idxV, ], token_type_ids[idxV, ]],
                                      [start_tokens[idxV, ], end_tokens[idxV, ]]))

    # Restore the best checkpoint rather than the last epoch's weights.
    print('Loading model...')
    model.load_weights(weights_path)

    print('Predicting OOF...')
    oof_start[idxV, ], oof_end[idxV, ] = model.predict(
        [input_ids[idxV, ], attention_mask[idxV, ], token_type_ids[idxV, ]], verbose=DISPLAY)

    # DISPLAY FOLD JACCARD
    # (named fold_scores rather than `all` to avoid shadowing the builtin)
    fold_scores = []
    for k in idxV:
        a = np.argmax(oof_start[k, ])
        b = np.argmax(oof_end[k, ])
        if a > b:
            # IMPROVE CV/LB with better choice here
            st = train_df.loc[k, 'text']
        else:
            text1 = " "+" ".join(train_df.loc[k, 'text'].split())
            enc = tokenizer.encode(text1)
            # Labels were written at token index + 1, hence the a-1. Clamp
            # at 0 so a predicted start of 0 does not become index -1 and
            # wrap around to the last token.
            st = tokenizer.decode(enc.ids[max(a-1, 0):b])
        fold_scores.append(jaccard(st, train_df.loc[k, 'selected_text']))
    jac.append(np.mean(fold_scores))
    print('>>>> FOLD %i Jaccard =' % (fold+1), np.mean(fold_scores))
    print()


print('>>>> OVERALL 5Fold CV Jaccard =', np.mean(jac))

In [None]:
# Average the start/end probabilities of the per-fold models on the test set.
preds_start = np.zeros((input_ids_t.shape[0], MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0], MAX_LEN))
DISPLAY = 1
# Loop over n_splits models (not a literal 5) so the number of models
# loaded always matches the divisor used for averaging below.
for i in range(n_splits):
    print('#'*25)
    print('### MODEL %i' % (i+1))
    print('#'*25)

    K.clear_session()
    model = build_model()
    # Alternative: load weights from an attached dataset instead of the
    # working directory.
    # model.load_weights('../input/model-v3/v3-roberta-%i.h5'%i)
    model.load_weights('./v3-roberta-%i.h5' % i)

    print('Predicting Test...')
    preds = model.predict([input_ids_t, attention_mask_t,
                           token_type_ids_t], verbose=DISPLAY)
    # preds[0] holds the start probabilities, preds[1] the end probabilities.
    preds_start += preds[0]/n_splits
    preds_end += preds[1]/n_splits

In [None]:
# Decode one predicted span per non-neutral test tweet.
# NOTE: `all` shadows the builtin, but the name is kept because the cell
# that assembles the submission reads it.
all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k, ])
    b = np.argmax(preds_end[k, ])
    if a > b:
        # Start after end: fall back to the whole tweet.
        st = test_df.loc[k, 'text']
    else:
        text1 = " "+" ".join(test_df.loc[k, 'text'].split())
        enc = tokenizer.encode(text1)
        # Labels were written at token index + 1, hence the a-1. Clamp at
        # 0 so a predicted start of 0 does not become index -1 and wrap
        # around to the last token.
        st = tokenizer.decode(enc.ids[max(a-1, 0):b])
    all.append(st)

In [None]:
test["selected_text"] = ""  # column that will hold the predictions

is_neutral = test["sentiment"] == "neutral"

# neutral tweets: the whole text is used as the prediction
test.loc[is_neutral, "selected_text"] = test.loc[is_neutral, "text"]

# non-neutral tweets: spans decoded from the model output, in row order
test.loc[~is_neutral, "selected_text"] = all

In [None]:
# Compare row counts: full test set, non-neutral subset, sample submission.
tuple(frame.shape for frame in (test, test_df, submission_df))

In [None]:
# Write the id and prediction columns to the submission file, without the index.
submission_columns = ['textID', 'selected_text']
test[submission_columns].to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the frame that was written out.
test.head(n=5)